commit 2d493d26ab
Merge remote-tracking branch 'origin/master' into sl/mmid-cpu-perf
slaren, 2025-02-09 16:27:24 +01:00
93 changed files with 9479 additions and 3868 deletions

.github/workflows/build.yml

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

.github/workflows/close-issue.yml

@@ -17,7 +17,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"

.github/workflows/server.yml

@@ -81,13 +81,36 @@ jobs:
         with:
           node-version: '22.11.0'

+      - name: WebUI - Install dependencies
+        id: webui_lint
+        run: |
+          cd examples/server/webui
+          npm ci
+
+      - name: WebUI - Check code format
+        id: webui_format
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd examples/server/webui
+          git status
+
+          npm run format
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Files do not follow coding style. To fix: npm run format"
+            echo "${modified_files}"
+            exit 1
+          fi
+
       - name: Verify bundled index.html
         id: verify_server_index_html
         run: |
          git config --global --add safe.directory $(realpath .)
          cd examples/server/webui
          git status
-          npm ci
+
          npm run build
          git status
          modified_files="$(git status -s)"

AUTHORS

@@ -1,4 +1,4 @@
-# date: Thu Nov 28 20:46:15 EET 2024
+# date: Tue Feb 4 13:04:05 EET 2025
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
@@ -20,6 +20,8 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Adrien Gallouët <adrien@gallouet.fr>
+Adrien Gallouët <angt@huggingface.co>
 Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
@@ -55,6 +57,7 @@ Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andreas (Andi) Kunar <andreask@msn.com>
+Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
@@ -91,13 +94,17 @@ Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
+Benson Wong <mostlygeek@gmail.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
 Bert Wagner <github@bertwagner.com>
+Billel Mokeddem <billel.mokeddem.ml@gmail.com>
 Bingan <70050083+binganao@users.noreply.github.com>
 Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
+Borislav Stanimirov <b@ibob.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
@@ -117,6 +124,7 @@ Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
+CentricStorm <CentricStorm@users.noreply.github.com>
 Chad Brewbaker <crb002@gmail.com>
 Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
@@ -131,12 +139,15 @@ Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
+Christian Kastner <ckk@kvr.at>
 Christian Kögler <ck3d@gmx.de>
 Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
+Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
 Conrad Kramer <conrad@conradkramer.com>
+Corentin REGAL <corentin.regal@gmail.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
@@ -176,6 +187,7 @@ Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
 Diego Devesa <slarengh@gmail.com>
 Diogo Teles Sant'Anna <diogoteles@google.com>
+Djip007 <3705339+Djip007@users.noreply.github.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
@@ -193,6 +205,7 @@ Edward Taylor <edeetee@gmail.com>
 Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
+Emreerdog <34742675+Emreerdog@users.noreply.github.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Curtin <ecurtin@redhat.com>
@@ -233,6 +246,7 @@ Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 Gabe Goodhart <ghart@us.ibm.com>
+Gaetan Bisson <gaetan@fenua.org>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -249,6 +263,7 @@ Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
+Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
@@ -259,11 +274,13 @@ Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
 HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
+Haus1 <haus.xda@gmail.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
+HimariO <dsfhe49854@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
 Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
@@ -280,6 +297,7 @@ Icecream95 <the.real.icecream95@gmail.com>
 Ido S <ido.pluto@gmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
+Ihar Hrachyshka <ihrachys@redhat.com>
 Ikko Eltociear Ashimine <eltociear@gmail.com>
 Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
@@ -289,12 +307,14 @@ Ivan <nekotekina@gmail.com>
 Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
+JFLFY2255 <JFLFY2255@163.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
 Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
+Jafar Uruç <jafar.uruc@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
 James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
@@ -315,6 +335,7 @@ Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jeroen Mostert <jeroen.mostert@cm.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jett Janiak <jettjaniak@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
@@ -343,6 +364,7 @@ Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
+Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
@@ -357,6 +379,7 @@ Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
+Karol Kontny <82021046+kkontny@users.noreply.github.com>
 Karsten Weiss <knweiss@gmail.com>
 Karthick <j.karthic2004@gmail.com>
 Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
@@ -376,6 +399,7 @@ Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
 Kunshang Ji <kunshang.ji@intel.com>
+Kyle Bruene <KyleBruene@users.noreply.github.com>
 Kyle Liang <liangmanlai@gmail.com>
 Kyle Mistele <kyle@mistele.com>
 Kylin <56434533+KyL0N@users.noreply.github.com>
@@ -394,6 +418,7 @@ Liu Jia <jia3.liu@intel.com>
 LoganDark <github@logandark.mozmail.com>
 Loïc Carrère <loic.carrere@gmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
+LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
@@ -423,6 +448,7 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Baudier <mbaudier@argeo.org>
 Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
 Mathijs Henquet <mathijs.henquet@gmail.com>
@@ -444,6 +470,7 @@ Meng, Hengyu <hengyu.meng@intel.com>
 Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
+Michael Engel <mengel@redhat.com>
 Michael Francis <edude03@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
@@ -452,7 +479,9 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
+Michał Moskal <michal@moskal.me>
 Michał Tuszyński <srgtuszy@gmail.com>
+Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
@@ -477,6 +506,7 @@ Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
+NeverLucky <92274250+nvrxq@users.noreply.github.com>
 Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
@@ -484,11 +514,15 @@ Nicholai Tukanov <nicholaitukanov@gmail.com>
 Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
+Nicolò Scipione <nicolo.scipione@codeplay.com>
 Nigel Bosch <pnigelb@gmail.com>
+Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
 Niklas Korz <niklas@niklaskorz.de>
 NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
+Nikolaos Pothitos <pothitos@di.uoa.gr>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
+Nuno <rare-magma@posteo.eu>
 OSecret <135510162+OLSecret@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
@@ -504,6 +538,7 @@ Pavel Zloi <github.com@drteam.rocks>
 Pavol Rusnak <pavol@rusnak.io>
 Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
 Pedro Cuenca <pedro@huggingface.co>
+Peter <peter277@users.noreply.github.com>
 Peter Sugihara <peter@campsh.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
 Philip Taron <philip.taron@gmail.com>
@@ -529,9 +564,12 @@ Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Random Fly <renfei8@live.cn>
 Reinforce-II <fate@eastal.com>
+Rémy Oudompheng <oudomphe@phare.normalesup.org>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
+Reza Kakhki <rezakakhki.de@gmail.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
+Riccardo Orlando <Riccorl@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Rich Dougherty <rich@rd.nz>
 Richard Kiss <him@richardkiss.com>
@@ -544,6 +582,8 @@ Riley Stewart <ristew@users.noreply.github.com>
 Rinne <AsakusaRinne@gmail.com>
 Rinne <liu_yaohui1998@126.com>
 Robert Brisita <986796+rbrisita@users.noreply.github.com>
+Robert Collins <roberto.tomas.cuentas@gmail.com>
+Robert Ormandi <52251610+ormandi@users.noreply.github.com>
 Robert Sung-wook Shin <edp1096@users.noreply.github.com>
 Robey Holderith <robey@flaminglunchbox.net>
 Robyn <robyngraf@users.noreply.github.com>
@@ -559,7 +599,9 @@ Roni <sulpher@gmx.net>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
 Rowan Hart <rowanbhart@gmail.com>
+Ruan <47767371+ruanych@users.noreply.github.com>
 Ruchira Hasaranga <ruchira66@gmail.com>
+Rudi Servo <rudiservo@gmail.com>
 Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
 RunningLeon <maningsheng@sensetime.com>
@@ -623,12 +665,14 @@ Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
+Sukriti Sharma <Ssukriti@users.noreply.github.com>
 SuperUserNameMan <yoann@terminajones.com>
 Sutou Kouhei <kou@cozmixng.org>
 Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
 Taikono-Himazin <kazu@po.harenet.ne.jp>
 Tameem <113388789+AhmadTameem@users.noreply.github.com>
 Tamotsu Takahashi <ttakah+github@gmail.com>
+Tei Home <taiteitonghome@proton.me>
 Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
 Thatcher Chamberlin <j.thatcher.c@gmail.com>
 Theia Vogel <theia@vgel.me>
@@ -640,6 +684,7 @@ Tim Miller <drasticactions@users.noreply.github.com>
 Tim Wang <overocean@gmail.com>
 Timmy Knight <r2d2fish@gmail.com>
 Timothy Cronin <40186632+4imothy@users.noreply.github.com>
+Ting Lou <louting@189.cn>
 Ting Lou <ting.lou@gmail.com>
 Ting Sun <suntcrick@gmail.com>
 Tobias Lütke <tobi@shopify.com>
@@ -661,6 +706,7 @@ Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
+Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
 Vali Malinoiu <0x4139@gmail.com>
 Victor Nogueira <felladrin@gmail.com>
@@ -673,13 +719,17 @@ Vladimir Malyutin <first-leon@yandex.ru>
 Vladimir Zorin <vladimir@deviant.guru>
 VoidIsVoid <343750470@qq.com>
 Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
+Wang Qin <37098874+wangqin0@users.noreply.github.com>
+Wang Ran (汪然) <wangr@smail.nju.edu.cn>
 WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
 Weird Constructor <weirdconstructor@gmail.com>
 Welby Seely <welbyseely@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
 William Tambellini <william.tambellini@gmail.com>
+William Tambellini <wtambellini@sdl.com>
 Willy Tarreau <w@1wt.eu>
+Woof Dog <197125663+woof-dog@users.noreply.github.com>
 Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
 Wu Jian Ping <wujjpp@hotmail.com>
 Wu Jian Ping <wujp@greatld.com>
@@ -692,6 +742,7 @@ Xie Yanbo <xieyanbo@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
 Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
 Xuan Son Nguyen <thichthat@gmail.com>
+Xuan-Son Nguyen <thichthat@gmail.com>
 Yaiko <elyaiko@hotmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
 Yaroslav <yaroslav.yashin@me.com>
@@ -702,7 +753,9 @@ Yoshi Suhara <y.suhara@gmail.com>
 Yoshi Suhara <ysuhara@nvidia.com>
 Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
+Yüg <eugeniosegalaweb@gmail.com>
 Yui <dev@sleepyyui.com>
+Yun Dou <dixyes@gmail.com>
 Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
 Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
 Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
@@ -714,18 +767,23 @@ Zhang Peiyuan <a1286225768@gmail.com>
 Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
 Zhiyuan Li <lizhiyuan@uniartisan.com>
+Zhiyuan Li <uniartisan2017@gmail.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
 Zsapi <martin1.zsapka@gmail.com>
 a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
+a3sh <38979186+A3shTnT@users.noreply.github.com>
 adel boussaken <netdur@gmail.com>
 afrideva <95653597+afrideva@users.noreply.github.com>
+ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
 agray3 <agray3@users.noreply.github.com>
 akawrykow <142945436+akawrykow@users.noreply.github.com>
+alek3y <44779186+alek3y@users.noreply.github.com>
 alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
 alwqx <kenan3015@gmail.com>
+amd-dwang <dong.wang@amd.com>
 amd-lalithnc <lalithnc@amd.com>
 amritahs-ibm <amritahs@linux.vnet.ibm.com>
 andrijdavid <david@geek.mg>
@@ -737,6 +795,7 @@ arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
 ardfork <134447697+ardfork@users.noreply.github.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
+aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
 automaticcat <daogiatuank54@gmail.com>
 awatuna <23447591+awatuna@users.noreply.github.com>
@@ -751,12 +810,14 @@ bryanSwk <93190252+bryanSwk@users.noreply.github.com>
 bsilvereagle <bsilvereagle@users.noreply.github.com>
 bssrdf <merlintiger@hotmail.com>
 byte-6174 <88070277+byte-6174@users.noreply.github.com>
+cduk <19917266+cduk@users.noreply.github.com>
 cebtenzzre <cebtenzzre@gmail.com>
 chaihahaha <chai836275709@gmail.com>
 chiranko <96988916+chiranko@users.noreply.github.com>
 clibdev <52199778+clibdev@users.noreply.github.com>
 clyang <clyang@clyang.net>
 cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
+codezjx <code.zjx@gmail.com>
 coezbek <c.oezbek@gmail.com>
 comex <comexk@gmail.com>
 compilade <113953597+compilade@users.noreply.github.com>
@@ -780,14 +841,17 @@ drbh <david.richard.holtz@gmail.com>
 ds5t5 <145942675+ds5t5@users.noreply.github.com>
 dylan <canardleteer@users.noreply.github.com>
 eastriver <lee@eastriver.dev>
+ebraminio <ebrahim@gnu.org>
 ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
 fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 fengerhu1 <2748250768@qq.com>
+fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
+gn64 <yukikaze.jp@gmail.com>
 goerch <jhr.walter@t-online.de>
 grahameth <96447521+grahameth@users.noreply.github.com>
 gtygo <gtydoit@gmail.com>
@@ -812,10 +876,12 @@ icppWorld <124377669+icppWorld@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
 intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
+issixx <46835150+issixx@users.noreply.github.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
 jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
 jdomke <28772296+jdomke@users.noreply.github.com>
+jiahao su <damow890@gmail.com>
 jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
 joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@@ -828,6 +894,7 @@ junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
 jwj7140 <32943891+jwj7140@users.noreply.github.com>
 k.h.lai <adrian.k.h.lai@outlook.com>
 kaizau <kaizau@users.noreply.github.com>
+kallewoof <kalle.alm@gmail.com>
 kalomaze <66376113+kalomaze@users.noreply.github.com>
 kang <tpdns9032100@gmail.com>
 katsu560 <118887472+katsu560@users.noreply.github.com>
@@ -835,6 +902,7 @@ kchro3 <62481661+kchro3@users.noreply.github.com>
 khimaros <me@khimaros.com>
 kiltyj <kiltyj@gmail.com>
 klosax <131523366+klosax@users.noreply.github.com>
+krystiancha <krystian@krystianch.com>
 kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
 kunnis <kunnis@users.noreply.github.com>
 kuronekosaiko <EvanChanJ@163.com>
@@ -847,6 +915,8 @@ ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
 leo-pony <nengjunma@outlook.com>
+lexasub <lexakopp2212@gmail.com>
+lhez <quic_lih@quicinc.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
 liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
@@ -855,10 +925,13 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
 luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
+mahorozte <41834471+mahorozte@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
 maor-ps <154728172+maor-ps@users.noreply.github.com>
+mashdragon <122402293+mashdragon@users.noreply.github.com>
 matiaslin <45382001+matiaslin@users.noreply.github.com>
+matt23654 <matthew.webber@protonmail.com>
 matteo <matteogeniaccio@yahoo.it>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
@@ -868,6 +941,7 @@ mmyjona <jonathan.gonse@gmail.com>
 momonga <115213907+mmnga@users.noreply.github.com>
 momonga <146910567+mmngays@users.noreply.github.com>
 moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
+musoles <135031143+musoles@users.noreply.github.com>
 mzcu <milos.cubrilo@gmail.com>
 nanahi <130121847+na-na-hi@users.noreply.github.com>
 ngc92 <7938269+ngc92@users.noreply.github.com>
@@ -885,6 +959,7 @@ oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
 pculliton <phillipculliton@gmail.com>
+peidaqi <peidaqi@gmail.com>
 pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
 piDack <104877312+piDack@users.noreply.github.com>
@@ -892,10 +967,12 @@ pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
 qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
+qingy1337 <qxli2@students.everettcc.edu>
 qouoq <qouoq@fastmail.com>
 qunash <anzoria@gmail.com>
 rabidcopy <rabidcopy@yahoo.com>
 rankaiyx <rankaiyx@rankaiyx.com>
+redbeard <bharrington@alticon.net>
 rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
 rhuddleston <ryan.huddleston@percona.com>
 rimoliga <53384203+rimoliga@users.noreply.github.com>
@@ -912,6 +989,7 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
+someone13574 <81528246+someone13574@users.noreply.github.com>
 standby24x7 <standby24x7@gmail.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
@@ -931,6 +1009,7 @@ uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
 uvos <devnull@uvos.xyz>
+uvos <philipp@uvos.xyz>
 valiray <133289098+valiray@users.noreply.github.com>
 vb <vaibhavs10@gmail.com>
 vik <vikhyatk@gmail.com>
@@ -951,6 +1030,7 @@ xaedes <xaedes@googlemail.com>
 xctan <axunlei@gmail.com>
 xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
+ymcki <84055651+ymcki@users.noreply.github.com>
 yuiseki <yuiseki@gmail.com>
 yuri@FreeBSD <yurivict@users.noreply.github.com>
 zakkor <edward.partenie@gmail.com>
@@ -963,4 +1043,5 @@ zrm <trustiosity.zrm@gmail.com>
 杨朱 · Kiki <baofa.fan@daocloud.io>
 源文雨 <41315874+fumiama@users.noreply.github.com>
 蕭澧邦 <45505768+shou692199@users.noreply.github.com>
+谢乃闻 <sienaiwun@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

CMakeLists.txt

@@ -233,4 +233,4 @@ configure_file(cmake/llama.pc.in
         @ONLY)

 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        DESTINATION lib/pkgconfig)
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)

README.md

@@ -136,6 +136,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
+- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
@@ -188,6 +189,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [ramalama](https://github.com/containers/ramalama) (MIT)
 - [semperai/amica](https://github.com/semperai/amica) (MIT)
 - [withcatai/catai](https://github.com/withcatai/catai) (MIT)
+- [Autopen](https://github.com/blackhole89/autopen) (GPL)
 </details>

cmake/llama.pc.in

@@ -1,10 +1,10 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/lib
-includedir=${prefix}/include
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
-Version: @PROJECT_VERSION@
+Version: @LLAMA_INSTALL_VERSION@
 Libs: -L${libdir} -lggml -lggml-base -lllama
 Cflags: -I${includedir}
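The switch to @CMAKE_INSTALL_FULL_LIBDIR@ and @CMAKE_INSTALL_FULL_INCLUDEDIR@ keeps the generated llama.pc correct on distributions that install libraries under lib64 or multiarch directories rather than a plain lib/. A minimal consumer sketch to sanity-check an installed llama.pc (the file name check.cpp and the build line are illustrative, not part of this commit):

// check.cpp: confirms that pkg-config resolves llama's include and library paths.
// Build: g++ -std=c++17 check.cpp $(pkg-config --cflags --libs llama)
#include <cstdio>

#include "llama.h"

int main() {
    // llama_print_system_info() is part of the public llama.h API;
    // printing it proves the header was found and libllama linked.
    std::printf("%s\n", llama_print_system_info());
    return 0;
}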

common/arg.cpp

@@ -1465,15 +1465,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--list-devices"},
         "print list of available devices and exit",
         [](common_params &) {
-            printf("Available devices:\n");
+            std::vector<ggml_backend_dev_t> rpc_devices;
+            std::vector<ggml_backend_dev_t> all_devices;
             for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                 auto * dev = ggml_backend_dev_get(i);
                 if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_devices.push_back(dev);
+                    } else {
+                        all_devices.push_back(dev);
+                    }
+                }
+            }
+            // insert RPC devices in front
+            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
+            printf("Available devices:\n");
+            for (size_t i = 0; i < all_devices.size(); ++i) {
+                auto * dev = all_devices[i];
                 size_t free, total;
                 ggml_backend_dev_memory(dev, &free, &total);
                 printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
             }
-            }
             exit(0);
         }
     ));
@@ -2311,5 +2324,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
+
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));

     return ctx_arg;
 }
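The reordering above lists RPC devices ahead of local GPUs in --list-devices output while preserving the relative order within each group. A sketch of the same effect with std::stable_partition, assuming only the ggml backend device API already used in the diff (this helper is illustrative, not part of the commit):

#include <algorithm>
#include <string>
#include <vector>

#include "ggml-backend.h"

// Moves devices whose backend registry is named "RPC" to the front,
// keeping the relative order of both groups - the same result as
// collecting into rpc_devices/all_devices and splicing them together.
static void rpc_devices_first(std::vector<ggml_backend_dev_t> & devices) {
    std::stable_partition(devices.begin(), devices.end(), [](ggml_backend_dev_t dev) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        return ggml_backend_reg_name(reg) == std::string("RPC");
    });
}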

View file

@ -33,6 +33,29 @@ struct chat_template_caps {
bool requires_typed_content = false; bool requires_typed_content = false;
}; };
struct chat_template_inputs {
nlohmann::ordered_json messages;
nlohmann::ordered_json tools;
bool add_generation_prompt = true;
nlohmann::ordered_json extra_context;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
struct chat_template_options {
bool apply_polyfills = true;
bool use_bos_token = true;
bool use_eos_token = true;
bool define_strftime_now = true;
bool polyfill_tools = true;
bool polyfill_tool_call_examples = true;
bool polyfill_tool_calls = true;
bool polyfill_tool_responses = true;
bool polyfill_system_role = true;
bool polyfill_object_arguments = true;
bool polyfill_typed_content = true;
};
class chat_template { class chat_template {
private: private:
@ -41,6 +64,7 @@ class chat_template {
std::string bos_token_; std::string bos_token_;
std::string eos_token_; std::string eos_token_;
std::shared_ptr<minja::TemplateNode> template_root_; std::shared_ptr<minja::TemplateNode> template_root_;
std::string tool_call_example_;
std::string try_raw_render( std::string try_raw_render(
const nlohmann::ordered_json & messages, const nlohmann::ordered_json & messages,
@ -49,7 +73,18 @@ class chat_template {
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
{ {
try { try {
auto prompt = apply(messages, tools, add_generation_prompt, extra_context, /* adjust_inputs= */ false); chat_template_inputs inputs;
inputs.messages = messages;
inputs.tools = tools;
inputs.add_generation_prompt = add_generation_prompt;
inputs.extra_context = extra_context;
// Use fixed date for tests
inputs.now = std::chrono::system_clock::from_time_t(0);
chat_template_options opts;
opts.apply_polyfills = false;
auto prompt = apply(inputs, opts);
// fprintf(stderr, "try_raw_render: %s\n", prompt.c_str()); // fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
return prompt; return prompt;
} catch (const std::exception & e) { } catch (const std::exception & e) {
@ -176,6 +211,58 @@ class chat_template {
caps_.supports_tool_responses = contains(out, "Some response!"); caps_.supports_tool_responses = contains(out, "Some response!");
caps_.supports_tool_call_id = contains(out, "call_911_"); caps_.supports_tool_call_id = contains(out, "call_911_");
} }
try {
if (!caps_.supports_tools) {
const json user_msg {
{"role", "user"},
{"content", "Hey"},
};
const json args {
{"arg1", "some_value"},
};
const json tool_call_msg {
{"role", "assistant"},
{"content", nullptr},
{"tool_calls", json::array({
{
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"name", "tool_name"},
{"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
}},
},
})},
};
std::string prefix, full;
{
chat_template_inputs inputs;
inputs.messages = json::array({user_msg});
inputs.add_generation_prompt = true;
prefix = apply(inputs);
}
{
chat_template_inputs inputs;
inputs.messages = json::array({user_msg, tool_call_msg});
inputs.add_generation_prompt = false;
full = apply(inputs);
}
if (full.find(prefix) != 0) {
if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
prefix = prefix.substr(0, prefix.size() - eos_token_.size());
}
}
if (full.find(prefix) != 0) {
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
}
tool_call_example_ = full.substr(prefix.size());
}
} catch (const std::exception & e) {
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
}
} }
const std::string & source() const { return source_; } const std::string & source() const { return source_; }
@ -183,28 +270,72 @@ class chat_template {
const std::string & eos_token() const { return eos_token_; } const std::string & eos_token() const { return eos_token_; }
const chat_template_caps & original_caps() const { return caps_; } const chat_template_caps & original_caps() const { return caps_; }
// Deprecated, please use the form with chat_template_inputs and chat_template_options
std::string apply( std::string apply(
const nlohmann::ordered_json & messages, const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools, const nlohmann::ordered_json & tools,
bool add_generation_prompt, bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(), const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
bool adjust_inputs = true) const bool apply_polyfills = true)
{
fprintf(stderr, "[%s] Deprecated!\n", __func__);
chat_template_inputs inputs;
inputs.messages = messages;
inputs.tools = tools;
inputs.add_generation_prompt = add_generation_prompt;
inputs.extra_context = extra_context;
inputs.now = std::chrono::system_clock::now();
chat_template_options opts;
opts.apply_polyfills = apply_polyfills;
return apply(inputs, opts);
}
std::string apply(
const chat_template_inputs & inputs,
const chat_template_options & opts = chat_template_options()) const
{ {
json actual_messages; json actual_messages;
auto needs_adjustments = adjust_inputs && (false auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|| !caps_.supports_system_role auto has_tool_calls = false;
|| !caps_.supports_tools auto has_tool_responses = false;
|| !caps_.supports_tool_responses auto has_string_content = false;
|| !caps_.supports_tool_calls for (const auto & message : inputs.messages) {
|| caps_.requires_object_arguments if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
|| caps_.requires_typed_content has_tool_calls = true;
}
if (message.contains("role") && message["role"] == "tool") {
has_tool_responses = true;
}
if (message.contains("content") && message["content"].is_string()) {
has_string_content = true;
}
}
auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
auto needs_polyfills = opts.apply_polyfills && (false
|| polyfill_system_role
|| polyfill_tools
|| polyfill_tool_calls
|| polyfill_tool_responses
|| polyfill_object_arguments
|| polyfill_typed_content
); );
if (needs_adjustments) {
if (needs_polyfills) {
actual_messages = json::array(); actual_messages = json::array();
auto add_message = [&](const json & msg) { auto add_message = [&](const json & msg) {
if (caps_.requires_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) { if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
actual_messages.push_back({ actual_messages.push_back({
{"role", msg.at("role")}, {"role", msg.at("role")},
{"content", {{ {"content", {{
@ -227,9 +358,17 @@ class chat_template {
pending_system.clear(); pending_system.clear();
} }
}; };
auto needs_tools_in_system = !tools.is_null() && tools.size() > 0 && !caps_.supports_tools;
for (const auto & message_ : needs_tools_in_system ? add_system(messages, "Available tools: " + tools.dump(2)) : messages) { json adjusted_messages;
if (polyfill_tools) {
adjusted_messages = add_system(inputs.messages,
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
} else {
adjusted_messages = inputs.messages;
}
for (const auto & message_ : adjusted_messages) {
auto message = message_; auto message = message_;
if (!message.contains("role") || !message.contains("content")) { if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump()); throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
@ -237,7 +376,7 @@ class chat_template {
std::string role = message.at("role"); std::string role = message.at("role");
if (message.contains("tool_calls")) { if (message.contains("tool_calls")) {
if (caps_.requires_object_arguments || !caps_.supports_tool_calls) { if (polyfill_object_arguments || polyfill_tool_calls) {
for (auto & tool_call : message.at("tool_calls")) { for (auto & tool_call : message.at("tool_calls")) {
if (tool_call["type"] == "function") { if (tool_call["type"] == "function") {
auto & function = tool_call.at("function"); auto & function = tool_call.at("function");
@ -252,7 +391,7 @@ class chat_template {
}
}
}
if (polyfill_tool_calls) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
@ -279,7 +418,7 @@ class chat_template {
message.erase("tool_calls");
}
}
if (polyfill_tool_responses && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", {
@ -296,7 +435,7 @@ class chat_template {
message.erase("name");
}
if (!message["content"].is_null() && polyfill_system_role) {
std::string content = message.at("content");
if (role == "system") {
if (!pending_system.empty()) pending_system += "\n";
@ -315,28 +454,36 @@ class chat_template {
}
add_message(message);
}
flush_sys();
} else {
actual_messages = inputs.messages;
}
auto context = minja::Context::make(json({
{"messages", actual_messages},
{"add_generation_prompt", inputs.add_generation_prompt},
}));
context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
if (opts.define_strftime_now) {
auto now = inputs.now;
context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
args.expectArgs("strftime_now", {1, 1}, {0, 0});
auto format = args.args[0].get<std::string>();
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
std::ostringstream ss;
ss << std::put_time(&local_time, format.c_str());
return ss.str();
}));
}
if (!inputs.tools.is_null()) {
context->set("tools", minja::Value(inputs.tools));
}
if (!inputs.extra_context.is_null()) {
for (auto & kv : inputs.extra_context.items()) {
context->set(kv.key(), minja::Value(kv.value()));
}
}
@ -353,7 +500,7 @@ class chat_template {
std::string existing_system = messages_with_system.at(0).at("content");
messages_with_system[0] = json {
{"role", "system"},
{"content", existing_system + "\n\n" + system_prompt},
};
} else {
messages_with_system.insert(messages_with_system.begin(), json {

View file

@ -163,6 +163,28 @@ static void foreach_function(const json & tools, const std::function<void(const
}
}
static std::string apply(
const common_chat_template & tmpl,
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
{
minja::chat_template_inputs tmpl_inputs;
tmpl_inputs.messages = messages;
tmpl_inputs.tools = tools;
tmpl_inputs.add_generation_prompt = add_generation_prompt;
tmpl_inputs.extra_context = extra_context;
// TODO: add flag to control date/time, if only for testing purposes.
// tmpl_inputs.now = std::chrono::system_clock::now();
minja::chat_template_options tmpl_opts;
tmpl_opts.use_bos_token = false;
tmpl_opts.use_eos_token = false;
return tmpl.apply(tmpl_inputs, tmpl_opts);
}
static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
@ -244,7 +266,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
@ -310,7 +332,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema)); builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
}, grammar_options); }, grammar_options);
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true}); data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data; return data;
} }
@ -360,12 +382,12 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
"<|END_THINKING|>", "<|END_THINKING|>",
"<|END_ACTION|>", "<|END_ACTION|>",
}; };
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
return data; return data;
} }
static common_chat_msg common_chat_parse_command_r7b(const std::string & input) { static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
static std::regex response_regex("<\\|START_RESPONSE\\|>(.*?)<\\|END_RESPONSE\\|>"); static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
std::smatch match; std::smatch match;
@ -477,7 +499,7 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
builder.add_rule("root", string_join(tool_rules, " | ")); builder.add_rule("root", string_join(tool_rules, " | "));
}, grammar_options); }, grammar_options);
data.additional_stops.push_back("<|eom_id|>"); data.additional_stops.push_back("<|eom_id|>");
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, { data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
{"tools_in_user_message", false}, {"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools}, {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
}); });
@ -542,7 +564,8 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
};
builder.add_rule("root", "\"<tool▁calls▁begin>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
}, grammar_options);
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
return data;
}
@ -556,10 +579,10 @@ static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input)
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
fprintf(stderr, "%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
{"datetime", "Jan 29 2025 13:00:00 GMT"},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@ -603,7 +626,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != "required";
@ -730,7 +753,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false}); data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
}, grammar_options); }, grammar_options);
data.prompt = tmpl.apply(inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// TODO: if (has_raw_python) // TODO: if (has_raw_python)
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1; data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
return data; return data;
@ -785,7 +808,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
data.preserved_tokens = { "</tool_call>" };
}, grammar_options);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
return data;
}
@ -846,7 +869,7 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input)
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {

View file

@ -1869,11 +1869,19 @@ std::string common_chat_format_example(const common_chat_template & tmpl, bool u
return common_chat_apply_template(tmpl, msgs, true, use_jinja);
}
#define CHATML_TEMPLATE_SRC \
"{%- for message in messages -%}\n" \
" {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
"{%- endfor -%}\n" \
"{%- if add_generation_prompt -%}\n" \
" {{- '<|im_start|>assistant\n' -}}\n" \
"{%- endif -%}"
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
{
std::string default_template_src;
std::string template_tool_use_src;
bool has_explicit_template = !chat_template_override.empty();
if (chat_template_override.empty()) {
auto str = llama_model_chat_template(model, /* name */ nullptr);
@ -1886,21 +1894,17 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model
template_tool_use_src = str;
has_explicit_template = true;
}
} else {
default_template_src = chat_template_override;
} }
if (default_template_src.empty() || default_template_src == "chatml") {
if (!template_tool_use_src.empty()) {
default_template_src = template_tool_use_src;
} else {
default_template_src = CHATML_TEMPLATE_SRC;
}
}
auto vocab = llama_model_get_vocab(model);
const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
if (token == LLAMA_TOKEN_NULL) {
if (default_template_src.find(jinja_variable_name) != std::string::npos
@ -1914,13 +1918,22 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model
};
auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
try {
return {
has_explicit_template,
std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
template_tool_use_src.empty()
? nullptr
: std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
};
} catch (const std::exception & e) {
LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
return {
has_explicit_template,
std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
nullptr,
};
}
}
//

View file

@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
};
}
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx
);
}
#else

View file

@ -2,6 +2,7 @@
#include "ggml.h" // for ggml_log_level #include "ggml.h" // for ggml_log_level
#define LOG_CLR_TO_EOL "\033[K\r"
#define LOG_COL_DEFAULT "\033[0m" #define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m" #define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m" #define LOG_COL_RED "\033[31m"

View file

@ -2194,7 +2194,7 @@ private:
}
TemplateTokenVector tokenize() {
static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
static std::regex expr_open_regex(R"(\{\{([-~])?)");
static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
@ -2615,6 +2615,7 @@ inline std::shared_ptr<Context> Context::builtins() {
}));
globals.set("join", simple_function("join", { "items", "d" }, [](const std::shared_ptr<Context> &, Value & args) {
auto do_join = [](Value & items, const std::string & sep) {
if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
std::ostringstream oss;
auto first = true;
for (size_t i = 0, n = items.size(); i < n; ++i) {
@ -2695,6 +2696,10 @@ inline std::shared_ptr<Context> Context::builtins() {
return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
auto & items = args.args[0];
if (items.is_null())
return Value::array();
if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
auto filter_fn = context->get(args.args[1]);
if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
@ -2772,6 +2777,7 @@ inline std::shared_ptr<Context> Context::builtins() {
auto & items = args.args[0];
if (items.is_null())
return Value::array();
if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
auto attr_name = args.args[1].get<std::string>();
bool has_test = false;

View file

@ -125,21 +125,66 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
## CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.
#### Download directly from NVIDIA
You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
#### Compile and run inside a Fedora Toolbox Container
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
**Recommended for:**
- *Particularly convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/), such as [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default on [Fedora Workstation](https://fedoraproject.org/workstation/) and the [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- Toolbox packages are *optionally* available for [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), and [Ubuntu](https://ubuntu.com/download).
### Compilation
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
### Override Compute Capability Specifications
If `nvcc` cannot detect your GPU, you may get compile warnings such as:
```text
nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
```
To override the `native` GPU detection:
#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute Capability"](https://developer.nvidia.com/cuda-gpus).
```text
GeForce RTX 4090 8.9
GeForce RTX 3080 Ti 8.6
GeForce RTX 3070 8.6
```
#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```
### Runtime CUDA environment variables
You may set the [CUDA environment variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
```bash
# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
```
### Unified Memory
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
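As a quick illustration (a sketch only; the model path and offload count below are placeholders, not part of the documented interface), a single run with unified memory enabled might look like:
```bash
# Allow CUDA allocations to fall back to system RAM instead of failing when VRAM runs out.
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m /srv/models/llama.gguf -ngl 99 -p "Hello"
```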
### Performance Tuning
The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |

View file

@ -1,17 +1,16 @@
# Setting Up CUDA on Fedora
In this guide we set up [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for:
- [Fedora Workstation](https://fedoraproject.org/workstation/)
- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
- [Fedora Spins](https://fedoraproject.org/spins)
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.5`, `Arch Linux`, and `Ubuntu`.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Using the Fedora 41 CUDA Repository](#using-the-fedora-41-cuda-repository)
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
@ -29,44 +28,33 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
## Prerequisites
- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
- **NVIDIA Drivers and Graphics Card installed on Host System (recommended)** To run CUDA programs, such as `llama.cpp`, the host should be set up to access your NVIDIA hardware. Fedora hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
- **Internet connectivity** to download packages.
### Using the Fedora 41 CUDA Repository
The latest release is 41.
- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)
**Note:** We recommend using a toolbox environment to prevent system conflicts.
## Creating a Fedora Toolbox Environment
This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using the Fedora Toolbox allows us to install the necessary packages without affecting the host system.
**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.
1. **Create a Fedora 41 Toolbox:**
```bash
toolbox create --image registry.fedoraproject.org/fedora-toolbox:41 --container fedora-toolbox-41-cuda
```
2. **Enter the Toolbox:**
```bash
toolbox enter --container fedora-toolbox-41-cuda
```
Inside the toolbox, you have root privileges and can install packages without affecting the host system.
@ -85,7 +73,7 @@ We do not recommend installing on the host system, as Fedora 39 is out-of-mainte
sudo dnf install vim-default-editor --allowerasing
```
The `--allowerasing` flag will allow the removal of the conflicting `nano-default-editor` package.
3. **Install Development Tools and Libraries:**
@ -100,7 +88,7 @@ We do not recommend installing on the host system, as Fedora 39 is out-of-mainte
Add the NVIDIA CUDA repository to your DNF configuration:
```bash
sudo dnf config-manager addrepo --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/cuda-fedora41.repo
```
After adding the repository, synchronize the package manager again:
@ -109,106 +97,62 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
```bash
ls -la /usr/lib64/libcuda.so.1
```
**Explanation:**
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contain the NVIDIA driver libraries required by CUDA; on hosts with NVIDIA drivers installed, the Fedora container will supply the host libraries.
### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPMs (with dependencies)
```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
```
#### 2. Update the RPM database to assume the installation of these packages.
```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
```
**Note:**
- The `--justdb` option only updates the RPM database, without touching the filesystem.
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
After manually installing the dependencies, run:
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
You should receive a message indicating the package is already installed:
```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Nothing to do.
```
## Installing the CUDA Meta-Package
@ -262,26 +206,33 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.
## Conclusion
You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 41 CUDA repository. By manually updating the RPM database and configuring the environment, you can develop CUDA applications without affecting your host system.
## Troubleshooting
- **Installation Failures:**
  - If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
  - You may use the `--excludepath` option with `rpm` to exclude conflicting files during manual RPM installations.
- **Rebooting the Container:**
  - Sometimes there may be a bug in the NVIDIA driver host passthrough (such as missing a shared library). Rebooting the container may solve this issue:
  ```bash
  # on the host system
  podman container restart --all
  ```
- **Environment Variables Not Set:**
  - If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
@ -291,10 +242,12 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t
## Additional Notes
- **Updating CUDA in the Future:**
  - Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
  - When an updated repository becomes available, adjust your `dnf` configuration accordingly.
- **Building `llama.cpp`:**
  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
  - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.

View file

@ -31,6 +31,11 @@ defer {
llama_model_free(model)
}
guard let vocab = llama_model_get_vocab(model) else {
print("Failed to get vocab")
exit(1)
}
var tokens = tokenize(text: prompt, add_bos: true)
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
@ -41,7 +46,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
let context = llama_init_from_model(model, context_params)
guard context != nil else {
print("Failed to initialize context")
exit(1)
@ -141,7 +146,7 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished
if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
@ -207,7 +212,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])
@ -218,12 +223,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false)
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
let check = llama_token_to_piece(
vocab,
token,
&result,
Int32(result.count),

View file

@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
actor LlamaContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var vocab: OpaquePointer
private var sampling: UnsafeMutablePointer<llama_sampler>
private var batch: llama_batch
private var tokens_list: [llama_token]
@ -47,6 +48,7 @@ actor LlamaContext {
self.sampling = llama_sampler_chain_init(sparams)
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
vocab = llama_model_get_vocab(model)
}
deinit {
@ -79,7 +81,7 @@ actor LlamaContext {
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_init_from_model(model, ctx_params)
guard let context else {
print("Could not load context!")
throw LlamaError.couldNotInitializeContext
@ -151,7 +153,7 @@ actor LlamaContext {
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])
@ -297,7 +299,7 @@ actor LlamaContext {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = []
for i in 0..<tokenCount {
@ -316,7 +318,7 @@ actor LlamaContext {
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -324,7 +326,7 @@ actor LlamaContext {
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {

View file

@ -50,3 +50,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-llava-clip-quantize-cli)
add_executable(${TARGET} clip-quantize-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View file

@ -0,0 +1,44 @@
# Quantizing CLIP Visual Projector
This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease the model size and improve inference speed, often with minimal impact on performance.
## Usage
To quantize a CLIP visual projector model, use the following command:
```sh
./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf <type>
```
After the quantization, the visual projector can be used freely with the existing LLAVA CLI tools (LLAVA, Qwen2VL, etc.).
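As a sketch of that workflow (the file paths here are placeholders), the quantized projector is passed to the existing CLI via `--mmproj`:
```sh
./bin/llama-llava-cli -m /path/to/llm.gguf \
    --mmproj /path/to/ggml-model-quantized.gguf \
    --image /path/to/image.jpg -p "Describe this image."
```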
### Arguments
- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format.
- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved.
- `<type>`: The quantization type to apply. This should be an integer corresponding to one of the quantization types defined in the `enum ggml_type`.
### Quantization Types
The following quantization types are supported, based on the `enum ggml_type` definition:
- `2` - `q4_0`: 4-bit quantization with a per-block scale.
- `3` - `q4_1`: 4-bit quantization with a per-block scale and minimum.
- `6` - `q5_0`: 5-bit quantization with a per-block scale.
- `7` - `q5_1`: 5-bit quantization with a per-block scale and minimum.
- `8` - `q8_0`: 8-bit quantization with a per-block scale.
### Example
To quantize a model using the `q4_0` quantization type, you would run:
```sh
./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2
```
This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method.
## Notes
- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements.
- The quantized model will typically be smaller in size and faster to run, making it more suitable for deployment in resource-constrained environments.

View file

@ -0,0 +1,59 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
static void print_usage(int argc, char ** argv) {
(void) argc;
fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = 6 - q5_0\n");
fprintf(stderr, " type = 7 - q5_1\n");
fprintf(stderr, " type = 8 - q8_0\n");
}
int main(int argc, char ** argv) {
if (argc != 4) {
print_usage(argc, argv);
return 1;
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const int itype = atoi(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
}
return 0;
}

View file

@ -2745,10 +2745,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
assert(itype < GGML_TYPE_COUNT);
ggml_type type = static_cast<ggml_type>(itype);
auto * ctx_clip = clip_model_load(fname_inp, 2);
@ -2801,8 +2799,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
}
}
// quantize only 2D tensors and bigger than block size
quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
if (quantize) {
new_type = type;

View file

@ -346,7 +346,7 @@ class HttpClient {
if (!output_file.empty()) {
output_file_partial = output_file + ".partial";
if (!out.open(output_file_partial, "ab")) {
printe("Failed to open file for writing\n");
return 1;
}
@ -535,8 +535,7 @@ class HttpClient {
static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
const std::string & progress_suffix) {
printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
}
// Function to write data to a file
static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
@ -797,16 +796,13 @@ class LlamaData {
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
resolve_model(opt.model_);
printe("\r" LOG_CLR_TO_EOL "Loading model");
llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
if (!model) {
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
}
printe("\r" LOG_CLR_TO_EOL);
return model;
}
@ -848,7 +844,15 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll
});
}
try {
minja::chat_template_inputs tmpl_inputs;
tmpl_inputs.messages = messages;
tmpl_inputs.add_generation_prompt = append;
minja::chat_template_options tmpl_opts;
tmpl_opts.use_bos_token = false;
tmpl_opts.use_eos_token = false;
auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
llama_data.fmtted.resize(result.size() + 1);
memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
return result.size();
@ -961,10 +965,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
static int read_user_input(std::string & user_input) {
static const char * prompt_prefix = "> ";
#ifdef WIN32
printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);
std::getline(std::cin, user_input);
if (std::cin.eof()) {

View file

@ -220,7 +220,7 @@ services:
The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
The web UI is developed using:
- `react` framework for frontend development
- `tailwindcss` and `daisyui` for styling
- `vite` for build tooling

Binary file not shown.

View file

@ -334,24 +334,24 @@ struct server_task {
if (data.contains("json_schema") && !data.contains("grammar")) { if (data.contains("json_schema") && !data.contains("grammar")) {
try { try {
auto schema = json_value(data, "json_schema", json::object()); auto schema = json_value(data, "json_schema", json::object());
LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str()); SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
params.sampling.grammar = json_schema_to_grammar(schema); params.sampling.grammar = json_schema_to_grammar(schema);
LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str()); SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
} catch (const std::exception & e) { } catch (const std::exception & e) {
throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
} }
} else { } else {
params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str()); SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy); params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false"); SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
} }
{ {
auto it = data.find("chat_format"); auto it = data.find("chat_format");
if (it != data.end()) { if (it != data.end()) {
params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>()); params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str()); SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
} else { } else {
params.oaicompat_chat_format = defaults.oaicompat_chat_format; params.oaicompat_chat_format = defaults.oaicompat_chat_format;
} }
@ -367,12 +367,12 @@ struct server_task {
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true); auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) { if (ids.size() == 1) {
LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str()); SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
params.sampling.grammar_trigger_tokens.push_back(ids[0]); params.sampling.grammar_trigger_tokens.push_back(ids[0]);
params.sampling.preserved_tokens.insert(ids[0]); params.sampling.preserved_tokens.insert(ids[0]);
continue; continue;
} }
LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str()); SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
params.sampling.grammar_trigger_words.push_back(trigger); params.sampling.grammar_trigger_words.push_back(trigger);
} }
} }
@ -381,11 +381,11 @@ struct server_task {
for (const auto & t : *preserved_tokens) { for (const auto & t : *preserved_tokens) {
auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true); auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) { if (ids.size() == 1) {
LOG_DBG("Preserved token: %d\n", ids[0]); SRV_DBG("Preserved token: %d\n", ids[0]);
params.sampling.preserved_tokens.insert(ids[0]); params.sampling.preserved_tokens.insert(ids[0]);
} else { } else {
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens. // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str()); SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
} }
} }
} }
@ -717,7 +717,7 @@ struct server_task_result_cmpl_final : server_task_result {
std::string finish_reason = "length"; std::string finish_reason = "length";
common_chat_msg msg; common_chat_msg msg;
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
LOG_DBG("Parsing chat message: %s\n", content.c_str()); SRV_DBG("Parsing chat message: %s\n", content.c_str());
msg = common_chat_parse(content, oaicompat_chat_format); msg = common_chat_parse(content, oaicompat_chat_format);
finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls"; finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
} else { } else {
@ -1885,7 +1885,7 @@ struct server_context {
} }
if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) { if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_from_model(model, "chatml"); chat_templates = common_chat_templates_from_model(model, "chatml");
} else { } else {
chat_templates = common_chat_templates_from_model(model, params_base.chat_template); chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@ -3353,10 +3353,12 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
return; return;
} }
LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
LOG_DBG("request: %s\n", req.body.c_str()); SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
LOG_DBG("response: %s\n", res.body.c_str());
SRV_DBG("request: %s\n", req.body.c_str());
SRV_DBG("response: %s\n", res.body.c_str());
} }
std::function<void(int)> shutdown_handler; std::function<void(int)> shutdown_handler;
@ -3439,9 +3441,13 @@ int main(int argc, char ** argv) {
message = "Unknown Exception"; message = "Unknown Exception";
} }
try {
json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
res_error(res, formatted_error); res_error(res, formatted_error);
} catch (const std::exception & e) {
LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str());
}
}); });
svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) { svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) {
@ -3854,7 +3860,9 @@ int main(int argc, char ** argv) {
try { try {
const auto & prompt = data.at("prompt"); const auto & prompt = data.at("prompt");
LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str()); // TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
tasks.reserve(tokenized_prompts.size()); tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) { for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@ -4370,6 +4378,9 @@ int main(int argc, char ** argv) {
res.set_content("Error: gzip is not supported by this browser", "text/plain"); res.set_content("Error: gzip is not supported by this browser", "text/plain");
} else { } else {
res.set_header("Content-Encoding", "gzip"); res.set_header("Content-Encoding", "gzip");
// COEP and COOP headers, required by pyodide (python interpreter)
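// (these make the page cross-origin isolated, which SharedArrayBuffer requires)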
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
} }
return false; return false;

View file

@ -13,8 +13,11 @@ def create_server():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template", "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
[ [
(None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None),
(None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None), (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None), (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'),
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),

View file

@ -67,8 +67,8 @@ WEATHER_TOOL = {
def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None): def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
n_predict = 512
global server global server
n_predict = 512
# server = ServerPreset.stories15m_moe() # server = ServerPreset.stories15m_moe()
server.jinja = True server.jinja = True
server.n_predict = n_predict server.n_predict = n_predict
@ -139,29 +139,49 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [ @pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
(TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
(TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
(TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
(TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
(PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
(TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
(PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
(TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
(TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
(PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
(TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
(PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
(PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"),
(TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
(TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
(PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
# TODO: fix these # TODO: fix these
# (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
# (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
]) ])
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None): def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
global server
n_predict = 512 n_predict = 512
server.n_slots = 1 server.n_slots = 1
server.jinja = True server.jinja = True
@ -169,10 +189,12 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
server.n_predict = n_predict server.n_predict = n_predict
server.model_hf_repo = hf_repo server.model_hf_repo = hf_repo
server.model_hf_file = None server.model_hf_file = None
if template_override: if isinstance(template_override, tuple):
(template_hf_repo, template_variant) = template_override (template_hf_repo, template_variant) = template_override
server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
elif isinstance(template_override, str):
server.chat_template = template_override
server.start(timeout_seconds=TIMEOUT_SERVER_START) server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={ res = server.make_request("POST", "/chat/completions", data={
"max_tokens": n_predict, "max_tokens": n_predict,
@ -251,33 +273,55 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
@pytest.mark.slow @pytest.mark.slow
@pytest.mark.parametrize("hf_repo,template_override", [ @pytest.mark.parametrize("hf_repo,template_override", [
("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
# ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
# ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
]) ])
def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | None] | None): def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None):
global server global server
n_predict = 512
server.n_slots = 1 server.n_slots = 1
server.jinja = True server.jinja = True
server.n_ctx = 8192 server.n_ctx = 8192
server.n_predict = 512 server.n_predict = n_predict
server.model_hf_repo = hf_repo server.model_hf_repo = hf_repo
server.model_hf_file = None server.model_hf_file = None
if template_override: if isinstance(template_override, tuple):
(template_hf_repo, template_variant) = template_override (template_hf_repo, template_variant) = template_override
server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
elif isinstance(template_override, str):
server.chat_template = template_override
server.start(timeout_seconds=TIMEOUT_SERVER_START) server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={ res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 256, "max_tokens": n_predict,
"messages": [ "messages": [
{"role": "user", "content": "What is the weather in Istanbul?"}, {"role": "user", "content": "What is the weather in Istanbul?"},
], ],
@ -298,19 +342,39 @@ def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | Non
@pytest.mark.slow @pytest.mark.slow
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
(None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
(None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
(None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
(None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
(None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
(None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
(None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
(None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
# Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
(None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
# (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
]) ])
def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None): def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
global server global server
server.n_slots = 1 server.n_slots = 1
server.jinja = True server.jinja = True
@ -318,10 +382,12 @@ def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo:
server.n_predict = 128 server.n_predict = 128
server.model_hf_repo = hf_repo server.model_hf_repo = hf_repo
server.model_hf_file = None server.model_hf_file = None
if template_override: if isinstance(template_override, tuple):
(template_hf_repo, template_variant) = template_override (template_hf_repo, template_variant) = template_override
server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
elif isinstance(template_override, str):
server.chat_template = template_override
server.start(timeout_seconds=TIMEOUT_SERVER_START) server.start(timeout_seconds=TIMEOUT_SERVER_START)
res = server.make_request("POST", "/chat/completions", data={ res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 256, "max_tokens": 256,

examples/server/webui/.gitignore vendored Normal file
View file

@ -0,0 +1,24 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

View file

@ -0,0 +1,10 @@
**/.vscode
**/.github
**/.git
**/.svn
**/.hg
**/node_modules
**/dist
**/build
*.config.js

View file

@ -0,0 +1,26 @@
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'
export default tseslint.config(
{ ignores: ['dist'] },
{
extends: [js.configs.recommended, ...tseslint.configs.recommended],
files: ['**/*.{ts,tsx}'],
languageOptions: {
ecmaVersion: 2020,
globals: globals.browser,
},
plugins: {
'react-hooks': reactHooks,
'react-refresh': reactRefresh,
},
rules: {
...reactHooks.configs.recommended.rules,
'react-refresh/only-export-components': 'off',
'@typescript-eslint/no-unused-vars': 'off',
},
},
)

View file

@ -1,341 +1,16 @@
<!DOCTYPE html> <!doctype html>
<html> <html>
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" /> <meta
<meta name="color-scheme" content="light dark"> name="viewport"
content="width=device-width, initial-scale=1, maximum-scale=1"
/>
<meta name="color-scheme" content="light dark" />
<title>🦙 llama.cpp - chat</title> <title>🦙 llama.cpp - chat</title>
</head> </head>
<body> <body>
<div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted --> <div id="root"></div>
<div class="flex flex-row drawer lg:drawer-open"> <script type="module" src="/src/main.tsx"></script>
<input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
<!-- sidebar -->
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
<div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div class="flex flex-row items-center justify-between mb-4 mt-4">
<h2 class="font-bold ml-4">Conversations</h2>
<!-- close sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
</svg>
</label>
</div>
<!-- list of conversations -->
<div :class="{
'btn btn-ghost justify-start': true,
'btn-active': messages.length === 0,
}" @click="newConversation">
+ New conversation
</div>
<div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)" dir="auto">
<span class="truncate">{{ conv.messages[0].content }}</span>
</div>
<div class="text-center text-xs opacity-40 mt-auto mx-4">
Conversations are saved to browser's localStorage
</div>
</div>
</div>
<!-- main view -->
<div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
<!-- header -->
<div class="flex flex-row items-center mt-6 mb-6">
<!-- open sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
</svg>
</label>
<div class="grow text-2xl font-bold ml-2">llama.cpp</div>
<!-- action buttons (top right) -->
<div class="flex items-center">
<div v-if="messages.length > 0" class="dropdown dropdown-end">
<!-- "..." button -->
<button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
</svg>
</button>
<!-- "delete" dropdown menu -->
<ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
<li @click="downloadConv(viewingConvId)"><a>Download</a></li>
<li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
</ul>
</div>
<div class="tooltip tooltip-bottom" data-tip="Settings">
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
<!-- settings button -->
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
</svg>
</button>
</div>
<!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
<div class="tooltip tooltip-bottom" data-tip="Themes">
<div class="dropdown dropdown-end dropdown-bottom">
<div tabindex="0" role="button" class="btn m-1">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
</svg>
</div>
<ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
<li>
<button
class="btn btn-sm btn-block btn-ghost justify-start"
:class="{ 'btn-active': selectedTheme === 'auto' }"
@click="setSelectedTheme('auto')">
auto
</button>
</li>
<li v-for="theme in themes">
<input
type="radio"
name="theme-dropdown"
class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
:aria-label="theme"
:value="theme"
:checked="selectedTheme === theme"
@click="setSelectedTheme(theme)" />
</li>
</ul>
</div>
</div>
</div>
</div>
<!-- chat messages -->
<div id="messages-list" class="flex flex-col grow overflow-y-auto">
<div class="mt-auto flex justify-center">
<!-- placeholder to shift the message to the bottom -->
{{ messages.length === 0 ? 'Send a message to start' : '' }}
</div>
<div v-for="msg in messages" class="group">
<message-bubble
:config="config"
:msg="msg"
:key="msg.id"
:is-generating="isGenerating"
:edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
:regenerate-msg="regenerateMsg"></message-bubble>
</div>
<!-- pending (ongoing) assistant message -->
<div id="pending-msg" class="group">
<message-bubble
v-if="pendingMsg"
:config="config"
:msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:show-thought-in-progress="config.showThoughtInProgress"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div>
</div>
<!-- chat input -->
<div class="flex flex-row items-center mt-8 mb-6">
<textarea
class="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
v-model="inputMsg"
@keydown.enter.exact.prevent="sendMessage"
id="msg-input"
dir="auto"
></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
</div>
</div>
</div>
<!-- modal for editing config -->
<dialog class="modal" :class="{'modal-open': showConfigDialog}">
<div class="modal-box">
<h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
<settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
<label class="form-control mb-2">
<div class="label">System Message</div>
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
</label>
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
<!-- Section: Other sampler settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Other sampler settings</summary>
<div class="collapse-content">
<!-- Samplers queue -->
<settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
<!-- Samplers -->
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Penalties settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Penalties settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Reasoning models -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Reasoning models</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
<span class="ml-4">Expand though process by default for generating message</span>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
<span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
</div>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2" v-if="isDev">
<!-- this button only shows in dev mode, used to import a demo conversation to test message rendering -->
<button class="btn" @click="debugImportDemoConv()">(debug) Import demo conversation</button>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
<span class="ml-4">Show tokens per second</span>
</div>
<label class="form-control mb-2">
<!-- Custom parameters input -->
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
</label>
</div>
</details>
</div>
<!-- action buttons -->
<div class="modal-action">
<button class="btn" @click="resetConfigDialog">Reset to default</button>
<button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
<button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
</div>
</div>
</dialog>
</div>
<!-- Template to be used as message bubble -->
<template id="message-bubble">
<div :class="{
'chat': true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
}">
<div :class="{
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
dir="auto"
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
<button class="btn mt-2" @click="editMsg()">Submit</button>
</template>
<template v-else>
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<div v-else dir="auto">
<details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
<summary class="collapse-title">
<span v-if="splitMsgContent.isThinking">
<span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
<b>Thinking</b>
</span>
<b v-else>Thought Process</b>
</summary>
<vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
</details>
<vue-markdown :source="splitMsgContent.content"></vue-markdown>
</div>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b><br/>
- Tokens: {{ timings.prompt_n }}<br/>
- Time: {{ timings.prompt_ms }} ms<br/>
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
<b>Generation</b><br/>
- Tokens: {{ timings.predicted_n }}<br/>
- Time: {{ timings.predicted_ms }} ms<br/>
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
</div>
</div>
</template>
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
📋 Copy
</button>
</div>
</template>
<!-- Template to be used by settings modal -->
<template id="settings-modal-short-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
<!-- Show help message on hovering on the input label -->
<div class="dropdown dropdown-hover">
<div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{{ configInfo[configKey] || '(no help message available)' }}
</div>
</div>
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
</label>
</template>
<script type="module" src="/src/main.js"></script>
</body> </body>
</html> </html>

File diff suppressed because it is too large

View file

@ -5,26 +5,55 @@
"type": "module", "type": "module",
"scripts": { "scripts": {
"dev": "vite", "dev": "vite",
"build": "vite build", "build": "tsc -b && vite build",
"preview": "vite preview", "format": "eslint . && prettier --write .",
"analyze": "ANALYZE=1 npx vite-bundle-visualizer" "lint": "eslint .",
}, "preview": "vite preview"
"devDependencies": {
"sass-embedded": "^1.83.0",
"vite": "^5.4.10"
}, },
"dependencies": { "dependencies": {
"@heroicons/react": "^2.2.0",
"@sec-ant/readable-stream": "^0.6.0", "@sec-ant/readable-stream": "^0.6.0",
"@vscode/markdown-it-katex": "^1.1.1", "@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20", "autoprefixer": "^10.4.20",
"daisyui": "^4.12.14", "daisyui": "^4.12.14",
"highlight.js": "^11.10.0", "highlight.js": "^11.10.0",
"katex": "^0.16.15", "katex": "^0.16.15",
"markdown-it": "^14.1.0",
"postcss": "^8.4.49", "postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",
"rehype-highlight": "^7.0.2",
"rehype-katex": "^7.0.1",
"remark-breaks": "^4.0.0",
"remark-gfm": "^4.0.0",
"remark-math": "^6.0.0",
"tailwindcss": "^3.4.15", "tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1", "textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3", "vite-plugin-singlefile": "^2.0.3"
"vue": "^3.5.13" },
"devDependencies": {
"@eslint/js": "^9.17.0",
"@types/markdown-it": "^14.1.2",
"@types/node": "^22.13.1",
"@types/react": "^18.3.18",
"@types/react-dom": "^18.3.5",
"@vitejs/plugin-react": "^4.3.4",
"eslint": "^9.17.0",
"eslint-plugin-react-hooks": "^5.0.0",
"eslint-plugin-react-refresh": "^0.4.16",
"globals": "^15.14.0",
"prettier": "^3.4.2",
"sass-embedded": "^1.83.4",
"typescript": "~5.6.2",
"typescript-eslint": "^8.18.2",
"vite": "^6.0.5"
},
"prettier": {
"trailingComma": "es5",
"tabWidth": 2,
"semi": true,
"singleQuote": true,
"bracketSameLine": false
} }
} }

View file

@ -11,7 +11,7 @@
{ {
"id": 1734087548327, "id": 1734087548327,
"role": "assistant", "role": "assistant",
"content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$", "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\n$2x + y = z$\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
"timings": { "timings": {
"prompt_n": 1, "prompt_n": 1,
"prompt_ms": 28.923, "prompt_ms": 28.923,

View file

@ -0,0 +1,47 @@
import { HashRouter, Outlet, Route, Routes } from 'react-router';
import Header from './components/Header';
import Sidebar from './components/Sidebar';
import { AppContextProvider, useAppContext } from './utils/app.context';
import ChatScreen from './components/ChatScreen';
import SettingDialog from './components/SettingDialog';
function App() {
return (
<HashRouter>
<div className="flex flex-row drawer lg:drawer-open">
<AppContextProvider>
<Routes>
<Route element={<AppLayout />}>
<Route path="/chat/:convId" element={<ChatScreen />} />
<Route path="*" element={<ChatScreen />} />
</Route>
</Routes>
</AppContextProvider>
</div>
</HashRouter>
);
}
function AppLayout() {
const { showSettings, setShowSettings } = useAppContext();
return (
<>
<Sidebar />
<div
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
id="main-scroll"
>
<Header />
<Outlet />
</div>
{
<SettingDialog
show={showSettings}
onClose={() => setShowSettings(false)}
/>
}
</>
);
}
export default App;

View file

@ -0,0 +1,92 @@
import daisyuiThemes from 'daisyui/src/theming/themes';
import { isNumeric } from './utils/misc';
export const isDev = import.meta.env.MODE === 'development';
// constants
export const BASE_URL = new URL('.', document.baseURI).href
.toString()
.replace(/\/$/, '');
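// e.g. if document.baseURI is 'http://localhost:8080/index.html', BASE_URL becomes 'http://localhost:8080'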
export const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
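// e.g. the dry_* keys below keep all DRY sampler options grouped under one prefix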
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
// experimental features
pyIntepreterEnabled: false,
};
export const CONFIG_INFO: Record<string, string> = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how the model should behave.',
samplers:
'The order in which samplers are applied, in a simplified way. Default is "edkypmxt": penalties->dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature:
'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range:
'Addon for the temperature sampler. The value added to the range of dynamic temperature, which adjusts probabilities by the entropy of tokens.',
dynatemp_exponent:
'Addon for the temperature sampler. Smooths out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p:
'Limits tokens to those that together have a cumulative probability of at least p',
min_p:
'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability:
'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold:
'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p:
'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty:
'Controls the repetition of token sequences in the generated text',
presence_penalty:
'Penalizes tokens based on whether they appear in the output or not.',
frequency_penalty:
'Penalizes tokens based on how often they appear in the output.',
dry_multiplier:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of tokens per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
export const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT)
.filter((e) => isNumeric(e[1]))
.map((e) => e[0]);
// list of themes supported by daisyui
export const THEMES = ['light', 'dark']
// make sure light & dark are always at the beginning
.concat(
Object.keys(daisyuiThemes).filter((t) => t !== 'light' && t !== 'dark')
);

View file

@ -0,0 +1,195 @@
import { useEffect, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { OpenInNewTab, XCloseButton } from '../utils/common';
import { CanvasType } from '../utils/types';
import { PlayIcon, StopIcon } from '@heroicons/react/24/outline';
import StorageUtils from '../utils/storage';
const canInterrupt = typeof SharedArrayBuffer === 'function';
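// (SharedArrayBuffer is only available when the page is cross-origin isolated, which is why the server sends the COEP/COOP headers with the web UI)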
// adapted from https://pyodide.org/en/stable/usage/webworker.html
const WORKER_CODE = `
importScripts("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.js");
let stdOutAndErr = [];
let pyodideReadyPromise = loadPyodide({
stdout: (data) => stdOutAndErr.push(data),
stderr: (data) => stdOutAndErr.push(data),
});
let alreadySetBuff = false;
self.onmessage = async (event) => {
stdOutAndErr = [];
// make sure loading is done
const pyodide = await pyodideReadyPromise;
const { id, python, context, interruptBuffer } = event.data;
if (interruptBuffer && !alreadySetBuff) {
pyodide.setInterruptBuffer(interruptBuffer);
alreadySetBuff = true;
}
// Now load any packages we need, run the code, and send the result back.
await pyodide.loadPackagesFromImports(python);
// make a Python dictionary with the data from content
const dict = pyodide.globals.get("dict");
const globals = dict(Object.entries(context));
try {
self.postMessage({ id, running: true });
// Execute the python code in this context
const result = pyodide.runPython(python, { globals });
self.postMessage({ result, id, stdOutAndErr });
} catch (error) {
self.postMessage({ error: error.message, id });
}
// reset the interrupt flag, if interruption is supported at all
if (interruptBuffer) {
interruptBuffer[0] = 0;
}
};
`;
let worker: Worker;
const interruptBuffer = canInterrupt
? new Uint8Array(new SharedArrayBuffer(1))
: null;
const startWorker = () => {
if (!worker) {
worker = new Worker(
URL.createObjectURL(new Blob([WORKER_CODE], { type: 'text/javascript' }))
);
}
};
if (StorageUtils.getConfig().pyIntepreterEnabled) {
startWorker();
}
const runCodeInWorker = (
pyCode: string,
callbackRunning: () => void
): {
donePromise: Promise<string>;
interrupt: () => void;
} => {
startWorker();
const id = Math.random() * 1e8;
const context = {};
if (interruptBuffer) {
interruptBuffer[0] = 0;
}
const donePromise = new Promise<string>((resolve) => {
worker.onmessage = (event) => {
const { error, stdOutAndErr, running } = event.data;
if (id !== event.data.id) return;
if (running) {
callbackRunning();
return;
} else if (error) {
resolve(error.toString());
} else {
resolve(stdOutAndErr.join('\n'));
}
};
worker.postMessage({ id, python: pyCode, context, interruptBuffer });
});
const interrupt = () => {
console.log('Interrupting...');
console.trace();
if (interruptBuffer) {
interruptBuffer[0] = 2;
}
};
return { donePromise, interrupt };
};
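// Illustrative usage (hypothetical snippet, not called by the component below):
//   const { donePromise, interrupt } = runCodeInWorker('print(1 + 1)', () => {});
//   // interrupt() writes 2 (SIGINT) into the shared buffer -> KeyboardInterrupt in Pyodide
//   const output = await donePromise; // '2'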
export default function CanvasPyInterpreter() {
const { canvasData, setCanvasData } = useAppContext();
const [code, setCode] = useState(canvasData?.content ?? ''); // copy to avoid direct mutation
const [running, setRunning] = useState(false);
const [output, setOutput] = useState('');
const [interruptFn, setInterruptFn] = useState<() => void>();
const [showStopBtn, setShowStopBtn] = useState(false);
const runCode = async (pycode: string) => {
interruptFn?.();
setRunning(true);
setOutput('Loading Pyodide...');
const { donePromise, interrupt } = runCodeInWorker(pycode, () => {
setOutput('Running...');
setShowStopBtn(canInterrupt);
});
setInterruptFn(() => interrupt);
const out = await donePromise;
setOutput(out);
setRunning(false);
setShowStopBtn(false);
};
// run code on mount
useEffect(() => {
setCode(canvasData?.content ?? '');
runCode(canvasData?.content ?? '');
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [canvasData?.content]);
if (canvasData?.type !== CanvasType.PY_INTERPRETER) {
return null;
}
return (
<div className="card bg-base-200 w-full h-full shadow-xl">
<div className="card-body">
<div className="flex justify-between items-center mb-4">
<span className="text-lg font-bold">Python Interpreter</span>
<XCloseButton
className="bg-base-100"
onClick={() => setCanvasData(null)}
/>
</div>
<div className="grid grid-rows-3 gap-4 h-full">
<textarea
className="textarea textarea-bordered w-full h-full font-mono"
value={code}
onChange={(e) => setCode(e.target.value)}
></textarea>
<div className="font-mono flex flex-col row-span-2">
<div className="flex items-center mb-2">
<button
className="btn btn-sm bg-base-100"
onClick={() => runCode(code)}
disabled={running}
>
<PlayIcon className="h-6 w-6" /> Run
</button>
{showStopBtn && (
<button
className="btn btn-sm bg-base-100 ml-2"
onClick={() => interruptFn?.()}
>
<StopIcon className="h-6 w-6" /> Stop
</button>
)}
<span className="grow text-right text-xs">
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/11762">
Report a bug
</OpenInNewTab>
</span>
</div>
<textarea
className="textarea textarea-bordered h-full dark-color"
value={output}
readOnly
></textarea>
</div>
</div>
</div>
</div>
);
}

View file

@ -0,0 +1,235 @@
import { useMemo, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { Message, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
interface SplitMessage {
content: PendingMessage['content'];
thought?: string;
isThinking?: boolean;
}
export default function ChatMessage({
msg,
id,
scrollToBottom,
isPending,
}: {
msg: Message | PendingMessage;
id?: string;
scrollToBottom: (requiresNearBottom: boolean) => void;
isPending?: boolean;
}) {
const { viewingConversation, replaceMessageAndGenerate, config } =
useAppContext();
const [editingContent, setEditingContent] = useState<string | null>(null);
const timings = useMemo(
() =>
msg.timings
? {
...msg.timings,
prompt_per_second:
(msg.timings.prompt_n / msg.timings.prompt_ms) * 1000,
predicted_per_second:
(msg.timings.predicted_n / msg.timings.predicted_ms) * 1000,
}
: null,
[msg.timings]
);
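// The server's timings payload carries raw counters (token counts and elapsed
// milliseconds); the conversion above scales them to tokens per second.
// A worked example:
//   prompt_n = 50 tokens processed in prompt_ms = 200 ms
//   prompt_per_second = (50 / 200) * 1000 = 250 t/s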
// for reasoning model, we split the message into content and thought
// TODO: implement this as remark/rehype plugin in the future
const { content, thought, isThinking }: SplitMessage = useMemo(() => {
if (msg.content === null || msg.role !== 'assistant') {
return { content: msg.content };
}
let actualContent = '';
let thought = '';
let isThinking = false;
// split on the first occurrence only; note that String.split(sep, 2) would
// silently drop everything after the second occurrence of the separator
const splitOnce = (s: string, sep: string): [string, string?] => {
const idx = s.indexOf(sep);
return idx === -1 ? [s] : [s.substring(0, idx), s.substring(idx + sep.length)];
};
let thinkSplit = splitOnce(msg.content, '<think>');
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = splitOnce(thinkSplit[1], '</think>');
thought += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = splitOnce(thinkSplit[1], '<think>');
actualContent += thinkSplit[0];
}
}
return { content: actualContent, thought, isThinking };
}, [msg]);
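// For instance, the splitter above maps a partially streamed reasoning
// message like this (a sketch; <think>/</think> is the plain-text convention
// emitted by DeepSeek-R1-style models):
//
//   while the model is still inside the think block:
//     '<think>step 1'
//       -> { content: '', thought: 'step 1', isThinking: true }
//   once the block is closed:
//     'hi <think>step 1</think> bye'
//       -> { content: 'hi  bye', thought: 'step 1', isThinking: false }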
if (!viewingConversation) return null;
const regenerate = async () => {
replaceMessageAndGenerate(viewingConversation.id, msg.id, undefined, () =>
scrollToBottom(true)
);
};
return (
<div className="group" id={id}>
<div
className={classNames({
chat: true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
})}
>
<div
className={classNames({
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
})}
>
{/* textarea for editing message */}
{editingContent !== null && (
<>
<textarea
dir="auto"
className="textarea textarea-bordered bg-base-100 text-base-content max-w-2xl w-[calc(90vw-8em)] h-24"
value={editingContent}
onChange={(e) => setEditingContent(e.target.value)}
></textarea>
<br />
<button
className="btn btn-ghost mt-2 mr-2"
onClick={() => setEditingContent(null)}
>
Cancel
</button>
<button
className="btn mt-2"
onClick={() =>
replaceMessageAndGenerate(
viewingConversation.id,
msg.id,
editingContent
)
}
>
Submit
</button>
</>
)}
{/* not editing content, render message */}
{editingContent === null && (
<>
{content === null ? (
<>
{/* show loading dots for pending message */}
<span className="loading loading-dots loading-md"></span>
</>
) : (
<>
{/* render message as markdown */}
<div dir="auto">
{thought && (
<details
className="collapse bg-base-200 collapse-arrow mb-4"
open={isThinking && config.showThoughtInProgress}
>
<summary className="collapse-title">
{isPending && isThinking ? (
<span>
<span
v-if="isGenerating"
className="loading loading-spinner loading-md mr-2"
style={{ verticalAlign: 'middle' }}
></span>
<b>Thinking</b>
</span>
) : (
<b>Thought Process</b>
)}
</summary>
<div className="collapse-content">
<MarkdownDisplay
content={thought}
isGenerating={isPending}
/>
</div>
</details>
)}
<MarkdownDisplay
content={content}
isGenerating={isPending}
/>
</div>
</>
)}
{/* render timings if enabled */}
{timings && config.showTokensPerSecond && (
<div className="dropdown dropdown-hover dropdown-top mt-2">
<div
tabIndex={0}
role="button"
className="cursor-pointer font-semibold text-sm opacity-60"
>
Speed: {timings.predicted_per_second.toFixed(1)} t/s
</div>
<div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b>
<br />- Tokens: {timings.prompt_n}
<br />- Time: {timings.prompt_ms} ms
<br />- Speed: {timings.prompt_per_second.toFixed(1)} t/s
<br />
<b>Generation</b>
<br />- Tokens: {timings.predicted_n}
<br />- Time: {timings.predicted_ms} ms
<br />- Speed: {timings.predicted_per_second.toFixed(1)} t/s
<br />
</div>
</div>
)}
</>
)}
</div>
</div>
{/* actions for each message */}
{msg.content !== null && (
<div
className={classNames({
'mx-4 mt-2 mb-2': true,
'text-right': msg.role === 'user',
})}
>
{/* user message */}
{msg.role === 'user' && (
<button
className="badge btn-mini show-on-hover"
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
>
Edit
</button>
)}
{/* assistant message */}
{msg.role === 'assistant' && (
<>
{!isPending && (
<button
className="badge btn-mini show-on-hover mr-2"
onClick={regenerate}
disabled={msg.content === null}
>
🔄 Regenerate
</button>
)}
<CopyButton
className="badge btn-mini show-on-hover mr-2"
content={msg.content}
/>
</>
)}
</div>
)}
</div>
);
}

View file

@ -0,0 +1,146 @@
import { useEffect, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import StorageUtils from '../utils/storage';
import { useNavigate } from 'react-router';
import ChatMessage from './ChatMessage';
import { CanvasType, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import CanvasPyInterpreter from './CanvasPyInterpreter';
export default function ChatScreen() {
const {
viewingConversation,
sendMessage,
isGenerating,
stopGenerating,
pendingMessages,
canvasData,
} = useAppContext();
const [inputMsg, setInputMsg] = useState('');
const navigate = useNavigate();
const currConvId = viewingConversation?.id ?? '';
const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId];
const scrollToBottom = (requiresNearBottom: boolean) => {
const mainScrollElem = document.getElementById('main-scroll');
if (!mainScrollElem) return;
const spaceToBottom =
mainScrollElem.scrollHeight -
mainScrollElem.scrollTop -
mainScrollElem.clientHeight;
if (!requiresNearBottom || spaceToBottom < 50) {
setTimeout(
() => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
1
);
}
};
// scroll to bottom when conversation changes
useEffect(() => {
scrollToBottom(false);
}, [viewingConversation?.id]);
const sendNewMessage = async () => {
if (inputMsg.trim().length === 0 || isGenerating(currConvId)) return;
const convId = viewingConversation?.id ?? StorageUtils.getNewConvId();
const lastInpMsg = inputMsg;
setInputMsg('');
if (!viewingConversation) {
// if user is creating a new conversation, redirect to the new conversation
navigate(`/chat/${convId}`);
}
scrollToBottom(false);
// auto scroll as message is being generated
const onChunk = () => scrollToBottom(true);
if (!(await sendMessage(convId, inputMsg, onChunk))) {
// restore the input message if failed
setInputMsg(lastInpMsg);
}
};
const hasCanvas = !!canvasData;
return (
<div
className={classNames({
'grid lg:gap-8 grow transition-[300ms]': true,
'grid-cols-[1fr_0fr] lg:grid-cols-[1fr_1fr]': hasCanvas, // adapted for mobile
'grid-cols-[1fr_0fr]': !hasCanvas,
})}
>
<div
className={classNames({
'flex flex-col w-full max-w-[900px] mx-auto': true,
'hidden lg:flex': hasCanvas, // adapted for mobile
flex: !hasCanvas,
})}
>
{/* chat messages */}
<div id="messages-list" className="grow">
<div className="mt-auto flex justify-center">
{/* placeholder to shift the message to the bottom */}
{viewingConversation ? '' : 'Send a message to start'}
</div>
{viewingConversation?.messages.map((msg) => (
<ChatMessage
key={msg.id}
msg={msg}
scrollToBottom={scrollToBottom}
/>
))}
{pendingMsg && (
<ChatMessage
msg={pendingMsg}
scrollToBottom={scrollToBottom}
isPending
id="pending-msg"
/>
)}
</div>
{/* chat input */}
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
className="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
value={inputMsg}
onChange={(e) => setInputMsg(e.target.value)}
onKeyDown={(e) => {
if (e.key === 'Enter' && e.shiftKey) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
></textarea>
{isGenerating(currConvId) ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId)}
>
Stop
</button>
) : (
<button
className="btn btn-primary ml-2"
onClick={sendNewMessage}
disabled={inputMsg.trim().length === 0}
>
Send
</button>
)}
</div>
</div>
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
{canvasData?.type === CanvasType.PY_INTERPRETER && (
<CanvasPyInterpreter />
)}
</div>
</div>
);
}

View file

@ -0,0 +1,176 @@
import { useEffect, useState } from 'react';
import StorageUtils from '../utils/storage';
import { useAppContext } from '../utils/app.context';
import { classNames } from '../utils/misc';
import daisyuiThemes from 'daisyui/src/theming/themes';
import { THEMES } from '../Config';
import { useNavigate } from 'react-router';
export default function Header() {
const navigate = useNavigate();
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
const { setShowSettings } = useAppContext();
const setTheme = (theme: string) => {
StorageUtils.setTheme(theme);
setSelectedTheme(theme);
};
useEffect(() => {
document.body.setAttribute('data-theme', selectedTheme);
document.body.setAttribute(
'data-color-scheme',
// @ts-expect-error daisyuiThemes complains about index type, but it should work
daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
);
}, [selectedTheme]);
const { isGenerating, viewingConversation } = useAppContext();
const isCurrConvGenerating = isGenerating(viewingConversation?.id ?? '');
const removeConversation = () => {
if (isCurrConvGenerating || !viewingConversation) return;
const convId = viewingConversation.id;
if (window.confirm('Are you sure you want to delete this conversation?')) {
StorageUtils.remove(convId);
navigate('/');
}
};
const downloadConversation = () => {
if (isCurrConvGenerating || !viewingConversation) return;
const convId = viewingConversation.id;
const conversationJson = JSON.stringify(viewingConversation, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
};
return (
<div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
{/* open sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-list"
viewBox="0 0 16 16"
>
<path
fillRule="evenodd"
d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
/>
</svg>
</label>
<div className="grow text-2xl font-bold ml-2">llama.cpp</div>
{/* action buttons (top right) */}
<div className="flex items-center">
<div className="dropdown dropdown-end">
{/* "..." button */}
<button
tabIndex={0}
role="button"
className="btn m-1"
disabled={isCurrConvGenerating}
>
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-three-dots-vertical"
viewBox="0 0 16 16"
>
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
</svg>
</button>
{/* dropdown menu */}
<ul
tabIndex={0}
className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
>
<li onClick={downloadConversation}>
<a>Download</a>
</li>
<li className="text-error" onClick={removeConversation}>
<a>Delete</a>
</li>
</ul>
</div>
<div className="tooltip tooltip-bottom" data-tip="Settings">
<button className="btn" onClick={() => setShowSettings(true)}>
{/* settings button */}
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-gear"
viewBox="0 0 16 16"
>
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
</svg>
</button>
</div>
{/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
<div className="tooltip tooltip-bottom" data-tip="Themes">
<div className="dropdown dropdown-end dropdown-bottom">
<div tabIndex={0} role="button" className="btn m-1">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-palette2"
viewBox="0 0 16 16"
>
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
</svg>
</div>
<ul
tabIndex={0}
className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
>
<li>
<button
className={classNames({
'btn btn-sm btn-block btn-ghost justify-start': true,
'btn-active': selectedTheme === 'auto',
})}
onClick={() => setTheme('auto')}
>
auto
</button>
</li>
{THEMES.map((theme) => (
<li key={theme}>
<input
type="radio"
name="theme-dropdown"
className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
aria-label={theme}
value={theme}
checked={selectedTheme === theme}
onChange={(e) => e.target.checked && setTheme(theme)}
/>
</li>
))}
</ul>
</div>
</div>
</div>
</div>
);
}

View file

@ -0,0 +1,310 @@
import React, { useMemo, useState } from 'react';
import Markdown, { ExtraProps } from 'react-markdown';
import remarkGfm from 'remark-gfm';
import rehypeHighlight from 'rehype-highlight';
import rehypeKatex from 'rehype-katex';
import remarkMath from 'remark-math';
import remarkBreaks from 'remark-breaks';
import 'katex/dist/katex.min.css';
import { classNames, copyStr } from '../utils/misc';
import { ElementContent, Root } from 'hast';
import { visit } from 'unist-util-visit';
import { useAppContext } from '../utils/app.context';
import { CanvasType } from '../utils/types';
export default function MarkdownDisplay({
content,
isGenerating,
}: {
content: string;
isGenerating?: boolean;
}) {
const preprocessedContent = useMemo(
() => preprocessLaTeX(content),
[content]
);
return (
<Markdown
remarkPlugins={[remarkGfm, remarkMath, remarkBreaks]}
rehypePlugins={[rehypeHighlight, rehypeKatex, rehypeCustomCopyButton]}
components={{
button: (props) => (
<CodeBlockButtons
{...props}
isGenerating={isGenerating}
origContent={preprocessedContent}
/>
),
// note: do not use "pre", "p" or other basic html elements here, as they cause the node to re-render while the message is being generated (this seems to be a bug in react-markdown; not sure how to fix it)
}}
>
{preprocessedContent}
</Markdown>
);
}
const CodeBlockButtons: React.ElementType<
React.ClassAttributes<HTMLButtonElement> &
React.HTMLAttributes<HTMLButtonElement> &
ExtraProps & { origContent: string; isGenerating?: boolean }
> = ({ node, origContent, isGenerating }) => {
const { config } = useAppContext();
const startOffset = node?.position?.start.offset ?? 0;
const endOffset = node?.position?.end.offset ?? 0;
const copiedContent = useMemo(
() =>
origContent
.substring(startOffset, endOffset)
.replace(/^```[^\n]+\n/g, '')
.replace(/```$/g, ''),
[origContent, startOffset, endOffset]
);
const codeLanguage = useMemo(
() =>
origContent
.substring(startOffset, startOffset + 10)
.match(/^```([^\n]+)\n/)?.[1] ?? '',
[origContent, startOffset]
);
const canRunCode =
!isGenerating &&
config.pyIntepreterEnabled &&
codeLanguage.startsWith('py');
return (
<div
className={classNames({
'text-right sticky top-[7em] mb-2 mr-2 h-0': true,
'display-none': !node?.position,
})}
>
<CopyButton className="badge btn-mini" content={copiedContent} />
{canRunCode && (
<RunPyCodeButton
className="badge btn-mini ml-2"
content={copiedContent}
/>
)}
</div>
);
};
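// The offsets above are hast source positions pointing back into the original
// markdown string; a worked example (values illustrative):
//
//   const origContent = 'Intro\n```py\nprint(1)\n```\n';
//   // the injected <button> inherits the <pre>'s position, spanning the fence:
//   const startOffset = 6, endOffset = 24;
//   origContent.substring(6, 24)      // '```py\nprint(1)\n```'
//     .replace(/^```[^\n]+\n/g, '')   // strip opening fence + language tag
//     .replace(/```$/g, '');          // strip closing fence -> 'print(1)\n'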
export const CopyButton = ({
content,
className,
}: {
content: string;
className?: string;
}) => {
const [copied, setCopied] = useState(false);
return (
<button
className={className}
onClick={() => {
copyStr(content);
setCopied(true);
}}
onMouseLeave={() => setCopied(false)}
>
{copied ? 'Copied!' : '📋 Copy'}
</button>
);
};
export const RunPyCodeButton = ({
content,
className,
}: {
content: string;
className?: string;
}) => {
const { setCanvasData } = useAppContext();
return (
<>
<button
className={className}
onClick={() =>
setCanvasData({
type: CanvasType.PY_INTERPRETER,
content,
})
}
>
Run
</button>
</>
);
};
/**
* This injects the "button" element before each "pre" element.
* The actual button will be replaced with a react component in the MarkdownDisplay.
* We don't replace "pre" node directly because it will cause the node to re-render, which causes this bug: https://github.com/ggerganov/llama.cpp/issues/9608
*/
function rehypeCustomCopyButton() {
return function (tree: Root) {
visit(tree, 'element', function (node) {
if (node.tagName === 'pre' && !node.properties.visited) {
const preNode = { ...node };
// replace current node
preNode.properties.visited = 'true';
node.tagName = 'div';
node.properties = {};
// add node for button
const btnNode: ElementContent = {
type: 'element',
tagName: 'button',
properties: {},
children: [],
position: node.position,
};
node.children = [btnNode, preNode];
}
});
};
}
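// Sketched as pseudo-markup, the transform above turns each top-level fence
// into the following (the button carries the pre's source position, which
// CodeBlockButtons later reads to recover the raw code):
//
//   before: <pre><code class="hljs">...</code></pre>
//   after:  <div>
//             <button/>  <!-- swapped for <CodeBlockButtons> via the components map -->
//             <pre visited="true"><code class="hljs">...</code></pre>
//           </div>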
/**
* The part below is copied and adapted from:
* https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
* (MIT License)
*/
// Regex to check if the processed content contains any potential LaTeX patterns
const containsLatexRegex =
/\\\(.*?\\\)|\\\[.*?\\\]|\$.*?\$|\\begin\{equation\}.*?\\end\{equation\}/;
// Regex for inline and block LaTeX expressions
const inlineLatex = new RegExp(/\\\((.+?)\\\)/, 'g');
const blockLatex = new RegExp(/\\\[(.*?[^\\])\\\]/, 'gs');
// Function to restore code blocks
const restoreCodeBlocks = (content: string, codeBlocks: string[]) => {
return content.replace(
/<<CODE_BLOCK_(\d+)>>/g,
(_, index) => codeBlocks[index]
);
};
// Regex to identify code blocks and inline code
const codeBlockRegex = /(```[\s\S]*?```|`.*?`)/g;
export const processLaTeX = (_content: string) => {
let content = _content;
// Temporarily replace code blocks and inline code with placeholders
const codeBlocks: string[] = [];
let index = 0;
content = content.replace(codeBlockRegex, (match) => {
codeBlocks[index] = match;
return `<<CODE_BLOCK_${index++}>>`;
});
// Escape dollar signs followed by a digit or space and digit
let processedContent = content.replace(/(\$)(?=\s?\d)/g, '\\$');
// If no LaTeX patterns are found, restore code blocks and return the processed content
if (!containsLatexRegex.test(processedContent)) {
return restoreCodeBlocks(processedContent, codeBlocks);
}
// Convert LaTeX expressions to a markdown compatible format
processedContent = processedContent
.replace(inlineLatex, (_: string, equation: string) => `$${equation}$`) // Convert inline LaTeX
.replace(blockLatex, (_: string, equation: string) => `$$${equation}$$`); // Convert block LaTeX
// Restore code blocks
return restoreCodeBlocks(processedContent, codeBlocks);
};
/**
* Preprocesses LaTeX content by replacing delimiters and escaping certain characters.
*
* @param content The input string containing LaTeX expressions.
* @returns The processed string with replaced delimiters and escaped characters.
*/
export function preprocessLaTeX(content: string): string {
// Step 1: Protect code blocks
const codeBlocks: string[] = [];
content = content.replace(/(```[\s\S]*?```|`[^`\n]+`)/g, (_, code) => {
codeBlocks.push(code);
return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
});
// Step 2: Protect existing LaTeX expressions
const latexExpressions: string[] = [];
// Protect block math ($$...$$), \[...\], and \(...\) as before.
content = content.replace(
/(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g,
(match) => {
latexExpressions.push(match);
return `<<LATEX_${latexExpressions.length - 1}>>`;
}
);
// Protect inline math ($...$) only if it does NOT match a currency pattern.
// We assume a currency pattern is one where the inner content is purely numeric (with optional decimals).
content = content.replace(/\$([^$]+)\$/g, (match, inner) => {
if (/^\s*\d+(?:\.\d+)?\s*$/.test(inner)) {
// This looks like a currency value (e.g. "$123" or "$12.34"),
// so don't protect it.
return match;
} else {
// Otherwise, treat it as a LaTeX expression.
latexExpressions.push(match);
return `<<LATEX_${latexExpressions.length - 1}>>`;
}
});
// Step 3: Escape dollar signs that are likely currency indicators.
// (Now that inline math is protected, this will only escape dollars not already protected)
content = content.replace(/\$(?=\d)/g, '\\$');
// Step 4: Restore LaTeX expressions
content = content.replace(
/<<LATEX_(\d+)>>/g,
(_, index) => latexExpressions[parseInt(index)]
);
// Step 5: Restore code blocks
content = content.replace(
/<<CODE_BLOCK_(\d+)>>/g,
(_, index) => codeBlocks[parseInt(index)]
);
// Step 6: Apply additional escaping functions
content = escapeBrackets(content);
content = escapeMhchem(content);
return content;
}
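// A few illustrative input/output pairs for the steps above (results inferred
// from the regexes; shown as raw strings):
//
//   preprocessLaTeX('It costs $5 today');         // 'It costs \$5 today' — currency escaped
//   preprocessLaTeX('Euler: \\(e^{i\\pi}=-1\\)'); // 'Euler: $e^{i\pi}=-1$' — \(..\) converted
//   preprocessLaTeX('`$HOME` is a path');         // unchanged — inline code is protected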
export function escapeBrackets(text: string): string {
const pattern =
/(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;
return text.replace(
pattern,
(
match: string,
codeBlock: string | undefined,
squareBracket: string | undefined,
roundBracket: string | undefined
): string => {
if (codeBlock != null) {
return codeBlock;
} else if (squareBracket != null) {
return `$$${squareBracket}$$`;
} else if (roundBracket != null) {
return `$${roundBracket}$`;
}
return match;
}
);
}
export function escapeMhchem(text: string) {
return text.replaceAll('$\\ce{', '$\\\\ce{').replaceAll('$\\pu{', '$\\\\pu{');
}

View file

@ -0,0 +1,536 @@
import { useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { CONFIG_DEFAULT, CONFIG_INFO } from '../Config';
import { isDev } from '../Config';
import StorageUtils from '../utils/storage';
import { classNames, isBoolean, isNumeric, isString } from '../utils/misc';
import {
BeakerIcon,
ChatBubbleOvalLeftEllipsisIcon,
Cog6ToothIcon,
FunnelIcon,
HandRaisedIcon,
SquaresPlusIcon,
} from '@heroicons/react/24/outline';
import { OpenInNewTab } from '../utils/common';
type SettKey = keyof typeof CONFIG_DEFAULT;
const BASIC_KEYS: SettKey[] = [
'temperature',
'top_k',
'top_p',
'min_p',
'max_tokens',
];
const SAMPLER_KEYS: SettKey[] = [
'dynatemp_range',
'dynatemp_exponent',
'typical_p',
'xtc_probability',
'xtc_threshold',
];
const PENALTY_KEYS: SettKey[] = [
'repeat_last_n',
'repeat_penalty',
'presence_penalty',
'frequency_penalty',
'dry_multiplier',
'dry_base',
'dry_allowed_length',
'dry_penalty_last_n',
];
enum SettingInputType {
SHORT_INPUT,
LONG_INPUT,
CHECKBOX,
CUSTOM,
}
interface SettingFieldInput {
type: Exclude<SettingInputType, SettingInputType.CUSTOM>;
label: string | React.ReactElement;
help?: string | React.ReactElement;
key: SettKey;
}
interface SettingFieldCustom {
type: SettingInputType.CUSTOM;
key: SettKey;
component:
| string
| React.FC<{
value: string | boolean | number;
onChange: (value: string) => void;
}>;
}
interface SettingSection {
title: React.ReactElement;
fields: (SettingFieldInput | SettingFieldCustom)[];
}
const ICON_CLASSNAME = 'w-4 h-4 mr-1 inline';
const SETTING_SECTIONS: SettingSection[] = [
{
title: (
<>
<Cog6ToothIcon className={ICON_CLASSNAME} />
General
</>
),
fields: [
{
type: SettingInputType.SHORT_INPUT,
label: 'API Key',
key: 'apiKey',
},
{
type: SettingInputType.LONG_INPUT,
label: 'System Message (will be disabled if left empty)',
key: 'systemMessage',
},
...BASIC_KEYS.map(
(key) =>
({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
}) as SettingFieldInput
),
],
},
{
title: (
<>
<FunnelIcon className={ICON_CLASSNAME} />
Samplers
</>
),
fields: [
{
type: SettingInputType.SHORT_INPUT,
label: 'Samplers queue',
key: 'samplers',
},
...SAMPLER_KEYS.map(
(key) =>
({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
}) as SettingFieldInput
),
],
},
{
title: (
<>
<HandRaisedIcon className={ICON_CLASSNAME} />
Penalties
</>
),
fields: PENALTY_KEYS.map((key) => ({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
})),
},
{
title: (
<>
<ChatBubbleOvalLeftEllipsisIcon className={ICON_CLASSNAME} />
Reasoning
</>
),
fields: [
{
type: SettingInputType.CHECKBOX,
label: 'Expand thought process by default when generating messages',
key: 'showThoughtInProgress',
},
{
type: SettingInputType.CHECKBOX,
label:
'Exclude thought process when sending requests to the API (recommended for DeepSeek-R1)',
key: 'excludeThoughtOnReq',
},
],
},
{
title: (
<>
<SquaresPlusIcon className={ICON_CLASSNAME} />
Advanced
</>
),
fields: [
{
type: SettingInputType.CUSTOM,
key: 'custom', // dummy key, won't be used
component: () => {
const debugImportDemoConv = async () => {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
};
return (
<button className="btn" onClick={debugImportDemoConv}>
(debug) Import demo conversation
</button>
);
},
},
{
type: SettingInputType.CHECKBOX,
label: 'Show tokens per second',
key: 'showTokensPerSecond',
},
{
type: SettingInputType.LONG_INPUT,
label: (
<>
Custom JSON config (For more info, refer to{' '}
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md">
server documentation
</OpenInNewTab>
)
</>
),
key: 'custom',
},
],
},
{
title: (
<>
<BeakerIcon className={ICON_CLASSNAME} />
Experimental
</>
),
fields: [
{
type: SettingInputType.CUSTOM,
key: 'custom', // dummy key, won't be used
component: () => (
<>
<p className="mb-8">
Experimental features are not guaranteed to work correctly.
<br />
<br />
If you encounter any problems, create a{' '}
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/new?template=019-bug-misc.yml">
Bug (misc.)
</OpenInNewTab>{' '}
report on Github. Please also specify <b>webui/experimental</b> on
the report title and include screenshots.
<br />
<br />
Some features may require packages downloaded from a CDN, so they
need an internet connection.
</p>
</>
),
},
{
type: SettingInputType.CHECKBOX,
label: (
<>
<b>Enable Python interpreter</b>
<br />
<small className="text-xs">
This feature uses{' '}
<OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
downloaded from a CDN. To use this feature, ask the LLM to generate
Python code inside a markdown code block. You will see a "Run"
button on the code block, near the "Copy" button.
</small>
</>
),
key: 'pyIntepreterEnabled',
},
],
},
];
export default function SettingDialog({
show,
onClose,
}: {
show: boolean;
onClose: () => void;
}) {
const { config, saveConfig } = useAppContext();
const [sectionIdx, setSectionIdx] = useState(0);
// clone the config object to prevent direct mutation
const [localConfig, setLocalConfig] = useState<typeof CONFIG_DEFAULT>(
JSON.parse(JSON.stringify(config))
);
const resetConfig = () => {
if (window.confirm('Are you sure you want to reset all settings?')) {
setLocalConfig(CONFIG_DEFAULT);
}
};
const handleSave = () => {
// copy the local config to prevent direct mutation
const newConfig: typeof CONFIG_DEFAULT = JSON.parse(
JSON.stringify(localConfig)
);
// validate the config
for (const key in newConfig) {
const value = newConfig[key as SettKey];
const mustBeBoolean = isBoolean(CONFIG_DEFAULT[key as SettKey]);
const mustBeString = isString(CONFIG_DEFAULT[key as SettKey]);
const mustBeNumeric = isNumeric(CONFIG_DEFAULT[key as SettKey]);
if (mustBeString) {
if (!isString(value)) {
alert(`Value for ${key} must be string`);
return;
}
} else if (mustBeNumeric) {
const trimmedValue = value.toString().trim();
const numVal = Number(trimmedValue);
if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
alert(`Value for ${key} must be numeric`);
return;
}
// force conversion to number
// @ts-expect-error this is safe
newConfig[key] = numVal;
} else if (mustBeBoolean) {
if (!isBoolean(value)) {
alert(`Value for ${key} must be boolean`);
return;
}
} else {
console.error(`Unknown default type for key ${key}`);
}
}
if (isDev) console.log('Saving config', newConfig);
saveConfig(newConfig);
onClose();
};
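// For reference, the numeric branch above accepts anything Number() can parse
// after trimming (an illustrative sketch of the same coercion, not used here):
//
//   Number(' 0.8 '.trim());  // 0.8
//   Number('1e-2');          // 0.01 — scientific notation passes
//   Number('');              // 0, which is why empty strings are rejected
//                            // separately via trimmedValue.length === 0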
const onChange = (key: SettKey) => (value: string | boolean) => {
// note: we do not validate here because the user may still be typing an incomplete value
setLocalConfig({ ...localConfig, [key]: value });
};
return (
<dialog className={classNames({ modal: true, 'modal-open': show })}>
<div className="modal-box w-11/12 max-w-3xl">
<h3 className="text-lg font-bold mb-6">Settings</h3>
<div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
{/* Left panel, showing sections - Desktop version */}
<div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
{SETTING_SECTIONS.map((section, idx) => (
<div
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal w-44 mb-1': true,
'btn-active': sectionIdx === idx,
})}
onClick={() => setSectionIdx(idx)}
dir="auto"
>
{section.title}
</div>
))}
</div>
{/* Left panel, showing sections - Mobile version */}
<div className="md:hidden flex flex-row gap-2 mb-4">
<details className="dropdown">
<summary className="btn bt-sm w-full m-1">
{SETTING_SECTIONS[sectionIdx].title}
</summary>
<ul className="menu dropdown-content bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
{SETTING_SECTIONS.map((section, idx) => (
<div
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal': true,
'btn-active': sectionIdx === idx,
})}
onClick={() => setSectionIdx(idx)}
dir="auto"
>
{section.title}
</div>
))}
</ul>
</details>
</div>
{/* Right panel, showing setting fields */}
<div className="grow overflow-y-auto px-4">
{SETTING_SECTIONS[sectionIdx].fields.map((field, idx) => {
const key = `${sectionIdx}-${idx}`;
if (field.type === SettingInputType.SHORT_INPUT) {
return (
<SettingsModalShortInput
key={key}
configKey={field.key}
value={localConfig[field.key]}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.LONG_INPUT) {
return (
<SettingsModalLongInput
key={key}
configKey={field.key}
value={localConfig[field.key].toString()}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.CHECKBOX) {
return (
<SettingsModalCheckbox
key={key}
configKey={field.key}
value={!!localConfig[field.key]}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.CUSTOM) {
return (
<div key={key} className="mb-2">
{typeof field.component === 'string'
? field.component
: field.component({
value: localConfig[field.key],
onChange: onChange(field.key),
})}
</div>
);
}
})}
<p className="opacity-40 mb-6 text-sm mt-8">
Settings are saved in the browser's localStorage
</p>
</div>
</div>
<div className="modal-action">
<button className="btn" onClick={resetConfig}>
Reset to default
</button>
<button className="btn" onClick={onClose}>
Close
</button>
<button className="btn btn-primary" onClick={handleSave}>
Save
</button>
</div>
</div>
</dialog>
);
}
function SettingsModalLongInput({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
value: string;
onChange: (value: string) => void;
label?: string;
}) {
return (
<label className="form-control mb-2">
<div className="label inline">{label || configKey}</div>
<textarea
className="textarea textarea-bordered h-24"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
/>
</label>
);
}
function SettingsModalShortInput({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
value: any;
onChange: (value: string) => void;
label?: string;
}) {
const helpMsg = CONFIG_INFO[configKey];
return (
<>
{/* on mobile, we simply show the help message here */}
{helpMsg && (
<div className="block md:hidden mb-1">
<b>{label || configKey}</b>
<br />
<p className="text-xs">{helpMsg}</p>
</div>
)}
<label className="input input-bordered join-item grow flex items-center gap-2 mb-2">
<div className="dropdown dropdown-hover">
<div tabIndex={0} role="button" className="font-bold hidden md:block">
{label || configKey}
</div>
{helpMsg && (
<div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{helpMsg}
</div>
)}
</div>
<input
type="text"
className="grow"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
/>
</label>
</>
);
}
function SettingsModalCheckbox({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
value: boolean;
onChange: (value: boolean) => void;
label: string;
}) {
return (
<div className="flex flex-row items-center mb-2">
<input
type="checkbox"
className="toggle"
checked={value}
onChange={(e) => onChange(e.target.checked)}
/>
<span className="ml-4">{label || configKey}</span>
</div>
);
}

View file

@ -0,0 +1,95 @@
import { useEffect, useMemo, useState } from 'react';
import { classNames } from '../utils/misc';
import { Conversation } from '../utils/types';
import StorageUtils from '../utils/storage';
import { useNavigate, useParams } from 'react-router';
export default function Sidebar() {
const params = useParams();
const navigate = useNavigate();
const currConv = useMemo(
() => StorageUtils.getOneConversation(params.convId ?? ''),
[params.convId]
);
const [conversations, setConversations] = useState<Conversation[]>([]);
useEffect(() => {
const handleConversationChange = () => {
setConversations(StorageUtils.getAllConversations());
};
StorageUtils.onConversationChanged(handleConversationChange);
handleConversationChange();
return () => {
StorageUtils.offConversationChanged(handleConversationChange);
};
}, []);
return (
<>
<input
id="toggle-drawer"
type="checkbox"
className="drawer-toggle"
defaultChecked
/>
<div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label
htmlFor="toggle-drawer"
aria-label="close sidebar"
className="drawer-overlay"
></label>
<div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div className="flex flex-row items-center justify-between mb-4 mt-4">
<h2 className="font-bold ml-4">Conversations</h2>
{/* close sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-arrow-bar-left"
viewBox="0 0 16 16"
>
<path
fillRule="evenodd"
d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
/>
</svg>
</label>
</div>
{/* list of conversations */}
<div
className={classNames({
'btn btn-ghost justify-start': true,
'btn-active': !currConv,
})}
onClick={() => navigate('/')}
>
+ New conversation
</div>
{conversations.map((conv) => (
<div
key={conv.id}
className={classNames({
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === currConv?.id,
})}
onClick={() => navigate(`/chat/${conv.id}`)}
dir="auto"
>
<span className="truncate">{conv.messages[0].content}</span>
</div>
))}
<div className="text-center text-xs opacity-40 mt-auto mx-4">
Conversations are saved to the browser's localStorage
</div>
</div>
</div>
</>
);
}

View file

@ -1,60 +0,0 @@
import hljs from 'highlight.js/lib/core';
// only import commonly used languages to reduce bundle size
import python from 'highlight.js/lib/languages/python';
import javascript from 'highlight.js/lib/languages/javascript';
import json from 'highlight.js/lib/languages/json';
import bash from 'highlight.js/lib/languages/bash';
import yaml from 'highlight.js/lib/languages/yaml';
import markdown from 'highlight.js/lib/languages/markdown';
import scss from 'highlight.js/lib/languages/scss';
import xml from 'highlight.js/lib/languages/xml';
import ruby from 'highlight.js/lib/languages/ruby';
import go from 'highlight.js/lib/languages/go';
import java from 'highlight.js/lib/languages/java';
import rust from 'highlight.js/lib/languages/rust';
import scala from 'highlight.js/lib/languages/scala';
import cpp from 'highlight.js/lib/languages/cpp';
import csharp from 'highlight.js/lib/languages/csharp';
import swift from 'highlight.js/lib/languages/swift';
import dart from 'highlight.js/lib/languages/dart';
import elixir from 'highlight.js/lib/languages/elixir';
import kotlin from 'highlight.js/lib/languages/kotlin';
import lua from 'highlight.js/lib/languages/lua';
import php from 'highlight.js/lib/languages/php';
import latex from 'highlight.js/lib/languages/latex';
hljs.registerLanguage('python', python);
hljs.registerLanguage('javascript', javascript);
hljs.registerLanguage('json', json);
hljs.registerLanguage('yaml', yaml);
hljs.registerLanguage('markdown', markdown);
hljs.registerLanguage('xml', xml);
hljs.registerLanguage('ruby', ruby);
hljs.registerLanguage('go', go);
hljs.registerLanguage('java', java);
hljs.registerLanguage('rust', rust);
hljs.registerLanguage('scala', scala);
hljs.registerLanguage('csharp', csharp);
hljs.registerLanguage('swift', swift);
hljs.registerLanguage('dart', dart);
hljs.registerLanguage('elixir', elixir);
hljs.registerLanguage('kotlin', kotlin);
hljs.registerLanguage('lua', lua);
hljs.registerLanguage('php', php);
hljs.registerLanguage('latex', latex);
// reuse some languages to further reduce bundle size
hljs.registerLanguage('shell', bash);
hljs.registerLanguage('bash', bash);
hljs.registerLanguage('sh', bash);
hljs.registerLanguage('css', scss);
hljs.registerLanguage('scss', scss);
hljs.registerLanguage('c', cpp);
hljs.registerLanguage('cpp', cpp);
export default hljs;

View file

@ -1,15 +1,28 @@
@use "sass:meta"; @use 'sass:meta';
@tailwind base; @tailwind base;
@tailwind components; @tailwind components;
@tailwind utilities; @tailwind utilities;
.markdown { .markdown {
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; } h1,
h2,
h3,
h4,
h5,
h6,
ul,
ol,
li {
all: revert;
}
pre { pre {
@apply whitespace-pre-wrap rounded-lg p-2; @apply whitespace-pre-wrap rounded-lg p-2;
border: 1px solid currentColor; border: 1px solid currentColor;
} }
p {
@apply mb-2;
}
/* TODO: fix markdown table */ /* TODO: fix markdown table */
} }
@ -19,7 +32,9 @@
.btn-mini { .btn-mini {
@apply cursor-pointer hover:shadow-md; @apply cursor-pointer hover:shadow-md;
} }
.chat-screen { max-width: 900px; } .chat-screen {
max-width: 900px;
}
.chat-bubble-base-300 { .chat-bubble-base-300 {
--tw-bg-opacity: 1; --tw-bg-opacity: 1;
@ -30,6 +45,9 @@
/* Highlight.js */ /* Highlight.js */
[data-color-scheme='light'] { [data-color-scheme='light'] {
@include meta.load-css('highlight.js/styles/stackoverflow-light'); @include meta.load-css('highlight.js/styles/stackoverflow-light');
.dark-color {
@apply bg-base-content text-base-100;
}
} }
[data-color-scheme='dark'] { [data-color-scheme='dark'] {
@include meta.load-css('highlight.js/styles/stackoverflow-dark'); @include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -37,6 +55,9 @@
[data-color-scheme='auto'] { [data-color-scheme='auto'] {
@media (prefers-color-scheme: light) { @media (prefers-color-scheme: light) {
@include meta.load-css('highlight.js/styles/stackoverflow-light'); @include meta.load-css('highlight.js/styles/stackoverflow-light');
.dark-color {
@apply bg-base-content text-base-100;
}
} }
@media (prefers-color-scheme: dark) { @media (prefers-color-scheme: dark) {
@include meta.load-css('highlight.js/styles/stackoverflow-dark'); @include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -46,3 +67,7 @@
background: transparent !important; background: transparent !important;
padding: 0.5em !important; padding: 0.5em !important;
} }
.katex-display {
margin: 0 0 !important;
}

View file

@ -1,66 +0,0 @@
import katex from 'katex';
// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
// MIT license
const defaultOptions = {
delimiters: [
{ left: '\\[', right: '\\]', display: true },
{ left: '\\(', right: '\\)', display: false },
],
};
export function renderLatexHTML(content, display = false) {
return katex.renderToString(content, {
throwOnError: false,
output: 'mathml',
displayMode: display,
});
}
function escapedBracketRule(options) {
return (state, silent) => {
const max = state.posMax;
const start = state.pos;
for (const { left, right, display } of options.delimiters) {
// Check if it starts with the left delimiter
if (!state.src.slice(start).startsWith(left)) continue;
// Skip the length of the left delimiter
let pos = start + left.length;
// Find the matching right delimiter
while (pos < max) {
if (state.src.slice(pos).startsWith(right)) {
break;
}
pos++;
}
// No matching right delimiter found, skip to the next match
if (pos >= max) continue;
// If not in silent mode, convert LaTeX formula to MathML
if (!silent) {
const content = state.src.slice(start + left.length, pos);
try {
const renderedContent = renderLatexHTML(content, display);
const token = state.push('html_inline', '', 0);
token.content = renderedContent;
} catch (e) {
console.error(e);
}
}
// Update position, skip the length of the right delimiter
state.pos = pos + right.length;
return true;
}
}
}
export default function (md, options = defaultOptions) {
md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
}

View file

@ -1,704 +0,0 @@
import './styles.scss';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';
// math formula rendering
import 'katex/dist/katex.min.css';
import markdownItKatexGpt from './katex-gpt';
import markdownItKatexNormal from '@vscode/markdown-it-katex';
// code highlighting
import hljs from './highlight-config';
import daisyuiThemes from 'daisyui/src/theming/themes';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
const isDev = import.meta.env.MODE === 'development';
// types
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
// utility functions
const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (textToCopy) => {
// Navigator clipboard api needs a secure context (https)
if (navigator.clipboard && window.isSecureContext) {
navigator.clipboard.writeText(textToCopy);
} else {
// Use the 'out of viewport hidden text area' trick
const textArea = document.createElement('textarea');
textArea.value = textToCopy;
// Move textarea out of the viewport so it's not visible
textArea.style.position = 'absolute';
textArea.style.left = '-999999px';
document.body.prepend(textArea);
textArea.select();
document.execCommand('copy');
}
};
// constants
const BASE_URL = isDev
? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
: (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
console.log({ BASE_URL });
const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
};
const CONFIG_INFO = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
frequency_penalty: 'Limits tokens based on how often they appear in the output.',
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of tokens per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
const THEMES = ['light', 'dark']
// make sure light & dark are always at the beginning
.concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
// markdown support
const VueMarkdown = defineComponent(
(props) => {
const md = shallowRef(new MarkdownIt({
breaks: true,
highlight: function (str, lang) { // Add highlight.js
if (lang && hljs.getLanguage(lang)) {
try {
return '<pre dir="auto"><code class="hljs">' +
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
'</code></pre>';
} catch (__) {}
}
return '<pre dir="auto"><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
}
}));
// support latex with double dollar sign and square brackets
md.value.use(markdownItKatexGpt, {
delimiters: [
{ left: '\\[', right: '\\]', display: true },
{ left: '\\(', right: '\\)', display: false },
{ left: '$$', right: '$$', display: false },
// do not add single dollar sign here, otherwise it will be confused with the dollar sign used for money
],
throwOnError: false,
});
// support latex with single dollar sign
md.value.use(markdownItKatexNormal, { throwOnError: false });
// add copy button to code blocks
const origFenceRenderer = md.value.renderer.rules.fence;
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
const content = tokens[idx].content;
const origRendered = origFenceRenderer(tokens, idx, ...args);
return `<div class="relative my-4">
<div class="text-right sticky top-4 mb-2 mr-2 h-0">
<button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
</div>
${origRendered}
</div>`;
};
window.copyStr = copyStr;
const content = computed(() => md.value.render(props.source));
return () => h('div', { innerHTML: content.value });
},
{ props: ['source'] }
);
// input field to be used by settings modal
const SettingsModalShortInput = defineComponent({
template: document.getElementById('settings-modal-short-input').innerHTML,
props: {
label: { type: String, required: false },
configKey: String,
configDefault: Object,
configInfo: Object,
modelValue: [Object, String, Number],
},
});
// message bubble component
const MessageBubble = defineComponent({
components: {
VueMarkdown
},
template: document.getElementById('message-bubble').innerHTML,
props: {
config: Object,
msg: Object,
isGenerating: Boolean,
showThoughtInProgress: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
data() {
return {
editingContent: null,
};
},
computed: {
timings() {
if (!this.msg.timings) return null;
return {
...this.msg.timings,
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
},
splitMsgContent() {
const content = this.msg.content;
if (this.msg.role !== 'assistant') {
return { content };
}
let actualContent = '';
let cot = '';
let isThinking = false;
let thinkSplit = content.split('<think>', 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
cot += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, cot, isThinking };
},
},
methods: {
copyMsg() {
copyStr(this.msg.content);
},
editMsg() {
this.editUserMsgAndRegenerate({
...this.msg,
content: this.editingContent,
});
this.editingContent = null;
},
},
});
// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
/**
* manage conversations
* @returns {Array<Conversation>}
*/
getAllConversations() {
const res = [];
for (const key in localStorage) {
if (key.startsWith('conv-')) {
res.push(JSON.parse(localStorage.getItem(key)));
}
}
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
/**
* can return null if convId does not exist
* @param {string} convId
* @returns {Conversation | null}
*/
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
/**
* if convId does not exist, create one
* @param {string} convId
* @param {Message} msg
*/
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
id: convId,
lastModified: Date.now(),
messages: [],
};
conv.messages.push(msg);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* Get new conversation id
* @returns {string}
*/
getNewConvId() {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
* @param {string} convId
*/
remove(convId) {
localStorage.removeItem(convId);
},
/**
* keep only the messages that match the predicate
* @param {string} convId
* @param {(msg: Message) => boolean} predicate
*/
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
conv.messages = conv.messages.filter(predicate);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* remove last message from conversation
* @param {string} convId
* @returns {Message | undefined}
*/
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
if (conv.messages.length === 0) {
StorageUtils.remove(convId);
} else {
localStorage.setItem(convId, JSON.stringify(conv));
}
return msg;
},
// manage config
getConfig() {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
// to prevent breaking changes in the future, we always provide default value for missing keys
return {
...CONFIG_DEFAULT,
...savedVal,
};
},
setConfig(config) {
localStorage.setItem('config', JSON.stringify(config));
},
getTheme() {
return localStorage.getItem('theme') || 'auto';
},
setTheme(theme) {
if (theme === 'auto') {
localStorage.removeItem('theme');
} else {
localStorage.setItem('theme', theme);
}
},
};
// scroll to bottom of chat messages
// if requiresNearBottom is true, only auto-scroll if user is near bottom
const chatScrollToBottom = (requiresNearBottom) => {
const msgListElem = document.getElementById('messages-list');
const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
if (!requiresNearBottom || (spaceToBottom < 100)) {
setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
}
};
// wrapper for SSE
async function* sendSSEPostRequest(url, fetchOptions) {
const res = await fetch(url, fetchOptions);
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
for await (const line of asyncIterator(lines)) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
};
const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalShortInput,
MessageBubble,
},
data() {
return {
conversations: StorageUtils.getAllConversations(),
/** @type {Array<Message>} */
messages: [],
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
/** @type {Message | null} */
pendingMsg: null, // the in-progress message from the assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(),
showConfigDialog: false,
// const
themes: THEMES,
/** @type {CONFIG_DEFAULT} */
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
isDev,
}
},
computed: {},
mounted() {
document.getElementById('app').classList.remove('opacity-0'); // show app
// scroll to the bottom when the pending message height is updated
const pendingMsgElem = document.getElementById('pending-msg');
const resizeObserver = new ResizeObserver(() => {
if (this.isGenerating) chatScrollToBottom(true);
});
resizeObserver.observe(pendingMsgElem);
this.setSelectedTheme(this.selectedTheme);
},
watch: {
viewingConvId: function(val, oldVal) {
if (val != oldVal) {
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}
}
},
methods: {
hideSidebar() {
document.getElementById('toggle-drawer').checked = false;
},
setSelectedTheme(theme) {
this.selectedTheme = theme;
document.body.setAttribute('data-theme', theme);
document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
StorageUtils.setTheme(theme);
},
newConversation() {
if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId();
},
setViewingConv(convId) {
if (this.isGenerating) return;
this.viewingConvId = convId;
},
deleteConv(convId) {
if (this.isGenerating) return;
if (window.confirm('Are you sure you want to delete this conversation?')) {
StorageUtils.remove(convId);
if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId();
}
this.fetchConversation();
this.fetchMessages();
}
},
downloadConv(convId) {
const conversation = StorageUtils.getOneConversation(convId);
if (!conversation) {
alert('Conversation not found.');
return;
}
const conversationJson = JSON.stringify(conversation, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
},
async sendMessage() {
// prevent sending an empty message
// typing while generating is still allowed, but sending is not (to match the UX of other chat apps)
if (!this.inputMsg || this.isGenerating) return;
const currConvId = this.viewingConvId;
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: this.inputMsg,
});
this.fetchConversation();
this.fetchMessages();
this.inputMsg = '';
this.generateMessage(currConvId);
chatScrollToBottom();
},
async generateMessage(currConvId) {
if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true;
try {
/** @type {CONFIG_DEFAULT} */
const config = this.config;
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
/** @type {Array<APIMessage>} */
let messages = [
{ role: 'system', content: config.systemMessage },
...normalizeMsgsForAPI(this.messages),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({messages});
const params = {
messages,
stream: true,
cache_prompt: true,
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
};
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
},
body: JSON.stringify(params),
signal: abortController.signal,
});
for await (const chunk of chunks) {
const stop = chunk.stop;
const addedContent = chunk.choices[0].delta.content;
const lastContent = this.pendingMsg.content || '';
if (addedContent) {
this.pendingMsg = {
id: this.pendingMsg.id,
role: 'assistant',
content: lastContent + addedContent,
};
}
const timings = chunk.timings;
if (timings && config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
}
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
setTimeout(() => document.getElementById('msg-input').focus(), 1);
} catch (error) {
if (error.name === 'AbortError') {
// user stopped the generation via stopGeneration() function
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
} else {
console.error(error);
alert(error);
// pop last user message
const lastUserMsg = StorageUtils.popMsg(currConvId);
this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
}
}
this.pendingMsg = null;
this.isGenerating = false;
this.stopGeneration = () => {};
this.fetchMessages();
chatScrollToBottom();
},
// message actions
regenerateMsg(msg) {
if (this.isGenerating) return;
// TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
const currConvId = this.viewingConvId;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return;
const currConvId = this.viewingConvId;
const newContent = msg.content;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: newContent,
});
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
// settings dialog methods
closeAndSaveConfigDialog() {
try {
if (this.config.custom.length) JSON.parse(this.config.custom);
} catch (error) {
alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
return;
}
for (const key of CONFIG_NUMERIC_KEYS) {
if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
alert(`Invalid number for ${key} (expected an integer or a float)`);
return;
}
this.config[key] = parseFloat(this.config[key]);
}
this.showConfigDialog = false;
StorageUtils.setConfig(this.config);
},
closeAndDiscardConfigDialog() {
this.showConfigDialog = false;
this.config = StorageUtils.getConfig();
},
resetConfigDialog() {
if (window.confirm('Are you sure you want to reset all settings?')) {
this.config = {...CONFIG_DEFAULT};
}
},
// sync state functions
fetchConversation() {
this.conversations = StorageUtils.getAllConversations();
},
fetchMessages() {
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
},
// debug functions
async debugImportDemoConv() {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
this.fetchConversation();
}
},
});
mainApp.config.errorHandler = alert;
try {
mainApp.mount('#app');
} catch (err) {
console.error(err);
document.getElementById('app').innerHTML = `<div style="margin:2em auto">
Failed to start app. Please try clearing localStorage and try again.<br/>
<br/>
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}
/**
* filter out redundant fields upon sending to API
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function normalizeMsgsForAPI(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
});
}
/**
 * recommended for DeepSeek-R1: filters out content between <think> and </think> tags
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function filterThoughtFromMsgs(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.role === 'assistant'
? msg.content.split('</think>').at(-1).trim()
: msg.content,
};
});
}
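// Illustrative example (made-up input): everything up to and including the
// last </think> tag is dropped from assistant messages; other roles pass
// through unchanged.
//   filterThoughtFromMsgs([
//     { role: 'assistant', content: '<think>reasoning steps...</think>The answer is 42.' },
//   ])
//   // => [{ role: 'assistant', content: 'The answer is 42.' }]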

View file

@ -0,0 +1,10 @@
import { StrictMode } from 'react';
import { createRoot } from 'react-dom/client';
import './index.scss';
import App from './App.tsx';
createRoot(document.getElementById('root')!).render(
<StrictMode>
<App />
</StrictMode>
);

View file

@ -0,0 +1,327 @@
import React, { createContext, useContext, useEffect, useState } from 'react';
import {
APIMessage,
CanvasData,
Conversation,
Message,
PendingMessage,
} from './types';
import StorageUtils from './storage';
import {
filterThoughtFromMsgs,
normalizeMsgsForAPI,
getSSEStreamAsync,
} from './misc';
import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation } from 'react-router';
interface AppContextValue {
// conversations and messages
viewingConversation: Conversation | null;
pendingMessages: Record<Conversation['id'], PendingMessage>;
isGenerating: (convId: string) => boolean;
sendMessage: (
convId: string,
content: string,
onChunk?: CallbackGeneratedChunk
) => Promise<boolean>;
stopGenerating: (convId: string) => void;
replaceMessageAndGenerate: (
convId: string,
origMsgId: Message['id'],
content?: string,
onChunk?: CallbackGeneratedChunk
) => Promise<void>;
// canvas
canvasData: CanvasData | null;
setCanvasData: (data: CanvasData | null) => void;
// config
config: typeof CONFIG_DEFAULT;
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
}
// for now, this callback is only used for scrolling to the bottom of the chat
type CallbackGeneratedChunk = () => void;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const AppContext = createContext<AppContextValue>({} as any);
export const AppContextProvider = ({
children,
}: {
children: React.ReactElement;
}) => {
const { pathname } = useLocation();
const params = matchPath('/chat/:convId', pathname);
const convId = params?.params?.convId;
const [viewingConversation, setViewingConversation] =
useState<Conversation | null>(null);
const [pendingMessages, setPendingMessages] = useState<
Record<Conversation['id'], PendingMessage>
>({});
const [aborts, setAborts] = useState<
Record<Conversation['id'], AbortController>
>({});
const [config, setConfig] = useState(StorageUtils.getConfig());
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// handle the case where the convId in the URL changes
useEffect(() => {
// also reset the canvas data
setCanvasData(null);
const handleConversationChange = (changedConvId: string) => {
if (changedConvId !== convId) return;
setViewingConversation(StorageUtils.getOneConversation(convId));
};
StorageUtils.onConversationChanged(handleConversationChange);
setViewingConversation(StorageUtils.getOneConversation(convId ?? ''));
return () => {
StorageUtils.offConversationChanged(handleConversationChange);
};
}, [convId]);
const setPending = (convId: string, pendingMsg: PendingMessage | null) => {
// if pendingMsg is null, remove the key from the object
if (!pendingMsg) {
setPendingMessages((prev) => {
const newState = { ...prev };
delete newState[convId];
return newState;
});
} else {
setPendingMessages((prev) => ({ ...prev, [convId]: pendingMsg }));
}
};
const setAbort = (convId: string, controller: AbortController | null) => {
if (!controller) {
setAborts((prev) => {
const newState = { ...prev };
delete newState[convId];
return newState;
});
} else {
setAborts((prev) => ({ ...prev, [convId]: controller }));
}
};
////////////////////////////////////////////////////////////////////////
// public functions
const isGenerating = (convId: string) => !!pendingMessages[convId];
const generateMessage = async (
convId: string,
onChunk?: CallbackGeneratedChunk
) => {
if (isGenerating(convId)) return;
const config = StorageUtils.getConfig();
const currConversation = StorageUtils.getOneConversation(convId);
if (!currConversation) {
throw new Error('Current conversation is not found');
}
const abortController = new AbortController();
setAbort(convId, abortController);
let pendingMsg: PendingMessage = {
id: Date.now() + 1,
role: 'assistant',
content: null,
};
setPending(convId, pendingMsg);
try {
// prepare messages for API
let messages: APIMessage[] = [
...(config.systemMessage.length === 0
? []
: [{ role: 'system', content: config.systemMessage } as APIMessage]),
...normalizeMsgsForAPI(currConversation?.messages ?? []),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({ messages });
// prepare params
const params = {
messages,
stream: true,
cache_prompt: true,
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
};
// send request
const fetchResponse = await fetch(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(config.apiKey
? { Authorization: `Bearer ${config.apiKey}` }
: {}),
},
body: JSON.stringify(params),
signal: abortController.signal,
});
if (fetchResponse.status !== 200) {
const body = await fetchResponse.json();
throw new Error(body?.error?.message || 'Unknown error');
}
const chunks = getSSEStreamAsync(fetchResponse);
for await (const chunk of chunks) {
// const stop = chunk.stop;
if (chunk.error) {
throw new Error(chunk.error?.message || 'Unknown error');
}
const addedContent = chunk.choices[0].delta.content;
const lastContent = pendingMsg.content || '';
if (addedContent) {
pendingMsg = {
id: pendingMsg.id,
role: 'assistant',
content: lastContent + addedContent,
};
}
const timings = chunk.timings;
if (timings && config.showTokensPerSecond) {
// only extract what's really needed, to save some space
pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
setPending(convId, pendingMsg);
onChunk?.();
}
} catch (err) {
setPending(convId, null);
if ((err as Error).name === 'AbortError') {
// user stopped the generation via stopGeneration() function
// we can safely ignore this error
} else {
console.error(err);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
alert((err as any)?.message ?? 'Unknown error');
throw err; // rethrow
}
}
if (pendingMsg.content) {
StorageUtils.appendMsg(currConversation.id, {
id: pendingMsg.id,
content: pendingMsg.content,
role: pendingMsg.role,
timings: pendingMsg.timings,
});
}
setPending(convId, null);
onChunk?.(); // trigger scroll to bottom
};
const sendMessage = async (
convId: string,
content: string,
onChunk?: CallbackGeneratedChunk
): Promise<boolean> => {
if (isGenerating(convId) || content.trim().length === 0) return false;
StorageUtils.appendMsg(convId, {
id: Date.now(),
role: 'user',
content,
});
try {
await generateMessage(convId, onChunk);
return true;
} catch (_) {
// rollback
StorageUtils.popMsg(convId);
}
return false;
};
const stopGenerating = (convId: string) => {
setPending(convId, null);
aborts[convId]?.abort();
};
// if content is undefined, we remove the last assistant message and regenerate it
const replaceMessageAndGenerate = async (
convId: string,
origMsgId: Message['id'],
content?: string,
onChunk?: CallbackGeneratedChunk
) => {
if (isGenerating(convId)) return;
StorageUtils.filterAndKeepMsgs(convId, (msg) => msg.id < origMsgId);
if (content) {
StorageUtils.appendMsg(convId, {
id: Date.now(),
role: 'user',
content,
});
}
await generateMessage(convId, onChunk);
};
const saveConfig = (config: typeof CONFIG_DEFAULT) => {
StorageUtils.setConfig(config);
setConfig(config);
};
return (
<AppContext.Provider
value={{
isGenerating,
viewingConversation,
pendingMessages,
sendMessage,
stopGenerating,
replaceMessageAndGenerate,
canvasData,
setCanvasData,
config,
saveConfig,
showSettings,
setShowSettings,
}}
>
{children}
</AppContext.Provider>
);
};
export const useAppContext = () => useContext(AppContext);
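// Illustrative usage sketch (hypothetical component, not part of this commit):
// any descendant of <AppContextProvider> can drive generation via the context.
export function SendButtonSketch({
  convId,
  draft,
}: {
  convId: string;
  draft: string;
}) {
  const { sendMessage, isGenerating } = useAppContext();
  // sendMessage resolves to false if the draft is empty or a generation is
  // already running for this conversation
  return (
    <button
      disabled={isGenerating(convId)}
      onClick={() => sendMessage(convId, draft)}
    >
      Send
    </button>
  );
}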

View file

@ -0,0 +1,38 @@
export const XCloseButton: React.ElementType<
React.ClassAttributes<HTMLButtonElement> &
React.HTMLAttributes<HTMLButtonElement>
> = ({ className, ...props }) => (
<button className={`btn btn-square btn-sm ${className ?? ''}`} {...props}>
<svg
xmlns="http://www.w3.org/2000/svg"
className="h-6 w-6"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
<path
strokeLinecap="round"
strokeLinejoin="round"
strokeWidth="2"
d="M6 18L18 6M6 6l12 12"
/>
</svg>
</button>
);
export const OpenInNewTab = ({
href,
children,
}: {
href: string;
children: string;
}) => (
<a
className="underline"
href={href}
target="_blank"
rel="noopener noreferrer"
>
{children}
</a>
);

View file

@ -0,0 +1,90 @@
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { APIMessage, Message } from './types';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { isDev } from '../Config';
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const isString = (x: any) => !!x.toLowerCase; // duck-typed: anything with a toLowerCase method counts as a string
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const isBoolean = (x: any) => x === true || x === false;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const isNumeric = (n: any) => !isString(n) && !isNaN(n) && !isBoolean(n);
export const escapeAttr = (str: string) =>
str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
// wrapper for SSE
export async function* getSSEStreamAsync(fetchResponse: Response) {
if (!fetchResponse.body) throw new Error('Response body is empty');
const lines: ReadableStream<string> = fetchResponse.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
// @ts-expect-error asyncIterator complains about type, but it should work
for await (const line of asyncIterator(lines)) {
if (isDev) console.log({ line });
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
}
// copy text to clipboard
export const copyStr = (textToCopy: string) => {
// Navigator clipboard api needs a secure context (https)
if (navigator.clipboard && window.isSecureContext) {
navigator.clipboard.writeText(textToCopy);
} else {
// Use the 'out of viewport hidden text area' trick
const textArea = document.createElement('textarea');
textArea.value = textToCopy;
// Move textarea out of the viewport so it's not visible
textArea.style.position = 'absolute';
textArea.style.left = '-999999px';
document.body.prepend(textArea);
textArea.select();
document.execCommand('copy');
}
};
/**
* filter out redundant fields upon sending to API
*/
export function normalizeMsgsForAPI(messages: Message[]) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
}) as APIMessage[];
}
/**
 * recommended for DeepSeek-R1: filters out content between <think> and </think> tags
*/
export function filterThoughtFromMsgs(messages: APIMessage[]) {
return messages.map((msg) => {
return {
role: msg.role,
content:
msg.role === 'assistant'
? msg.content.split('</think>').at(-1)!.trim()
: msg.content,
} as APIMessage;
});
}
export function classNames(classes: Record<string, boolean>): string {
return Object.entries(classes)
.filter(([_, value]) => value)
.map(([key, _]) => key)
.join(' ');
}
export const delay = (ms: number) =>
new Promise((resolve) => setTimeout(resolve, ms));
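// Illustrative usage sketch (hypothetical endpoint and payload): stream a
// chat completion and accumulate the deltas into a single string.
export async function collectCompletionSketch(
  url: string,
  body: unknown
): Promise<string> {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  });
  let text = '';
  for await (const chunk of getSSEStreamAsync(res)) {
    // chunks follow the OpenAI-style streaming schema parsed above
    text += chunk.choices?.[0]?.delta?.content ?? '';
  }
  return text;
}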

View file

@ -0,0 +1,138 @@
// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
import { CONFIG_DEFAULT } from '../Config';
import { Conversation, Message } from './types';
const event = new EventTarget();
type CallbackConversationChanged = (convId: string) => void;
let onConversationChangedHandlers: [
CallbackConversationChanged,
EventListener,
][] = [];
const dispatchConversationChange = (convId: string) => {
event.dispatchEvent(
new CustomEvent('conversationChange', { detail: { convId } })
);
};
// convId is a string prefixed with 'conv-'
const StorageUtils = {
/**
* manage conversations
*/
getAllConversations(): Conversation[] {
const res = [];
for (const key in localStorage) {
if (key.startsWith('conv-')) {
res.push(JSON.parse(localStorage.getItem(key) ?? '{}'));
}
}
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
/**
* can return null if convId does not exist
*/
getOneConversation(convId: string): Conversation | null {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
/**
* if convId does not exist, create one
*/
appendMsg(convId: string, msg: Message): void {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
id: convId,
lastModified: Date.now(),
messages: [],
};
conv.messages.push(msg);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
dispatchConversationChange(convId);
},
/**
* Get new conversation id
*/
getNewConvId(): string {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
*/
remove(convId: string): void {
localStorage.removeItem(convId);
dispatchConversationChange(convId);
},
/**
 * keep only the messages that match the predicate
*/
filterAndKeepMsgs(
convId: string,
predicate: (msg: Message) => boolean
): void {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
conv.messages = conv.messages.filter(predicate);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
dispatchConversationChange(convId);
},
/**
* remove last message from conversation
*/
popMsg(convId: string): Message | undefined {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
if (conv.messages.length === 0) {
StorageUtils.remove(convId);
} else {
localStorage.setItem(convId, JSON.stringify(conv));
}
dispatchConversationChange(convId);
return msg;
},
// event listeners
onConversationChanged(callback: CallbackConversationChanged) {
const fn = (e: Event) => callback((e as CustomEvent).detail.convId);
onConversationChangedHandlers.push([callback, fn]);
event.addEventListener('conversationChange', fn);
},
offConversationChanged(callback: CallbackConversationChanged) {
  const entry = onConversationChangedHandlers.find(([cb, _]) => cb === callback);
  if (entry) {
    event.removeEventListener('conversationChange', entry[1]);
  }
  // drop only the handler being unregistered, not every registered handler
  onConversationChangedHandlers = onConversationChangedHandlers.filter(
    ([cb, _]) => cb !== callback
  );
},
// manage config
getConfig(): typeof CONFIG_DEFAULT {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
// to prevent breaking changes in the future, we always provide default values for missing keys
return {
...CONFIG_DEFAULT,
...savedVal,
};
},
setConfig(config: typeof CONFIG_DEFAULT) {
localStorage.setItem('config', JSON.stringify(config));
},
getTheme(): string {
return localStorage.getItem('theme') || 'auto';
},
setTheme(theme: string) {
if (theme === 'auto') {
localStorage.removeItem('theme');
} else {
localStorage.setItem('theme', theme);
}
},
};
export default StorageUtils;
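// Illustrative usage sketch (hypothetical helper): watch a single conversation
// and unsubscribe via the returned cleanup function, mirroring the pattern used
// by the app context above.
export function watchConversationSketch(
  convId: string,
  onUpdate: (conv: Conversation | null) => void
): () => void {
  const handler = (changedId: string) => {
    if (changedId === convId) {
      onUpdate(StorageUtils.getOneConversation(convId));
    }
  };
  StorageUtils.onConversationChanged(handler);
  return () => StorageUtils.offConversationChanged(handler);
}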

View file

@ -0,0 +1,36 @@
export interface TimingReport {
prompt_n: number;
prompt_ms: number;
predicted_n: number;
predicted_ms: number;
}
export interface Message {
id: number;
role: 'user' | 'assistant' | 'system';
content: string;
timings?: TimingReport;
}
export type APIMessage = Pick<Message, 'role' | 'content'>;
export interface Conversation {
id: string; // format: `conv-{timestamp}`
lastModified: number; // timestamp from Date.now()
messages: Message[];
}
export type PendingMessage = Omit<Message, 'content'> & {
content: string | null;
};
export enum CanvasType {
PY_INTERPRETER,
}
export interface CanvasPyInterpreter {
type: CanvasType.PY_INTERPRETER;
content: string;
}
export type CanvasData = CanvasPyInterpreter;

View file

@ -0,0 +1 @@
/// <reference types="vite/client" />

View file

@ -0,0 +1,26 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
"target": "ES2021",
"useDefineForClassFields": true,
"lib": ["ES2021", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
"jsx": "react-jsx",
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["src"]
}

View file

@ -0,0 +1,7 @@
{
"files": [],
"references": [
{ "path": "./tsconfig.app.json" },
{ "path": "./tsconfig.node.json" }
]
}

View file

@ -0,0 +1,24 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
"target": "ES2022",
"lib": ["ES2023"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["vite.config.ts"]
}

View file

@ -1,8 +1,11 @@
+import { defineConfig, PluginOption } from 'vite';
+import react from '@vitejs/plugin-react';
 import { viteSingleFile } from 'vite-plugin-singlefile';
-import path from 'path';
-import fs from 'fs';
-import zlib from 'zlib';
+import path from 'node:path';
+import fs from 'node:fs';
+import zlib from 'node:zlib';
+/* eslint-disable */
 const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
@ -15,20 +18,26 @@ const GUIDE_FOR_FRONTEND = `
 -->
 `.trim();
+const FRONTEND_PLUGINS = [react()];
 const BUILD_PLUGINS = [
+  ...FRONTEND_PLUGINS,
   viteSingleFile(),
   (function llamaCppPlugin() {
-    let config;
+    let config: any;
     return {
       name: 'llamacpp:build',
       apply: 'build',
-      async configResolved(_config) {
+      async configResolved(_config: any) {
        config = _config;
      },
      writeBundle() {
        const outputIndexHtml = path.join(config.build.outDir, 'index.html');
-        const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
-        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
+        const content =
+          GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
+        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), {
+          level: 9,
+        });
        // because gzip header contains machine-specific info, we must remove these data from the header
        // timestamp
@ -42,18 +51,30 @@ const BUILD_PLUGINS = [
      if (compressed.byteLength > MAX_BUNDLE_SIZE) {
        throw new Error(
          `Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
-            `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
+            `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`
        );
      }
-      const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
+      const targetOutputFile = path.join(
+        config.build.outDir,
+        '../../public/index.html.gz'
+      );
      fs.writeFileSync(targetOutputFile, compressed);
-    }
-  }
+    },
+  } satisfies PluginOption;
  })(),
 ];
-/** @type {import('vite').UserConfig} */
-export default {
-  plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS,
-};
+export default defineConfig({
+  // @ts-ignore
+  plugins: process.env.ANALYZE ? FRONTEND_PLUGINS : BUILD_PLUGINS,
+  server: {
+    proxy: {
+      '/v1': 'http://localhost:8080',
+    },
+    headers: {
+      'Cross-Origin-Embedder-Policy': 'require-corp',
+      'Cross-Origin-Opener-Policy': 'same-origin',
+    },
+  },
+});
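// For reference, a minimal sketch of the header scrubbing referred to in the
// plugin above, assuming the standard gzip layout (RFC 1952): bytes 4-7 hold
// MTIME and byte 9 the OS id, both of which differ across build machines.
// The helper name is hypothetical, not from this commit.
// function makeGzipDeterministic(buf: Buffer): Buffer {
//   buf.writeUInt32LE(0, 4); // zero the MTIME field
//   buf.writeUInt8(0xff, 9); // 0xff = "unknown" OS
//   return buf;
// }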

View file

@ -274,6 +274,7 @@ endif()
 # Generate version info based on git commit.
+if(NOT DEFINED GGML_BUILD_NUMBER)
 find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
 execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
@ -290,6 +291,8 @@ execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
     OUTPUT_VARIABLE GGML_BUILD_COMMIT
     OUTPUT_STRIP_TRAILING_WHITESPACE
 )
+endif()
 # Capture variables prefixed with GGML_.

View file

@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
-        if (this_size > max_size) {
-            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
View file

@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 #if defined(__loongarch_asx)
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 #endif

View file

@ -297,6 +297,90 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
+#if defined(__loongarch_sx)
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return __lsx_vshuf_b(a, zero, tmp2);
+}
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_h(b, a);
+    __m128i tmp2 = __lsx_vpickod_h(b, a);
+    return __lsx_vadd_h(tmp1, tmp2);
+}
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_w(b, a);
+    __m128i tmp2 = __lsx_vpickod_w(b, a);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
+    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
+    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
+    return __lsx_vfadd_s(tmp1, tmp2);
+}
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =lsx_hadd_s(a, b);
+    __m128 res_1 =lsx_hadd_s(c, d);
+    __m128 res =lsx_hadd_s(res_0, res_1);
+    res =lsx_hadd_s(res, res);
+    res =lsx_hadd_s(res, res);
+    return ((v4f32)res)[0];
+}
+#endif
 #if defined(__loongarch_asx)
 #ifdef __clang__
@ -395,11 +479,6 @@ static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1
     return (__m256i)__ret;
 }
-static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
-    v4i32 __ret = {d, c, b, a};
-    return (__m128i)__ret;
-}
 static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
     v4i64 __ret = {d, c, b, a};
     return (__m256i)__ret;
@ -409,18 +488,6 @@ static __m256i lasx_insertf128( __m128i x, __m128i y) {
     return lasx_set_q(x, y);
 }
-static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
-    __m128i mask_f, zero, tmp0, tmp2, mask;
-    int f = 0x8f;
-    mask_f = __lsx_vreplgr2vr_b(f);
-    zero = __lsx_vldi(0);
-    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
-    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
-    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
-    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
-    return __lsx_vshuf_b(a, zero, tmp2);
-}
 static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
     __m256i mask_f, zero, tmp0, tmp2, mask;
     int f = 0x8f;
@ -434,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
 }
 static __m256i lasx_extu8_16(__m128i a) {
-    __m128i zero = __lsx_vldi(0);
-    __m128i vlo = __lsx_vilvl_b(zero, a);
-    __m128i vhi = __lsx_vilvh_b(zero, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_hu_bu(____m256i(a));
 }
 static __m256i lasx_ext8_16(__m128i a) {
-    __m128i sign = __lsx_vslti_b(a, 0);
-    __m128i vlo = __lsx_vilvl_b(sign, a);
-    __m128i vhi = __lsx_vilvh_b(sign, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_h_b(____m256i(a));
 }
 static __m256i lasx_ext16_32(__m128i a) {
-    __m256i tmp1;
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
-    return tmp1;
+    return __lasx_vext2xv_w_h(____m256i(a));
 }
 static __m128i lasx_extracti128( __m256i a, int pos) {
@ -482,25 +534,6 @@ static __m128 lasx_extractf128( __m256 a, int pos) {
     return ret;
 }
-static __m128i lsx_hadd_h(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_h(b, a);
-    __m128i tmp2 = __lsx_vpickod_h(b, a);
-    return __lsx_vadd_h(tmp1, tmp2);
-}
-static __m128i lsx_hadd_w(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_w(b, a);
-    __m128i tmp2 = __lsx_vpickod_w(b, a);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-static __m128 lsx_hadd_s(__m128 a, __m128 b) {
-    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
-    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
-    return __lsx_vfadd_s(tmp1, tmp2);
-}
 static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
     __m256i tmp1, tmp2;
     tmp1 = __lasx_xvmulwev_h_b(a, b);
@ -529,42 +562,6 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) {
     return __lasx_xvpickev_b(tmp1, tmp);
 }
-static __m128i lsx_packs_w(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_w(a, 15);
-    tmp1 = __lsx_vsat_w(b, 15);
-    return __lsx_vpickev_h(tmp1, tmp);
-}
-static __m128i lsx_packs_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_h(a, 7);
-    tmp1 = __lsx_vsat_h(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-static __m128i lsx_packus_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_hu(a, 7);
-    tmp1 = __lsx_vsat_hu(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_h_b(a, b);
-    tmp2 = __lsx_vmulwod_h_b(a, b);
-    return __lsx_vsadd_h(tmp1, tmp2);
-}
-static __m128i lsx_madd_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_w_h(a, b);
-    tmp2 = __lsx_vmulwod_w_h(a, b);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
     // Get absolute values of x vectors
@ -580,12 +577,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
-    ft_union tmp;
     res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
-    tmp.i = __lsx_vpickve2gr_w(res, 0);
-    return tmp.f;
+    return ((v4f32)res)[0];
 }
 // horizontally add 8 int32_t
@ -927,7 +922,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union fi;
         __m256 v0 = (__m256)__lasx_xvld( x , 0);
         __m256 v1 = (__m256)__lasx_xvld( x , 32);
         __m256 v2 = (__m256)__lasx_xvld( x , 64);
@ -945,8 +939,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
-        fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = fi.f;
+        const float max_scalar = ((v4f32)max4)[0];
         // Quantize these floats
         const float d = max_scalar / 127.f;
@ -1251,7 +1244,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union ft;
         __m256 v0 = (__m256)__lasx_xvld( x , 0 );
         __m256 v1 = (__m256)__lasx_xvld( x , 32 );
         __m256 v2 = (__m256)__lasx_xvld( x , 64 );
@ -1269,8 +1261,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
-        ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = ft.f;
+        const float max_scalar = ((v4f32)max4)[0];
         // Quantize these floats
         const float d = max_scalar / 127.f;
@ -2232,21 +2223,22 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
     sumf = hsum_float_8(acc);
 #elif defined(__loongarch_sx)
     // set constants
     const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
     const __m128i off = __lsx_vreplgr2vr_b(8);
     // Initialize accumulator with zeros
-    __m128 acc_0 = __lsx_vldi(0);
-    __m128 acc_1 = __lsx_vldi(0);
-    __m128 acc_2 = __lsx_vldi(0);
-    __m128 acc_3 = __lsx_vldi(0);
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+    __m128 acc_2 = (__m128)__lsx_vldi(0);
+    __m128 acc_3 = (__m128)__lsx_vldi(0);
     for (; ib + 1 < nb; ib += 2) {
         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
@ -2264,7 +2256,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
@ -6141,9 +6133,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
-    ft_union fi;
-    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
-    *s = hsum_float_8(acc) + fi.f ;
+    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
 #else
     const uint8_t * scales = (const uint8_t*)&utmp[0];

View file

@ -1076,29 +1076,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 8
-// F16 arithmetic is not supported by AVX, so we use F32 instead
+// F16 arithmetic is not supported by LASX, so we use F32 instead
 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }
 static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-    __lasx_xvst(y, arr, 0);
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }
 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@ -13959,8 +13953,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             tp->ec = GGML_STATUS_ABORTED;
         }
+        if (node_n + 1 < cgraph->n_nodes) {
             ggml_barrier(state->threadpool);
+        }
     }
+
+    ggml_barrier(state->threadpool);
     return 0;
 }

View file

@ -38,6 +38,7 @@
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv6.cuh"
 #include "ggml-cuda/gla.cuh"
+#include "ggml.h"
 #include <algorithm>
 #include <array>
@ -1365,8 +1366,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne13 = src1->ne[3];
     const int64_t nrows1 = ggml_nrows(src1);
-    GGML_ASSERT(ne03 == ne13);
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
@ -1380,9 +1379,11 @@ static void ggml_cuda_op_mul_mat(
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
     const int64_t i02_divisor = ne12 / ne02;
+    const int64_t i03_divisor = ne13 / ne03;
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
@ -1398,6 +1399,7 @@ static void ggml_cuda_op_mul_mat(
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
+    GGML_ASSERT(!(split && ne03 < ne13));
     ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
@ -1561,7 +1563,8 @@ static void ggml_cuda_op_mul_mat(
                 }
                 // for split tensors the data begins at i0 == i0_offset_low
-                char  * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+                const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
+                char  * src0_dd_i  = dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
                 float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
                 char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
                 float * dst_dd_i   = dev[id].dst_dd   + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@ -1605,8 +1608,9 @@ static void ggml_cuda_op_mul_mat(
                     CUDA_CHECK(cudaGetLastError());
                 }
-                if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
+                if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                        src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
                 }
                 // do the computation
@ -1881,7 +1885,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-    if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
@ -2215,12 +2219,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_op_rms_norm_back(ctx, dst);
             break;
         case GGML_OP_MUL_MAT:
-            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
-                return false;
-            } else {
-                ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
-            }
+            ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
             break;
         case GGML_OP_MUL_MAT_ID:
             ggml_cuda_mul_mat_id(ctx, dst);
@ -2997,9 +2996,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                 return false;
             }
-            if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
-                return false;
-            }
 #ifdef GGML_USE_MUSA
             if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
                 !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
@ -3139,6 +3135,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             break;
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
+            return true;
         case GGML_OP_RMS_NORM_BACK:
             return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
             break;
@ -3181,7 +3178,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
+            return true;
         case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:

View file

@ -16,7 +16,7 @@
 #include "common.cuh"
-#if CUDART_VERSION >= 11800
+#if CUDART_VERSION >= 11080
 static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
     int ret = 0;
@ -50,7 +50,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
     return ret_low | ret_high;
 }
-#endif // CUDART_VERSION >= 11800
+#endif // CUDART_VERSION >= 11080
 template <typename T>
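// Note on the fix above: CUDART_VERSION encodes major*1000 + minor*10, so
// CUDA 11.8 is 11080. The old guard of 11800 was only satisfied from CUDA 12
// onwards, which silently disabled this code path on CUDA 11.8.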

View file

@ -1,18 +1,21 @@
#include "ggml.h"
#include "common.cuh" #include "common.cuh"
#include "mmv.cuh" #include "mmv.cuh"
template <typename T, typename type_acc, int block_size> template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec( static __global__ void mul_mat_vec(
const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) { const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
const int64_t row = blockIdx.x; const int64_t row = blockIdx.x;
const int64_t channel = blockIdx.z; const int64_t channel = blockIdx.y;
const int64_t sample = blockIdx.z;
const int tid = threadIdx.x; const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int warp_size = ggml_cuda_get_physical_warp_size();
x += (channel/channel_ratio)*stride_channel_x + row*stride_row; x += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row;
y += channel *stride_channel_y; y += sample *stride_sample_y + channel *stride_channel_y;
dst += channel *stride_channel_dst; dst += sample *stride_sample_dst + channel *stride_channel_dst;
const float2 * y2 = (const float2 *) y; const float2 * y2 = (const float2 *) y;
@@ -91,12 +94,15 @@ template <typename T, typename type_acc>
 static void launch_mul_mat_vec_cuda(
         const T * x, const float * y, float * dst,
         const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
+        const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
         cudaStream_t stream) {
     GGML_ASSERT(ncols      % 2 == 0);
     GGML_ASSERT(stride_row % 2 == 0);
     GGML_ASSERT(nchannels_y % nchannels_x == 0);
+    GGML_ASSERT(nsamples_y  % nsamples_x  == 0);
     const int64_t channel_ratio = nchannels_y / nchannels_x;
+    const int64_t sample_ratio  = nsamples_y  / nsamples_x;
     int device;
     int warp_size;
@@ -118,40 +124,48 @@ static void launch_mul_mat_vec_cuda(
     }
     const int smem = warp_size*sizeof(float);
-    const dim3 block_nums(nrows, 1, nchannels_y);
+    const dim3 block_nums(nrows, nchannels_y, nsamples_y);
     const dim3 block_dims(block_size_best, 1, 1);
     switch (block_size_best) {
         case 32: {
             mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 64: {
             mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 96: {
             mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 128: {
             mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 160: {
             mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 192: {
             mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 224: {
             mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         case 256: {
             mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
-                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
         } break;
         default: {
             GGML_ABORT("fatal error");
@@ -163,16 +177,19 @@ template<typename T>
 static void mul_mat_vec_cuda(
         const T * x, const float * y, float * dst,
         const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
+        const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
         enum ggml_prec prec, cudaStream_t stream) {
     switch (prec) {
         case GGML_PREC_DEFAULT: {
-            launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
-                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
+            launch_mul_mat_vec_cuda<T, half>
+                (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
         } break;
         case GGML_PREC_F32: {
-            launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
-                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
+            launch_mul_mat_vec_cuda<T, float>
+                (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
         } break;
     }
 }
@@ -181,10 +198,19 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    GGML_ASSERT(src1->ne[1] == 1);
+    GGML_TENSOR_BINARY_OP_LOCALS;
+    const size_t ts_src0 = ggml_type_size(src0->type);
+    const size_t ts_src1 = ggml_type_size(src1->type);
+    const size_t ts_dst  = ggml_type_size(dst->type);
+    GGML_ASSERT(ne11 == 1);
+    GGML_ASSERT(ne12 == ne2);
+    GGML_ASSERT(ne13 == ne3);
+    GGML_ASSERT(nb00 == ts_src0);
+    GGML_ASSERT(nb10 == ts_src1);
+    GGML_ASSERT(nb0  == ts_dst);
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
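
For orientation, GGML_TENSOR_BINARY_OP_LOCALS expands to the usual ne*/nb* locals for src0/src1/dst, and the nb00/nb10/nb0 asserts pin down dense innermost rows. A worked example with hypothetical sizes, showing the byte-to-element stride conversion the wrapper performs once on the host:

    #include <cstddef>
    #include <cstdint>

    // sketch: convert a ggml byte stride (nb) to an element stride, as the wrapper does;
    // valid only because nb is a multiple of the element size here (nb00 == ts_src0)
    static inline int64_t elem_stride(size_t nb, size_t type_size) {
        return (int64_t)(nb / type_size);
    }

    // hypothetical contiguous F16 src0 with ne = {4096, 4096, 8, 2}, ts_src0 = 2:
    //   nb[1] = 4096*2          -> s01 = 4096        elements per row step
    //   nb[2] = 4096*4096*2     -> s02 = 16777216    elements per channel step
    //   nb[3] = 4096*4096*8*2   -> s03 = 134217728   elements per sample step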
@@ -192,29 +218,22 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
     const float * src1_d = (const float *) src1->data;
     float       *  dst_d = (float       *)  dst->data;
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne12 = src1->ne[2];
-    GGML_ASSERT(dst->ne[2] == ne12);
-    GGML_ASSERT(src0->ne[3] == 1);
-    GGML_ASSERT(src1->ne[3] == 1);
-    GGML_ASSERT( dst->ne[3] == 1);
-    const int64_t stride_row         = src0->nb[1] / ggml_type_size(src0->type);
-    const int64_t channel_stride_x   = src0->nb[2] / ggml_type_size(src0->type);
-    const int64_t channel_stride_y   = src1->nb[2] / ggml_type_size(src1->type);
-    const int64_t channel_stride_dst =  dst->nb[2] / ggml_type_size( dst->type);
+    const int64_t s01 = src0->nb[1] / ts_src0;
+    const int64_t s02 = src0->nb[2] / ts_src0;
+    const int64_t s12 = src1->nb[2] / ts_src1;
+    const int64_t s2  =  dst->nb[2] / ts_dst;
+    const int64_t s03 = src0->nb[3] / ts_src0;
+    const int64_t s13 = src1->nb[3] / ts_src1;
+    const int64_t s3  =  dst->nb[3] / ts_dst;
     switch (src0->type) {
         case GGML_TYPE_F16: {
             const half * src0_d = (const half *) src0->data;
-            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
-                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
         } break;
         case GGML_TYPE_BF16: {
             const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
-            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
-                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
         } break;
         default:
             GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
@@ -243,20 +262,27 @@ void ggml_cuda_op_mul_mat_vec(
     const int64_t stride_row         = ne00;
     const int64_t nchannels_x        = 1;
     const int64_t nchannels_y        = 1;
-    const int64_t channel_stride_x   = 0;
-    const int64_t channel_stride_y   = 0;
-    const int64_t channel_stride_dst = 0;
+    const int64_t stride_channel_x   = 0;
+    const int64_t stride_channel_y   = 0;
+    const int64_t stride_channel_dst = 0;
+    const int64_t nsamples_x         = 1;
+    const int64_t nsamples_y         = 1;
+    const int64_t stride_sample_x    = 0;
+    const int64_t stride_sample_y    = 0;
+    const int64_t stride_sample_dst  = 0;
     switch (src0->type) {
         case GGML_TYPE_F16: {
             const half * src0_d = (const half *) src0_dd_i;
             mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
-                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+                nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
         } break;
         case GGML_TYPE_BF16: {
             const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
             mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
-                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+                nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
         } break;
         default:
             GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));

View file

@@ -1,12 +1,20 @@
 #include "norm.cuh"
+#include <cstdint>
 template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+static __global__ void norm_f32(
+        const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
+        const int64_t stride_sample, const float eps) {
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+    const int row       = blockIdx.x;
+    const int channel   = blockIdx.y;
+    const int sample    = blockIdx.z;
     const int tid = threadIdx.x;
-    x   += int64_t(row)*ncols;
-    dst += int64_t(row)*ncols;
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
     float2 mean_var = make_float2(0.0f, 0.0f);
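
The destination is written densely even when the source is strided: the flattened index is exactly the row-major offset of (sample, channel, row) in a dense output. A sanity-check sketch with made-up sizes:

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t ncols = 64, nrows = 32, nchannels = 4, nsamples = 2;
        for (int64_t sample = 0; sample < nsamples; ++sample)
        for (int64_t channel = 0; channel < nchannels; ++channel)
        for (int64_t row = 0; row < nrows; ++row) {
            const int64_t flat = ((sample*nchannels + channel)*nrows + row)*ncols;
            const int64_t ref  = sample*nchannels*nrows*ncols + channel*nrows*ncols + row*ncols;
            assert(flat == ref);  // nested form equals the sum-of-strides form
        }
        return 0;
    }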
@@ -97,12 +105,19 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
 }
 template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+static __global__ void rms_norm_f32(
+        const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
+        const int64_t stride_sample, const float eps) {
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+    const int row       = blockIdx.x;
+    const int channel   = blockIdx.y;
+    const int sample    = blockIdx.z;
     const int tid = threadIdx.x;
-    x   += int64_t(row)*ncols;
-    dst += int64_t(row)*ncols;
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
     float tmp = 0.0f; // partial sum for thread in warp
@@ -186,13 +201,16 @@ static __global__ void rms_norm_back_f32(
     }
 }
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+static void norm_f32_cuda(
+        const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
+        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+        norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+        norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     }
 }
@@ -207,13 +225,16 @@ static void group_norm_f32_cuda(
     }
 }
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+static void rms_norm_f32_cuda(
+        const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
+        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+        rms_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+        rms_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     }
 }
@@ -233,19 +254,22 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *) dst->data;
     cudaStream_t stream = ctx.stream();
-    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    GGML_TENSOR_UNARY_OP_LOCALS;
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
     GGML_ASSERT(eps >= 0.0f);
-    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+    const size_t ts0 = ggml_type_size(src0->type);
+    GGML_ASSERT(nb00 == ts0);
+    const int64_t s01 = nb01 / ts0;
+    const int64_t s02 = nb02 / ts0;
+    const int64_t s03 = nb03 / ts0;
+    norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream);
 }
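
Note the relaxation here: the old path required a fully contiguous src0, the new one only asserts dense rows (nb00 == type size) and forwards the remaining strides. A sketch of the stride setup under that assumption (hypothetical helper, not ggml API):

    #include <cstddef>
    #include <cstdint>

    struct Strides { int64_t s01, s02, s03; };

    // hypothetical helper mirroring the wrapper: rows must be dense (nb[0] == ts0),
    // all outer dimensions may be strided, e.g. a permuted or padded view
    static Strides make_strides(const size_t nb[4], size_t ts0) {
        return { (int64_t)(nb[1]/ts0), (int64_t)(nb[2]/ts0), (int64_t)(nb[3]/ts0) };
    }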
 void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -254,8 +278,6 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
-    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -275,19 +297,22 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *) dst->data;
     cudaStream_t stream = ctx.stream();
-    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    GGML_TENSOR_UNARY_OP_LOCALS;
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
     GGML_ASSERT(eps >= 0.0f);
-    rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+    const size_t ts0 = ggml_type_size(src0->type);
+    GGML_ASSERT(nb00 == ts0);
+    const int64_t s01 = nb01 / ts0;
+    const int64_t s02 = nb02 / ts0;
+    const int64_t s03 = nb03 / ts0;
+    rms_norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream);
 }
 void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View file

@@ -46,6 +46,9 @@ endif()
 message(STATUS "HIP and hipBLAS found")
+# Workaround old compilers
+set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")
 file(GLOB   GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
 list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")

View file

@@ -19,8 +19,15 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
+#ifndef TARGET_OS_VISION
+#define TARGET_OS_VISION 0
+#endif
 // create residency sets only on macOS >= 15.0
-#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
+    TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
 #define GGML_METAL_HAS_RESIDENCY_SETS 1
 #endif
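
Why the TARGET_OS_VISION fallback is needed, as far as I can tell: SDKs that predate visionOS leave the macro undefined, and defining it to 0 keeps the #if ladder well-formed under -Wundef-style diagnostics. A minimal sketch of the guard pattern (macro names as assumed from the Apple SDKs):

    /* portable guard for a platform macro that only newer SDKs define */
    #ifndef TARGET_OS_VISION
    #define TARGET_OS_VISION 0   /* older SDK: treat the platform as absent */
    #endif

    #if TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
    /* visionOS-only compile path; @available(...) still gates it at run time */
    #endif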
@@ -1071,7 +1078,7 @@ static bool ggml_backend_metal_buffer_rset_init(
     }
 #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
-    if (@available(macOS 15.0, *)) {
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
         MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
         desc.label = @"ggml_backend_metal";
         desc.initialCapacity = ctx->n_buffers;
@@ -1106,7 +1113,7 @@ static bool ggml_backend_metal_buffer_rset_init(
 // rset free
 static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
 #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
-    if (@available(macOS 15.0, *)) {
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
         if (ctx->rset) {
             [ctx->rset endResidency];
             [ctx->rset removeAllAllocations];
@@ -1201,12 +1208,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_GROUP_NORM:
-            return has_simdgroup_reduction;
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
         case GGML_OP_RMS_NORM:
-            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
         case GGML_OP_ARGMAX:
-        case GGML_OP_NORM:
             return true;
+        case GGML_OP_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
         case GGML_OP_ROPE:
         {
             const int mode = ((const int32_t *) op->op_params)[2];

View file

@@ -1045,7 +1045,28 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
         ggml_free(ctx);
         return false;
     }
-    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
+    uint64_t src_size   = (uint64_t) ggml_nbytes(src);
+    uint64_t dst_data   = (uint64_t) dst->data;
+    uint64_t dst_base   = (uint64_t) ggml_backend_buffer_get_base(dst->buffer);
+    uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
+    if (dst_data + src_size > dst_base + dst_buf_sz) {
+        GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
+                         "    write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
+                         "    buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
+                         __func__,
+                         dst_data,
+                         dst_data + src_size,
+                         dst_base,
+                         dst_base + dst_buf_sz);
+        ggml_free(ctx);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
+                     __func__, (void*) src->buffer, (void*) dst->buffer);
     response.result = ggml_backend_buffer_copy_tensor(src, dst);
     ggml_free(ctx);
     return true;
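
The new check rejects a write whose end would pass the end of the destination buffer. A standalone sketch of the same arithmetic; note that, like the patch itself, it assumes the unsigned additions do not wrap around 2^64:

    #include <cstdint>

    // does [dst_data, dst_data + src_size) fit inside [dst_base, dst_base + dst_buf_sz)?
    static bool write_in_bounds(uint64_t dst_data, uint64_t src_size,
                                uint64_t dst_base, uint64_t dst_buf_sz) {
        // same comparison as the patch; assumes dst_data >= dst_base and no wraparound
        return dst_data + src_size <= dst_base + dst_buf_sz;
    }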

View file

@@ -103,11 +103,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
     name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
     auto global_mem_size = prop.get_global_mem_size()/1000000;
-    std::string xmx = gpu_has_xmx(device) ? "yes" : "no";
-    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(),
+    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
                   name.c_str(), version.c_str(), prop.get_max_compute_units(),
                   prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
-                  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str(), xmx.c_str());
+                  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }
 void ggml_backend_sycl_print_sycl_devices() {
@@ -118,16 +117,16 @@ void ggml_backend_sycl_print_sycl_devices() {
     GGML_LOG_INFO(
         "| | | | "
-        " |Max | |Max |Global | | XMX |\n");
+        " |Max | |Max |Global | |\n");
     GGML_LOG_INFO(
         "| | | | "
-        " |compute|Max work|sub |mem | | or |\n");
+        " |compute|Max work|sub |mem | |\n");
     GGML_LOG_INFO(
         "|ID| Device Type| "
-        "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n");
+        "Name|Version|units |group |group|size | Driver version|\n");
     GGML_LOG_INFO(
         "|--|-------------------|---------------------------------------|------"
-        "-|-------|--------|-----|-------|---------------------|--------------|\n");
+        "-|-------|--------|-----|-------|---------------------|\n");
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
@@ -4537,14 +4536,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_LOG:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
+            return true;
+        case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
@@ -4576,7 +4578,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_LEAKY_RELU:

View file

@@ -156,6 +156,7 @@ struct vk_device_struct {
     vk::PhysicalDeviceProperties properties;
     std::string name;
     uint64_t max_memory_allocation_size;
+    uint64_t suballocation_block_size;
     bool fp16;
     bool pipeline_robustness;
     vk::Device device;
@@ -183,12 +184,12 @@ struct vk_device_struct {
     size_t idx;
-    bool mul_mat_l;
-    bool mul_mat_m;
-    bool mul_mat_s;
-    bool mul_mat_id_l;
-    bool mul_mat_id_m;
-    bool mul_mat_id_s;
+    bool mul_mat_l[GGML_TYPE_COUNT];
+    bool mul_mat_m[GGML_TYPE_COUNT];
+    bool mul_mat_s[GGML_TYPE_COUNT];
+    bool mul_mat_id_l[GGML_TYPE_COUNT];
+    bool mul_mat_id_m[GGML_TYPE_COUNT];
+    bool mul_mat_id_s[GGML_TYPE_COUNT];
     // set to true to indicate that some shaders need to be compiled after the dryrun
     bool need_compiles {};
@@ -1377,7 +1378,33 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ
     return {64, 64};
 };
-static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id) {
+static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
+    uint32_t lut_size = 0;
+    switch (src0_type) {
+    case GGML_TYPE_IQ2_XXS:
+        lut_size = 8*256;
+        break;
+    case GGML_TYPE_IQ2_XS:
+        lut_size = 8*512;
+        break;
+    case GGML_TYPE_IQ2_S:
+        lut_size = 8*1024;
+        break;
+    case GGML_TYPE_IQ3_XXS:
+        lut_size = 4*256;
+        break;
+    case GGML_TYPE_IQ3_S:
+        lut_size = 4*512;
+        break;
+    case GGML_TYPE_IQ4_NL:
+    case GGML_TYPE_IQ4_XS:
+        lut_size = 4*16;
+        break;
+    default:
+        break;
+    }
     // Needs to be kept up to date on shader changes
     const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1;
     const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
@@ -1387,7 +1414,13 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
     const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
     const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;
-    return (load_bufs + mmid_row_ids + coopmat_stage) <= device->properties.limits.maxComputeSharedMemorySize;
+    const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
+    const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
+    VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), "
+                 "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", supported=" << supported);
+    return supported;
 }
 static void ggml_vk_load_shaders(vk_device& device) {
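
A condensed model of the shared-memory accounting above, with the per-type LUT folded in (the switch earlier in the hunk supplies lut_size, e.g. 8*1024 bytes for IQ2_S); a sketch, not the Vulkan backend's actual interface:

    #include <cstdint>

    // shared-memory demand of one matmul variant, per the accounting above
    static bool fits_in_shmem(uint32_t load_bufs, uint32_t mmid_row_ids,
                              uint32_t coopmat_stage, uint32_t lut_size,
                              uint32_t max_shmem) {
        const uint32_t total = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
        return total <= max_shmem;  // e.g. IQ2_S adds an 8*1024 B dequant LUT
    }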
@@ -1471,62 +1504,32 @@ static void ggml_vk_load_shaders(vk_device& device) {
         m_align =  64;
         s_align =  32;
-        // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders
-        // and tile sizes, this should handle 16KB, 32KB, and 48KB+.
-        // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders.
-        // But the numbers happen to work out for 32KB shared memory size that when using the medium
-        // size there's enough room for everything, and we assert for this.
-        uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
-        if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
-            l_warptile = m_warptile;
-            l_wg_denoms = m_wg_denoms;
-            shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
-            GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
-        }
-        if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
-            // assert mul_mat_mat_id shaders will fit.
-            GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
-        }
-        shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
-        if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
-            if (device->properties.limits.maxComputeSharedMemorySize == 32768) {
-                l_warptile_mmq = m_warptile_mmq;
-                l_mmq_wg_denoms = m_mmq_wg_denoms;
-            } else {
-                l_warptile_mmq = s_warptile_mmq;
-                l_mmq_wg_denoms = s_mmq_wg_denoms;
-            }
-            shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
-            GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
-        }
-        if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
-            // assert mul_mat_mat_id shaders will fit.
-            GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
-        }
+        for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
+            ggml_type t = (ggml_type)i;
             // Disable medium and large matrix multiplication if not enough shared memory is available
             // Check mmq warptiles as the largest configuration
             // Throw an error if not enough for any matrix multiplication is available
-            if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false)) {
+            if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false, t)) {
                 std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl;
                 throw std::runtime_error("Shared memory size too small for matrix multiplication.");
-            } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false)) {
-                device->mul_mat_m = false;
-                device->mul_mat_l = false;
-            } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false)) {
-                device->mul_mat_l = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false, t)) {
+                device->mul_mat_m[i] = false;
+                device->mul_mat_l[i] = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false, t)) {
+                device->mul_mat_l[i] = false;
             }
             // Disable mul_mat_id if not enough shared memory is available
-            if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true)) {
-                device->mul_mat_id_s = false;
-                device->mul_mat_id_m = false;
-                device->mul_mat_id_l = false;
-            } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true)) {
-                device->mul_mat_id_m = false;
-                device->mul_mat_id_l = false;
-            } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true)) {
-                device->mul_mat_id_l = false;
+            if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true, t)) {
+                device->mul_mat_id_s[i] = false;
+                device->mul_mat_id_m[i] = false;
+                device->mul_mat_id_l[i] = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true, t)) {
+                device->mul_mat_id_m[i] = false;
+                device->mul_mat_id_l[i] = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true, t)) {
+                device->mul_mat_id_l[i] = false;
+            }
         }
     }
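
With the capability flags widened to per-type arrays, each quantization format can independently lose its large or medium tiles when its LUT pushes the total over the device limit. A sketch of the kind of lookup this enables (hypothetical accessor, not a function in the backend):

    enum TileSize { TILE_S, TILE_M, TILE_L };

    // hypothetical accessor: largest tile size still enabled for a given type index
    static TileSize pick_tile(const bool l[], const bool m[], int type) {
        if (l[type]) return TILE_L;  // large tile fits in shared memory
        if (m[type]) return TILE_M;  // fall back to medium
        return TILE_S;               // small always fits or loading already threw
    }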
@@ -1621,6 +1624,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         //CREATE_FA(GGML_TYPE_IQ2_S, iq2_s)
         //CREATE_FA(GGML_TYPE_IQ3_XXS, iq3_xxs)
         //CREATE_FA(GGML_TYPE_IQ3_S, iq3_s)
+        //CREATE_FA(GGML_TYPE_IQ4_XS, iq4_xs)
         CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
 #undef CREATE_FA
@@ -1654,6 +1658,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
         CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
         CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
         CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
         CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
@@ -1672,6 +1677,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
         CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
         CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
         CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
 #undef CREATE_MM
 #undef CREATE_MM2
@@ -1680,115 +1686,116 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
     if (device->coopmat_support) {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
-#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        if (device->mul_mat ## ID ## _l) \
+#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
-        if (device->mul_mat ## ID ## _m) \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
-        if (device->mul_mat ## ID ## _s) \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
-        if (device->mul_mat ## ID ## _l) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
-        if (device->mul_mat ## ID ## _m) \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
-        if (device->mul_mat ## ID ## _s) \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
         // Create 2 variants, {f16,f32} accumulator
-#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
         if (device->coopmat_acc_f16_support) { \
-            CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+            CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
         } \
         if (device->coopmat_acc_f32_support) { \
-            CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+            CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
         } \
-        CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
         if (device->coopmat_acc_f16_support) {
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
         } else {
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
         }
-        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
-        if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
-            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
         if (device->coopmat_acc_f16_support) {
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else { } else {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} }
#undef CREATE_MM2 #undef CREATE_MM2
#undef CREATE_MM #undef CREATE_MM
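
The hunk above is mechanical: every CREATE_MM/CREATE_MM2 invocation gains a leading GGML_TYPE_* argument, the blanket `if (device->mul_mat_id_s || ...)` guard is dropped, and (as the macro hunk below shows) the per-size support flags become arrays indexed by that type, so pipeline creation can be gated per quantization type. A minimal C++ sketch of that pattern; all names here are hypothetical stand-ins for the real declarations in ggml-vulkan.cpp:

    // Sketch only: one capability flag per type replaces a single boolean.
    enum sketch_type { SK_F32, SK_F16, SK_Q4_0, SK_TYPE_COUNT };

    struct sketch_device {
        // before: bool mul_mat_id_l;        // one flag covered every type
        bool mul_mat_id_l[SK_TYPE_COUNT];    // after: one flag per type
    };

    static bool should_create_l(const sketch_device& d, sketch_type t) {
        return d.mul_mat_id_l[t];            // gate pipeline creation per type
    }
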
@@ -1796,137 +1803,135 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
     if (device->fp16) {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
-#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        if (device->mul_mat ## ID ## _l) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
-        if (device->mul_mat ## ID ## _m) \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
-        if (device->mul_mat ## ID ## _s) \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
-        if (device->mul_mat ## ID ## _l) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
-        if (device->mul_mat ## ID ## _m) \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
-        if (device->mul_mat ## ID ## _s) \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
         // Create 2 variants, {f16,f32} accumulator
-#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
-        if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
-            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-        }
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
 #undef CREATE_MM2
 #undef CREATE_MM
     } else {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
-#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        if (device->mul_mat ## ID ## _l) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
-        if (device->mul_mat ## ID ## _m) \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
-        if (device->mul_mat ## ID ## _s) \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
-        if (device->mul_mat ## ID ## _l) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
-        if (device->mul_mat ## ID ## _m) \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
-        if (device->mul_mat ## ID ## _s) \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
-        CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
-        if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
-            CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM(pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM(pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-        }
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
 #undef CREATE_MM
     }
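
Both branches of this hunk lean on the CREATE_MM2 helper, which fans one shader family out into an f16-accumulator and an f32-accumulator pipeline. A compilable toy reduction of that macro pairing follows; register_pipeline is a stub standing in for ggml_vk_create_pipeline, and the real macros additionally forward workgroup denominators, warptile configs, and push-constant layouts:

    #include <cstdio>

    // Stub standing in for ggml_vk_create_pipeline in this sketch.
    static void register_pipeline(int type, const char* pipe, const char* shader) {
        std::printf("type=%d pipe=%s shader=%s\n", type, pipe, shader);
    }

    #define CREATE_MM(TYPE, PIPE, NAMELC, F16ACC) \
        register_pipeline(TYPE, #PIPE, #NAMELC #F16ACC);

    // One call registers both accumulator precisions of the same shader.
    #define CREATE_MM2(TYPE, PIPE, NAMELC) \
        CREATE_MM(TYPE, PIPE.f16acc, NAMELC, _f16acc) \
        CREATE_MM(TYPE, PIPE.f32acc, NAMELC, )

    int main() {
        CREATE_MM2(1 /* e.g. GGML_TYPE_F16 */, pipeline_matmul_f16, matmul_f16)
        return 0;
    }
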
@@ -1961,6 +1966,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
@@ -1980,6 +1986,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
     }
@@ -2000,6 +2007,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
     // dequant shaders
@@ -2019,6 +2027,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_XS], "dequant_iq4_xs", dequant_iq4_xs_len, dequant_iq4_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
     // get_rows
@@ -2034,6 +2043,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -2048,6 +2058,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs_f32", get_rows_iq4_xs_f32_len, get_rows_iq4_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
@@ -2269,6 +2280,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->physical_device.getProperties2(&props2);
         device->properties = props2.properties;
+        device->vendor_id = device->properties.vendorID;

         const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
@@ -2280,7 +2292,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
             device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
         }

-        device->vendor_id = device->properties.vendorID;
+        const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
+        if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
+            device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+#if defined(_WIN32)
+        } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+            // Limit batching of allocations to 1GB by default to avoid fragmentation issues
+            device->suballocation_block_size = 1024*1024*1024;
+#endif
+        } else {
+            device->suballocation_block_size = device->max_memory_allocation_size;
+        }
+        device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
+
         device->subgroup_size = subgroup_props.subgroupSize;
         device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
         if (sm_builtins) {
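The hunk above decides how large each backing device allocation may grow before the allocator starts a new one. A minimal standalone sketch of the same rule, assuming only the env-var name and the 1 GiB NVIDIA-on-Windows default taken from this hunk (the surrounding vk_device struct is elided):

    #include <algorithm>
    #include <cstdint>
    #include <cstdlib>
    #include <string>

    // Mirrors the precedence in the hunk above:
    // explicit override > Windows/NVIDIA 1 GiB cap > device maximum.
    uint64_t pick_suballocation_block_size(uint64_t max_alloc, bool nvidia_on_windows) {
        uint64_t block = max_alloc;
        if (const char * env = std::getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE")) {
            block = std::stoul(env);          // user override wins
        } else if (nvidia_on_windows) {
            block = 1024ull * 1024 * 1024;    // avoid fragmentation from huge batched allocations
        }
        return std::min(block, max_alloc);    // final clamp, as in the diff
    }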
@@ -2591,35 +2616,37 @@ static vk_device ggml_vk_get_device(size_t idx) {

         // Shaders
         // Disable matmul tile sizes early if performance low or not supported
+        for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
         switch (device->vendor_id) {
 #ifndef GGML_VULKAN_RUN_TESTS
         case VK_VENDOR_ID_AMD:
         case VK_VENDOR_ID_INTEL:
-            device->mul_mat_l = false;
-            device->mul_mat_m = true;
-            device->mul_mat_s = true;
-            device->mul_mat_id_l = false;
-            device->mul_mat_id_m = true;
-            device->mul_mat_id_s = true;
+            device->mul_mat_l[i] = false;
+            device->mul_mat_m[i] = true;
+            device->mul_mat_s[i] = true;
+            device->mul_mat_id_l[i] = false;
+            device->mul_mat_id_m[i] = true;
+            device->mul_mat_id_s[i] = true;
             break;
         case VK_VENDOR_ID_APPLE:
-            device->mul_mat_l = false;
-            device->mul_mat_m = true;
-            device->mul_mat_s = false;
-            device->mul_mat_id_l = false;
-            device->mul_mat_id_m = true;
-            device->mul_mat_id_s = false;
+            device->mul_mat_l[i] = false;
+            device->mul_mat_m[i] = true;
+            device->mul_mat_s[i] = false;
+            device->mul_mat_id_l[i] = false;
+            device->mul_mat_id_m[i] = true;
+            device->mul_mat_id_s[i] = false;
             break;
 #endif
         default:
-            device->mul_mat_l = true;
-            device->mul_mat_m = true;
-            device->mul_mat_s = true;
-            device->mul_mat_id_l = true;
-            device->mul_mat_id_m = true;
-            device->mul_mat_id_s = true;
+            device->mul_mat_l[i] = true;
+            device->mul_mat_m[i] = true;
+            device->mul_mat_s[i] = true;
+            device->mul_mat_id_l[i] = true;
+            device->mul_mat_id_m[i] = true;
+            device->mul_mat_id_s[i] = true;
             break;
         }
+        }

         ggml_vk_load_shaders(device);
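The new loop turns the six mul_mat toggles from single booleans into per-quantization-type tables. A sketch of the shape of that change (field names as in the hunk; the type count is a stand-in, the real value is GGML_TYPE_COUNT from ggml.h, and the enclosing vk_device is reduced to the relevant members):

    #include <array>
    #include <cstddef>

    constexpr size_t TYPE_COUNT = 39; // stand-in for GGML_TYPE_COUNT

    struct vk_device_flags {
        // was: bool mul_mat_l; now one flag per src0 quantization type,
        // so a slow tile size can be disabled for a single type only
        std::array<bool, TYPE_COUNT> mul_mat_l{}, mul_mat_m{}, mul_mat_s{};
        std::array<bool, TYPE_COUNT> mul_mat_id_l{}, mul_mat_id_m{}, mul_mat_id_s{};
    };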
@@ -2748,8 +2775,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";

     std::string device_name = props2.properties.deviceName.data();
-    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n",
-                   idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str());
+    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n",
+                   idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
+                   props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str());

     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
@@ -2980,6 +3008,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             break;
         default:
@@ -3033,6 +3062,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             break;
         default:
@@ -3069,6 +3099,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             break;
         default:
@@ -3117,6 +3148,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             break;
         default:
@@ -3148,6 +3180,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             break;
         default:
@@ -3718,31 +3751,31 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
     return split_k;
 }

-static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
-    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");

     if (ctx->device->coopmat2) {
-        if ((ctx->device->mul_mat_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_s)) {
+        if ((ctx->device->mul_mat_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
             return aligned ? mmp->a_l : mmp->l;
         }
-        if ((ctx->device->mul_mat_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s) {
+        if ((ctx->device->mul_mat_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s[src0_type]) {
             return aligned ? mmp->a_m : mmp->m;
         }
         return aligned ? mmp->a_s : mmp->s;
     }

-    if ((ctx->device->mul_mat_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_l)) {
+    if ((ctx->device->mul_mat_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_l[src0_type])) {
         return aligned ? mmp->a_s : mmp->s;
     }
-    if ((ctx->device->mul_mat_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l) {
+    if ((ctx->device->mul_mat_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l[src0_type]) {
         return aligned ? mmp->a_m : mmp->m;
     }
     return aligned ? mmp->a_l : mmp->l;
 }

-static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
-    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
-    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
+static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type)->align;
 }

 static void ggml_vk_matmul(
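Stripped of the Vulkan handles, the non-coopmat2 path above is a small shape heuristic. A sketch, with Tile standing in for the s/m/l pipeline variants and the *_ok flags for the new per-type arrays:

    enum class Tile { S, M, L };

    // m, n: output tile dimensions; s_ok/m_ok/l_ok: the device allows that tile
    // size for this src0 type. Same fallback order as the function above:
    // small shapes (or nothing else enabled) -> S, medium -> M, otherwise L.
    Tile guess_tile(int m, int n, bool s_ok, bool m_ok, bool l_ok) {
        if ((s_ok && (m <= 32 || n <= 32)) || (!m_ok && !l_ok)) {
            return Tile::S;
        }
        if ((m_ok && (m <= 64 || n <= 64)) || !l_ok) {
            return Tile::M;
        }
        return Tile::L;
    }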
@@ -3769,31 +3802,31 @@ static void ggml_vk_matmul(
         ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
 }

-static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
-    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
+static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");

     if (ctx->device->coopmat2) {
-        if ((ctx->device->mul_mat_id_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_s)) {
+        if ((ctx->device->mul_mat_id_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
             return aligned ? mmp->a_l : mmp->l;
         }
-        if ((ctx->device->mul_mat_id_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s) {
+        if ((ctx->device->mul_mat_id_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s[src0_type]) {
             return aligned ? mmp->a_m : mmp->m;
         }
         return aligned ? mmp->a_s : mmp->s;
     }

-    if ((ctx->device->mul_mat_id_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_l)) {
+    if ((ctx->device->mul_mat_id_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_l[src0_type])) {
         return aligned ? mmp->a_s : mmp->s;
     }
-    if ((ctx->device->mul_mat_id_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l) {
+    if ((ctx->device->mul_mat_id_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l[src0_type]) {
         return aligned ? mmp->a_m : mmp->m;
     }
     return aligned ? mmp->a_l : mmp->l;
 }

-static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
-    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
-    return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true)->align;
+static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
+    return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true, src0_type)->align;
 }

 static void ggml_vk_matmul_id(
@@ -3974,10 +4007,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;

-    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
+    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
     const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;

-    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
+    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);

     const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
@@ -4556,10 +4589,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const uint64_t y_ne = ne11 * ne10;
     const uint64_t d_ne = ne21 * ne20;

-    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1));
+    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
     const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;

-    vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned);
+    vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);

     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -7561,7 +7594,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type

 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }

 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
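From the allocator's point of view this is the visible change: the buffer type's reported max size drops from the raw device allocation limit to the (possibly capped) suballocation block size, so large tensors get split across more, smaller device allocations. Consumers keep using the public accessor; a trivial usage sketch:

    #include "ggml-backend.h"

    // Largest single buffer the Vulkan buffer type will hand out; after this
    // change it reflects suballocation_block_size rather than the device limit.
    size_t vk_max_buffer_chunk(ggml_backend_buffer_type_t buft) {
        return ggml_backend_buft_get_max_size(buft);
    }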
@@ -7998,13 +8031,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
+                ggml_type src0_type = op->src[0]->type;
                 ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
                 const vk_device& device = ggml_vk_get_device(ctx->device);
-                if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s && !device->mul_mat_id_m && !device->mul_mat_id_l) {
+                if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
                     // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
                     return false;
                 }
-                switch (op->src[0]->type) {
+                switch (src0_type) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16:
                     case GGML_TYPE_Q4_0:
@@ -8022,6 +8056,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                     case GGML_TYPE_IQ2_S:
                     case GGML_TYPE_IQ3_XXS:
                     case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_IQ4_NL:
                         break;
                     default:
@@ -8095,6 +8130,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                     //case GGML_TYPE_IQ2_S:
                     //case GGML_TYPE_IQ3_XXS:
                     //case GGML_TYPE_IQ3_S:
+                    //case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_IQ4_NL:
                         break;
                     default:
@@ -8117,6 +8153,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                     case GGML_TYPE_IQ2_S:
                     case GGML_TYPE_IQ3_XXS:
                     case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_IQ4_NL:
                         return true;
                     default:
@@ -8182,9 +8219,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            return true;
         case GGML_OP_NORM:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_RMS_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_ADD:
         case GGML_OP_ACC:
         case GGML_OP_MUL:
View file
@@ -12,7 +12,7 @@ layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 #endif

 void main() {
-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
     if (gl_LocalInvocationIndex.x != 0) {
         return;
View file
@@ -217,7 +217,7 @@ void quantize(uint dst_idx, uint src_idx)
 #endif

 void main() {
-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
     if (gl_LocalInvocationIndex.x != 0) {
         return;
View file
@@ -304,6 +304,42 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 }
 #endif

+#if defined(DATA_A_IQ4_XS)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint iq = 16 * ib32 + (iqs % 16);
+
+    const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+    const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
+    const uint qshift = (iqs & 16) >> 2;
+    u8vec2 qs = u8vec2(data_a[a_offset + ib].qs[iq], data_a[a_offset + ib].qs[iq + 1]);
+    qs = (qs >> qshift) & uint8_t(0xF);
+
+    const float dl = float(int(sl | (sh << 4)) - 32);
+    return dl * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint iq = 16 * ib32 + (iqs % 16);
+
+    const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+    const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
+    const uint qshift = (iqs & 16) >> 2;
+    u8vec4 qs = u8vec4(
+        data_a[a_offset + ib].qs[iq + 0],
+        data_a[a_offset + ib].qs[iq + 1],
+        data_a[a_offset + ib].qs[iq + 2],
+        data_a[a_offset + ib].qs[iq + 3]
+    );
+    qs = (qs >> qshift) & uint8_t(0xF);
+
+    const float dl = float(int(sl | (sh << 4)) - 32);
+    return dl * vec4(
+        kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y],
+        kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);
+}
+#endif
+
 #if defined(DATA_A_IQ4_NL)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
@@ -321,7 +357,7 @@ vec2 get_dm(uint ib, uint a_offset) {
 }
 #endif

-#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
 vec2 get_dm(uint ib, uint a_offset) {
     return vec2(float(data_a[a_offset + ib].d), 0);
 }
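The two new IQ4_XS shader functions above share one scalar recipe: a 6-bit subblock scale biased by 32, and 4-bit codes indexing the IQ4_NL value table. A host-side C++ restatement of one lane of it, assuming only the layout used in this diff:

    #include <cstdint>

    // Shared IQ4_NL/IQ4_XS codebook, copied from the shader's kvalues_iq4nl_const.
    static const int8_t kvalues_iq4nl[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10,
           1,   13,  25,  38,  53,  69,  89, 113,
    };

    // d: fp32 block scale, sl/sh: low 4 and high 2 bits of the subblock scale,
    // q: one 4-bit code. Matches dl * kvalues_iq4nl[...] in the GLSL above.
    inline float dequant_iq4_xs_value(float d, uint32_t sl, uint32_t sh, uint32_t q) {
        const float dl = d * (int(sl | (sh << 4)) - 32); // 6-bit scale, bias 32
        return dl * kvalues_iq4nl[q & 0xF];
    }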
View file
@@ -323,15 +323,16 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
     const uint8_t qs = bl.block.qs[iqs];
     const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));

-    const float16_t dscale = bl.block.d * 0.25hf * (0.5hf + float16_t(signscale >> 28));
+    const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
     uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
     sign |= bitCount(sign) << 7;

-    const uint8_t g = unpack8(iq2xxs_grid[qs][(idx & 4) >> 2])[idx & 3];
+    uint g2 = iq2xxs_grid[qs][(idx & 4) >> 2];
+    g2 >>= (idx & 2) * 8;
+    const vec2 g = vec2(unpack8(g2));

-    float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
-    return ret;
+    vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
+    return float16_t(ret[idx & 1]);
 }
 #endif
@@ -350,14 +351,16 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor
     const uint iqs = (idx & 0xF8) >> 3; // 0..63
     const uint16_t qs = bl.block.qs[iqs];

-    const float16_t dscale = bl.block.d * 0.25hf * (0.5hf + float16_t((bl.block.scales[is] >> sshift) & 0xF));
+    const float dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF));
     uint sign = uint(qs >> 9);
     sign |= bitCount(sign) << 7;

-    const uint8_t g = unpack8(iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2])[idx & 3];
+    uint g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2];
+    g2 >>= (idx & 2) * 8;
+    const vec2 g = vec2(unpack8(g2));

-    float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
-    return ret;
+    vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
+    return float16_t(ret[idx & 1]);
 }
 #endif
@@ -369,24 +372,23 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2
 float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     uint idx = coordInBlock[1];
-    uint lsb = idx & 1;
-    idx /= 2;

-    const uint ib8 = (idx % 128) / 4; // 0..31
-    const uint ib32 = ib8 / 4; // 0..7
+    const uint ib32 = (idx & 0xE0) >> 5; // 0..7
+    const uint ib8 = (idx & 0xF8) >> 3; // 0..31
+    const uint qhshift = 2 * (ib8 % 4);

-    const uint scale = (bl.block.scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
+    const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf;
     const uint qs = bl.block.qs[ib8];
     const uint qh = bl.block.qh[ib32];
-    const uint qhshift = 2 * (ib8 % 4);
-    const uint sign = bl.block.qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4));
+    const uint sign = bl.block.qs[QUANT_K / 8 + ib8] >> (idx & 0x6);

     const float d = float(bl.block.d);
     const float db = d * 0.25 * (0.5 + scale);
-    const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
-    const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1];
-    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid));
-    return float16_t(v[lsb]);
+    const ivec2 sign01 = 1 - (2 & ivec2(sign << 1, sign));
+    uint g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2];
+    g2 >>= (idx & 2) * 8;
+    const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
+    return float16_t(v[idx & 1]);
 }
 #endif
@@ -401,28 +403,25 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3
 float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
     uint idx = coordInBlock[1];
-    uint lsb = idx & 1;
-    idx /= 2;

-    const uint iqs = (idx % 128) / 2; // 0..63
-    const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
+    const uint iqs = (idx & 0xFC) >> 2; // 0..63
+    const uint is = QUANT_K / 4 + ((idx & 0xE0) >> 3);// 8 values

     const float d = float(bl.block.d);
     const uint qs = bl.block.qs[iqs];
-    const uint signs = pack32(u8vec4(
-        bl.block.qs[is+0],
-        bl.block.qs[is+1],
-        bl.block.qs[is+2],
-        bl.block.qs[is+3]
+    const uint signs = pack32(u16vec2(
+        bl16.block.qs[is/2+0],
+        bl16.block.qs[is/2+1]
     ));

     const float db = d * 0.5 * (0.5 + (signs >> 28));
     const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
-    const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
-    const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
-    const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1));
+    const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6);
+    const ivec2 sign01 = ivec2(1 - (2 & ivec2(sign << 1, sign)));
+    const uint grid = iq3xxs_grid[qs] >> (16 * ((idx & 2) >> 1));
     const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
-    return float16_t(v[lsb]);
+    return float16_t(v[idx & 1]);
 }
 #endif
@@ -434,26 +433,45 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3
 float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
     uint idx = coordInBlock[1];
-    uint lsb = idx & 1;
-    idx /= 2;

-    const uint iqs = (idx % 128) / 2; // 0..63
-    const uint iqh = iqs / 8;
+    const uint iqs = (idx & 0xFC) >> 2; // 0..63
+    const uint iqh = (idx & 0xE0) >> 5;

     const float d = float(bl.block.d);
     const uint qs = bl.block.qs[iqs];
     const uint qh = bl.block.qh[iqh];
-    const int8_t sign = int8_t(bl.block.signs[iqs / 2] >> (2 * (idx % 4)));
+    const int8_t sign = int8_t(bl.block.signs[iqs / 2] >> (idx & 0x6));
     const uint scale = bl.block.scales[iqs / 16];
-    const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
+    const ivec2 sign01 = ivec2(1 - (2 & ivec2(sign << 1, sign)));
     const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
-    const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
+    const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> ((idx & 2) << 3);
     const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
-    return float16_t(v[lsb]);
+    return float16_t(v[idx & 1]);
 }
 #endif

+#if defined(DATA_A_IQ4_XS)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_XS {
+    block_iq4_xs block;
+};
+
+float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = (idx & 0xE0) >> 5; // 0..7
+
+    const uint sl = (bl.block.scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+    const uint sh = ((bl.block.scales_h) >> (2 * ib32)) & 3;
+    const uint qshift = (idx & 16) >> 2;
+    const uint q = (bl.block.qs[16 * ib32 + (idx % 16)] >> qshift) & 0xF;
+
+    float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
+    return ret;
+}
+#endif
+
 #if defined(DATA_A_IQ4_NL)
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
@@ -504,6 +522,8 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
 #define dequantFuncA dequantFuncIQ3_XXS
 #elif defined(DATA_A_IQ3_S)
 #define dequantFuncA dequantFuncIQ3_S
+#elif defined(DATA_A_IQ4_XS)
+#define dequantFuncA dequantFuncIQ4_XS
 #elif defined(DATA_A_IQ4_NL)
 #define dequantFuncA dequantFuncIQ4_NL
 #endif
View file
@@ -0,0 +1,34 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq4_xs data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 subblock (1 scale and 32 quantized values)
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint ib32 = gl_LocalInvocationID.x % 8;
+
+    const float d = float(data_a[ib].d);
+    // Scales are 6 bits
+    const uint scale = ((data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF)
+                     | (((data_a[ib].scales_h >> (2 * ib32)) & 3) << 4);
+    const float dl = d * (int(scale) - 32);
+
+    const uint b_idx = 256 * ib + 32 * ib32;
+    const uint q_idx = 16 * ib32;
+
+    [[unroll]] for (uint l = 0; l < 16; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(dl * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
+        data_b[b_idx + l + 16] = D_TYPE(dl * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >>  4]);
+    }
+}
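The index math in main() maps the 256-thread workgroup onto 32 blocks of 8 subblocks each. A plain C++ check of that mapping (illustrative only, no GPU state involved):

    #include <cstdio>

    int main() {
        const unsigned workgroup = 0;            // gl_WorkGroupID.x
        for (unsigned t = 0; t < 256; ++t) {     // gl_LocalInvocationID.x
            const unsigned ib    = workgroup * 32 + t / 8; // block handled by this thread
            const unsigned ib32  = t % 8;                  // subblock (one 6-bit scale)
            const unsigned b_idx = 256 * ib + 32 * ib32;   // first of 32 output values
            if (t % 64 == 0) {
                printf("thread %3u -> block %2u, subblock %u, out %5u\n", t, ib, ib32, b_idx);
            }
        }
        return 0;
    }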
View file
@@ -104,7 +104,7 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
 #endif

 void main() {
-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
 #endif
View file
@@ -12,7 +12,7 @@ void main() {
     const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
     const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;

-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
 #endif
View file
@@ -133,7 +133,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
 void main() {
     const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);

-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
 #endif
View file
@@ -95,7 +95,7 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
 #endif

 void main() {
-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
 #endif
@@ -547,6 +547,25 @@ void main() {
             const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
             const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);

+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ4_XS)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128; // 2 values per idx
+            const uint ib32 = (idx % 128) / 16; // 0..7
+            const uint iq = 16 * ib32 + 2 * (idx % 8);
+
+            const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+            const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
+            const uint qshift = (idx & 8) >> 1;
+            u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
+            qs = (qs >> qshift) & uint8_t(0xF);
+
+            const float d = float(data_a[ib].d);
+            const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
+
             buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
             buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
 #elif defined(DATA_A_IQ4_NL)
View file
@@ -106,7 +106,7 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
 #endif

 void main() {
-#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
     init_iq_shmem(gl_WorkGroupSize);
 #endif
View file
@@ -1026,6 +1026,23 @@ void init_iq_shmem(uvec3 wgsize)
 #define A_TYPE_PACKED16 block_iq3_s_packed16
 #endif

+#define QUANT_K_IQ4_XS 256
+#define QUANT_R_IQ4_XS 1
+
+struct block_iq4_xs
+{
+    float16_t d;
+    uint16_t scales_h;
+    uint8_t scales_l[QUANT_K_IQ4_XS/64];
+    uint8_t qs[QUANT_K_IQ4_XS/2];
+};
+
+#if defined(DATA_A_IQ4_XS)
+#define QUANT_K QUANT_K_IQ4_XS
+#define QUANT_R QUANT_R_IQ4_XS
+#define A_TYPE block_iq4_xs
+#endif
+
 #define QUANT_K_IQ4_NL 32
 #define QUANT_R_IQ4_NL 2
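For reference, a host-side mirror of the GLSL struct just added; the sizes follow directly from the declaration (2 + 2 + 4 + 128 bytes per 256 values, i.e. about 4.25 bits per weight). ggml_half here is just fp16 storage:

    #include <cstdint>

    typedef uint16_t ggml_half;        // raw fp16 bits
    #define QK_K 256

    struct block_iq4_xs {
        ggml_half d;                   // per-block scale
        uint16_t  scales_h;            // 2 high scale bits x 8 subblocks
        uint8_t   scales_l[QK_K / 64]; // 4 low scale bits x 8 subblocks, two per byte
        uint8_t   qs[QK_K / 2];        // 256 4-bit codebook indices
    };

    static_assert(sizeof(block_iq4_xs) == 136, "must match the shader-side layout");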
@@ -1042,7 +1059,13 @@ struct block_iq4_nl_packed16
 };

 #if defined(DATA_A_IQ4_NL)
+#define QUANT_K QUANT_K_IQ4_NL
+#define QUANT_R QUANT_R_IQ4_NL
+#define A_TYPE block_iq4_nl
+#define A_TYPE_PACKED16 block_iq4_nl_packed16
+#endif
+
+#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
 const int8_t kvalues_iq4nl_const[16] = {
     int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
     int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
@@ -1058,11 +1081,6 @@ void init_iq_shmem(uvec3 wgsize)
     }
     barrier();
 }

-#define QUANT_K QUANT_K_IQ4_NL
-#define QUANT_R QUANT_R_IQ4_NL
-#define A_TYPE block_iq4_nl
-#define A_TYPE_PACKED16 block_iq4_nl_packed16
 #endif

 #endif // !defined(GGML_TYPES_COMP)
View file
@@ -60,6 +60,7 @@ const std::vector<std::string> type_names = {
     "iq2_s",
     "iq3_xxs",
     "iq3_s",
+    "iq4_xs",
     "iq4_nl"
 };
View file
@@ -1114,11 +1114,12 @@ extern "C" {
     };

     struct llama_sampler {
-        struct llama_sampler_i * iface;
+        const struct llama_sampler_i * iface;
         llama_sampler_context_t ctx;
     };

     // mirror of llama_sampler_i:
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
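With llama_sampler_init exported and iface made const, third-party samplers no longer construct llama_sampler themselves. A hypothetical pass-through sampler as a usage sketch (callback order follows llama_sampler_i as used elsewhere in this diff; that unused callbacks may be left null is an assumption here):

    #include "llama.h"

    static const char * noop_name(const struct llama_sampler * /*smpl*/) { return "noop"; }

    static void noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {
        // intentionally leaves the candidate list untouched
    }

    static struct llama_sampler_i noop_iface = {
        /* .name   = */ noop_name,
        /* .accept = */ nullptr,
        /* .apply  = */ noop_apply,
        /* .reset  = */ nullptr,
        /* .clone  = */ nullptr,
        /* .free   = */ nullptr,
    };

    struct llama_sampler * make_noop_sampler(void) {
        return llama_sampler_init(&noop_iface, /* ctx = */ nullptr);
    }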
View file
@@ -1 +1 @@
-498e0ecd2c4f9379439fd413805af10e8e9ff349
+08b538031f7f944e84f472483ef5d26bf5190ead
View file
@@ -1275,6 +1275,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const bool use_mmap_buffer = true;

+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
     for (auto * dev : devices) {
View file
@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
 // llama_sampler API

+struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
+    return new llama_sampler {
+        /* .iface = */ iface,
+        /* .ctx   = */ ctx,
+    };
+}
+
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
     if (!smpl->iface) {
         return "(null)";
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
     }

     if (smpl->ctx == nullptr) {
-        return new llama_sampler {
+        return llama_sampler_init(
             /* .iface = */ smpl->iface,
-            /* .ctx = */ nullptr,
-        };
+            /* .ctx = */ nullptr
+        );
     }

     GGML_ABORT("the sampler does not support cloning");
@@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
 };

 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_chain_i,
         /* .ctx = */ new llama_sampler_chain {
             /* .params = */ params,
             /* .samplers = */ {},
             /* .t_sample_us = */ 0,
             /* .n_sample = */ 0,
-        },
-    };
+        }
+    );
 }

 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
 };

 struct llama_sampler * llama_sampler_init_greedy() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_greedy_i,
-        /* .ctx = */ nullptr,
-    };
+        /* .ctx = */ nullptr
+    );
 }

 // dist
@@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
 };

 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx = */ new llama_sampler_dist {
             /* .seed = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // softmax
@@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
 };

 struct llama_sampler * llama_sampler_init_softmax() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx = */ nullptr,
-    };
+        /* .ctx = */ nullptr
+    );
 }

 // top-k
@@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
 };

 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_k_i,
         /* .ctx = */ new llama_sampler_top_k {
             /* .k = */ k,
-        },
-    };
+        }
+    );
 }

 // top-p
@@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
 };

 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_p_i,
         /* .ctx = */ new llama_sampler_top_p {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // min-p
@@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
 };

 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_min_p_i,
         /* .ctx = */ new llama_sampler_min_p {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // typical
@@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
 };

 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_typical_i,
         /* .ctx = */ new llama_sampler_typical {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // temp
@@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
 };

 struct llama_sampler * llama_sampler_init_temp(float temp) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_i,
         /* .ctx = */ new llama_sampler_temp {
             /*.temp = */ temp,
-        },
-    };
+        }
+    );
 }

 // temp-ext
@@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
 };

 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx = */ new llama_sampler_temp_ext {
             /* .temp = */ temp,
             /* .delta = */ delta,
             /* .exponent = */ exponent,
-        },
-    };
+        }
+    );
 }

 // xtc
@@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {

 struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_xtc_i,
         /* .ctx = */ new llama_sampler_xtc {
             /* .probability = */ p,
@@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
             /* .seed = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // mirostat
@@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {

 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_i,
         /* .ctx = */ new llama_sampler_mirostat {
             /* .n_vocab = */ n_vocab,
@@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
             /* .m = */ m,
             /* .mu = */ 2.0f*tau,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // mirostat v2
@@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {

 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_v2_i,
         /* .ctx = */ new llama_sampler_mirostat_v2 {
             /* .seed = */ seed,
@@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
             /* .eta = */ eta,
             /* .mu = */ 2.0f*tau,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // grammar
@@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         };
     }

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_grammar_i,
-        /* .ctx = */ ctx,
-    };
+        /* .ctx = */ ctx
+    );
 }

 struct llama_sampler * llama_sampler_init_grammar(
@@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
         float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx = */ new llama_sampler_penalties {
             /* .penalty_last_n = */ penalty_last_n,
@@ -1687,8 +1694,8 @@ struct llama_sampler * llama_sampler_init_penalties(
             /* .penalty_present = */ penalty_present,
             /* .prev = */ ring_buffer<llama_token>(penalty_last_n),
             /* .token_count = */ {},
-        },
-    };
+        }
+    );
 }

 // DRY
@@ -2041,7 +2048,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
         }
     }

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx = */ new llama_sampler_dry {
             /* .total_context_size = */ context_size,
@@ -2053,8 +2060,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
             /* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
             /* .dry_max_token_repeat = */ {},
             /* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
-        },
-    };
+        }
+    );
 }

 // wrapper for test-sampling.cpp
@@ -2155,14 +2162,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
         int32_t n_vocab,
         int32_t n_logit_bias,
         const llama_logit_bias * logit_bias) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_logit_bias_i,
         /* .ctx = */ new llama_sampler_logit_bias {
             /* .n_vocab = */ n_vocab,
             /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
             /* .to_search = */ {},
-        },
-    };
+        }
+    );
 }

 // infill
@@ -2377,14 +2384,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
 };

 struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx = */ new llama_sampler_infill {
             /* .vocab = */ vocab,
             /* .buf0 = */ std::vector<char>(512),
             /* .buf1 = */ std::vector<char>(512),
-        },
-    };
+        }
+    );
 }

 // utils
View file
@@ -4610,7 +4610,8 @@ struct llm_build_context {
                         ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                 cb(k_pe, "k_pe", il);

-                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+                kv_compressed = ggml_cont(ctx0, kv_compressed);
                 kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                         model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, cb, il);
@@ -6464,7 +6465,8 @@ struct llm_build_context {
                         ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                 cb(k_pe, "k_pe", il);

-                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+                kv_compressed = ggml_cont(ctx0, kv_compressed);
                 kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                         model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, cb, il);
@@ -7215,7 +7217,7 @@ struct llm_build_context {
                 struct ggml_tensor * Qcur = nullptr;
                 struct ggml_tensor * Kcur = nullptr;
                 struct ggml_tensor * Vcur = nullptr;
-                if (model.type == LLM_TYPE_1_5B || model.type == LLM_TYPE_4B || model.type == LLM_TYPE_9B) {
+                if (model.layers[il].wqkv == nullptr) {
                     Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                     if (model.layers[il].bq) {
                         Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@ -8799,12 +8801,14 @@ static int llama_decode_impl(
//llama_synchronize(&lctx); //llama_synchronize(&lctx);
// decide if we need to defrag the kv cache // decide if we need to defrag the kv cache
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; // - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
// queue defragmentation for next llama_kv_cache_update // queue defragmentation for next llama_kv_cache_update
if (fragmentation > cparams.defrag_thold) { if (fragmentation > cparams.defrag_thold) {
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
llama_kv_cache_defrag(kv_self); llama_kv_cache_defrag(kv_self);
} }
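The rewritten trigger raises the minimum context for defragmentation from 128 to 2048 cells, counts the KV-cache padding towards the used cells, and clamps the result at zero. A worked example with hypothetical numbers (n = 4096, used = 3000, padding = 32, threshold 0.1): fragmentation = 1 - 3032/4096 ≈ 0.26, which exceeds 0.1, so a defrag is queued; a 1024-cell context now never defragments. As a standalone sketch with those assumed values:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int   kv_n    = 4096;  // cells in the KV cache (assumed)
        const int   kv_used = 3000;  // cells in use (assumed)
        const int   padding = 32;    // stand-in for llama_kv_cache_get_padding()
        const float thold   = 0.1f;  // cparams.defrag_thold (assumed)

        // contexts under 2048 cells are never defragmented
        const float fragmentation = kv_n >= 2048
            ? std::max(0.0f, 1.0f - float(kv_used + padding)/float(kv_n))
            : 0.0f;

        if (fragmentation > thold) {
            // here: 1 - 3032/4096 ~= 0.26 > 0.1, so a defrag would be queued
            printf("fragmentation: %.2f - requesting defrag\n", fragmentation);
        }
        return 0;
    }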
@ -9426,8 +9430,6 @@ static struct llama_model * llama_model_load_from_file_impl(
struct llama_model_params params) { struct llama_model_params params) {
ggml_time_init(); ggml_time_init();
llama_model * model = new llama_model(params);
unsigned cur_percentage = 0; unsigned cur_percentage = 0;
if (params.progress_callback == NULL) { if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage; params.progress_callback_user_data = &cur_percentage;
@ -9445,6 +9447,8 @@ static struct llama_model * llama_model_load_from_file_impl(
}; };
} }
llama_model * model = new llama_model(params);
// create list of devices to use with this model // create list of devices to use with this model
if (params.devices) { if (params.devices) {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
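The hunk pair above is an ordering fix: the llama_model is now allocated after the default progress callback has been installed into params, so the copy of params taken at construction time already contains the callback. A reduced sketch of the hazard, with hypothetical names standing in for llama_model_params and llama_model:

    #include <cstdio>

    struct Params {
        void (*progress_callback)(float) = nullptr;
    };

    struct Model {
        Params params;
        explicit Model(const Params & p) : params(p) {} // copies the callback
    };

    static Model * load(Params params) {
        if (params.progress_callback == nullptr) {
            params.progress_callback = [](float progress) {
                printf("progress: %.0f%%\n", progress * 100);
            };
        }
        // constructing the model *after* installing the default means the copy
        // held by the model is the real callback, never a stale nullptr
        Model * model = new Model(params);
        model->params.progress_callback(0.5f); // e.g. invoked during weight loading
        return model;
    }

    int main() {
        delete load(Params{});
        return 0;
    }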

View file

@ -618,8 +618,15 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.reserve(utf8.size()); result.reserve(utf8.size());
size_t offset = 0; size_t offset = 0;
while (offset < utf8.size()) { while (offset < utf8.size()) {
try {
result.push_back(unicode_cpt_from_utf8(utf8, offset)); result.push_back(unicode_cpt_from_utf8(utf8, offset));
} }
catch (const std::invalid_argument & /*ex*/) {
// Substitute U+FFFD for invalid UTF-8 input so the exception does not leak beyond llama_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}
}
return result; return result;
} }
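The effect of the new catch branch, demonstrated on a hypothetical input containing the byte 0xC0 (never valid in UTF-8). The decoder below is a deliberately simplified ASCII-only stand-in for the real unicode_cpt_from_utf8, kept only to show the fallback path:

    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // simplified stand-in: advances offset or throws, like the real decoder
    static uint32_t cpt_from_utf8(const std::string & utf8, size_t & offset) {
        const uint8_t c = (uint8_t) utf8[offset];
        if (c < 0x80) { ++offset; return c; }
        throw std::invalid_argument("invalid UTF-8");
    }

    int main() {
        const std::string input = "a\xC0" "b"; // 0xC0 is never a valid UTF-8 byte
        std::vector<uint32_t> result;
        size_t offset = 0;
        while (offset < input.size()) {
            try {
                result.push_back(cpt_from_utf8(input, offset));
            } catch (const std::invalid_argument &) {
                ++offset;                  // skip the offending byte
                result.push_back(0xFFFD);  // and emit the replacement character
            }
        }
        for (uint32_t cpt : result) printf("U+%04X ", cpt); // U+0061 U+FFFD U+0062
        printf("\n");
        return 0;
    }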

View file

@ -1674,21 +1674,28 @@ struct test_silu_back : public test_case {
struct test_norm : public test_case { struct test_norm : public test_case {
const ggml_type type; const ggml_type type;
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne;
float eps; const bool v; // whether a is a non-contiguous view
const float eps;
std::string vars() override { std::string vars() override {
return VARS_TO_STR3(type, ne, eps); return VARS_TO_STR4(type, ne, v, eps);
} }
test_norm(ggml_type type = GGML_TYPE_F32, test_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 5, 4, 3}, std::array<int64_t, 4> ne = {64, 5, 4, 3},
bool v = false,
float eps = 1e-6f) float eps = 1e-6f)
: type(type), ne(ne), eps(eps) {} : type(type), ne(ne), v(v), eps(eps) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a"); ggml_set_name(a, "a");
if (v) {
a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
ggml_set_name(a, "view of a");
}
ggml_tensor * out = ggml_norm(ctx, a, eps); ggml_tensor * out = ggml_norm(ctx, a, eps);
ggml_set_name(out, "out"); ggml_set_name(out, "out");
@ -1700,22 +1707,29 @@ struct test_norm : public test_case {
struct test_rms_norm : public test_case { struct test_rms_norm : public test_case {
const ggml_type type; const ggml_type type;
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne;
float eps; const bool v; // whether a is a non-contiguous view
const float eps;
std::string vars() override { std::string vars() override {
return VARS_TO_STR3(type, ne, eps); return VARS_TO_STR4(type, ne, v, eps);
} }
test_rms_norm(ggml_type type = GGML_TYPE_F32, test_rms_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 5, 4, 3}, std::array<int64_t, 4> ne = {64, 5, 4, 3},
bool v = false,
float eps = 1e-6f) float eps = 1e-6f)
: type(type), ne(ne), eps(eps) {} : type(type), ne(ne), v(v), eps(eps) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_param(ctx, a); ggml_set_param(ctx, a);
ggml_set_name(a, "a"); ggml_set_name(a, "a");
if (v) {
a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
ggml_set_name(a, "view of a");
}
ggml_tensor * out = ggml_rms_norm(ctx, a, eps); ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
ggml_set_name(out, "out"); ggml_set_name(out, "out");
@ -1741,7 +1755,7 @@ struct test_rms_norm : public test_case {
struct test_rms_norm_back : public test_case { struct test_rms_norm_back : public test_case {
const ggml_type type; const ggml_type type;
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne;
float eps; const float eps;
std::string vars() override { std::string vars() override {
return VARS_TO_STR3(type, ne, eps); return VARS_TO_STR3(type, ne, eps);
@ -2919,7 +2933,7 @@ struct test_group_norm : public test_case {
const float eps; const float eps;
std::string vars() override { std::string vars() override {
return VARS_TO_STR3(type, ne, num_groups); return VARS_TO_STR4(type, ne, num_groups, eps);
} }
test_group_norm(ggml_type type = GGML_TYPE_F32, test_group_norm(ggml_type type = GGML_TYPE_F32,
@ -3964,9 +3978,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_scale()); test_cases.emplace_back(new test_scale());
test_cases.emplace_back(new test_silu_back()); test_cases.emplace_back(new test_silu_back());
for (float eps : {0.0f, 1e-7f, 1e-4f, 1e-1f}) { for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
test_cases.emplace_back(new test_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); for (bool v : {false, true}) {
test_cases.emplace_back(new test_rms_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); test_cases.emplace_back(new test_norm (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
}
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
} }
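Two things change in this test file: eps = 1e-7f is replaced by 1e-6f in the sweep, and every norm/rms_norm case now also runs on a non-contiguous view, matching the ggml_cont TODOs earlier in the diff. The view is non-contiguous because it halves every extent while keeping the parent's strides, leaving gaps between rows. A standalone sketch, assuming the usual ggml context setup:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = {
            /* .mem_size   = */ 16*1024*1024,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        int64_t ne[4] = {64, 5, 4, 3};
        struct ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

        // half of every extent, but the parent's nb[] strides: rows of the
        // view are 32 floats wide yet start 64 floats apart -> not contiguous
        struct ggml_tensor * v = ggml_view_4d(ctx, a,
            a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2,
            a->nb[1],   a->nb[2],   a->nb[3],   0);

        printf("parent contiguous: %d, view contiguous: %d\n",
               (int) ggml_is_contiguous(a), (int) ggml_is_contiguous(v));

        ggml_free(ctx);
        return 0;
    }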

View file

@ -18,12 +18,8 @@
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
static common_chat_msg msg_from_json(const json & message) { static common_chat_msg msg_from_json(const json & message) {
common_chat_msg ret{ common_chat_msg ret;
"assistant", ret.role = "assistant";
"",
{},
/* .tool_plan = */ "",
};
if (message.contains("content") && !message.at("content").is_null()) { if (message.contains("content") && !message.at("content").is_null()) {
ret.content = message.at("content"); ret.content = message.at("content");
} }
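The msg_from_json change above replaces positional aggregate initialization with default construction plus named assignment. The motivation, presumably, is robustness: a positional initializer list silently changes meaning whenever fields are added to or reordered in common_chat_msg (note the /* .tool_plan = */ crutch it needed), while named assignment keeps compiling correctly. A reduced illustration with a hypothetical struct:

    #include <string>
    #include <vector>

    // hypothetical stand-in for common_chat_msg
    struct chat_msg {
        std::string role;
        std::string content;
        std::vector<std::string> tool_calls;
        std::string tool_plan; // a later addition: positional init must now pass 4 values
    };

    static chat_msg make_msg() {
        // brittle alternative: chat_msg{"assistant", "", {}, ""} breaks or
        // shifts meaning the moment a field is inserted before tool_plan
        chat_msg ret;            // members are value-initialized
        ret.role = "assistant";  // set only what this call site cares about
        return ret;
    }

    int main() {
        return make_msg().role == "assistant" ? 0 : 1;
    }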
@ -289,7 +285,7 @@ static void test_template(const common_chat_template & tmpl, const std::vector<s
static void test_template_output_parsers() { static void test_template_output_parsers() {
json text_message { json text_message {
{ "role", "assistant" }, { "role", "assistant" },
{ "content", "Hello, world!" }, { "content", "Hello, world!\nWhat's up?" },
}; };
json tool_calls = json::array({{ json tool_calls = json::array({{
{ "type", "function" }, { "type", "function" },
@ -379,7 +375,7 @@ static void test_template_output_parsers() {
common_chat_inputs inputs_no_tools; common_chat_inputs inputs_no_tools;
inputs_no_tools.messages = { inputs_no_tools.messages = {
{ { "role", "user" }, { "content", "Hey" } } { { "role", "user" }, { "content", "Hey\nThere" } }
}; };
common_chat_inputs inputs_tools = inputs_no_tools; common_chat_inputs inputs_tools = inputs_no_tools;
@ -408,7 +404,8 @@ static void test_template_output_parsers() {
" {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
"]<|END_ACTION|>"); "]<|END_ACTION|>");
test_template(tmpl, end_tokens, text_message, tools, test_template(tmpl, end_tokens, text_message, tools,
"<|START_RESPONSE|>Hello, world!<|END_RESPONSE|>", "<|START_RESPONSE|>Hello, world!\n"
"What's up?<|END_RESPONSE|>",
/* expect_grammar_triggered= */ false); /* expect_grammar_triggered= */ false);
} }
{ {
@ -428,7 +425,7 @@ static void test_template_output_parsers() {
assert_msg_equals(msg_from_json(text_message), assert_msg_equals(msg_from_json(text_message),
common_chat_parse("{\n" common_chat_parse("{\n"
" \"response\": \"Hello, world!\"\n" " \"response\": \"Hello, world!\\nWhat's up?\"\n"
"}", "}",
common_chat_params_init(tmpl, inputs_tools).format)); common_chat_params_init(tmpl, inputs_tools).format));
test_template(tmpl, end_tokens, tool_call_message_with_id, tools, test_template(tmpl, end_tokens, tool_call_message_with_id, tools,
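A side note on the string in this hunk: the C++ literal contains \\n, i.e. a backslash followed by n, which is a JSON escape rather than a newline; the parser decodes it back into the real newline the expected message contains. A small check of those two escaping layers, reusing the json alias from the top of the file:

    #include <nlohmann/json.hpp>
    #include <cassert>
    #include <string>

    using json = nlohmann::ordered_json;

    int main() {
        // "\\n" in the C++ source is the two characters '\' 'n' -> a JSON escape
        std::string payload = "{\n  \"response\": \"Hello, world!\\nWhat's up?\"\n}";
        json j = json::parse(payload);
        // after JSON decoding, the value holds an actual newline character
        assert(j.at("response") == "Hello, world!\nWhat's up?");
        return 0;
    }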
@ -451,7 +448,7 @@ static void test_template_output_parsers() {
assert_equals(COMMON_CHAT_FORMAT_MISTRAL_NEMO, common_chat_params_init(tmpl, inputs_tools).format); assert_equals(COMMON_CHAT_FORMAT_MISTRAL_NEMO, common_chat_params_init(tmpl, inputs_tools).format);
test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_template( test_template(
tmpl, end_tokens, tool_call_message_with_id, tools, tmpl, end_tokens, tool_call_message_with_id, tools,
"[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]"); "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]");
@ -476,7 +473,7 @@ static void test_template_output_parsers() {
inputs_tools) inputs_tools)
.format); .format);
test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_template(tmpl, end_tokens, tool_call_message, tools, test_template(tmpl, end_tokens, tool_call_message, tools,
"<tool_call>\n" "<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
@ -516,7 +513,7 @@ static void test_template_output_parsers() {
assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_params_init(tmpl, inputs_tools).format); assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_params_init(tmpl, inputs_tools).format);
test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_template(tmpl, end_tokens, tool_call_message, tools, test_template(tmpl, end_tokens, tool_call_message, tools,
"{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
} }
@ -528,7 +525,7 @@ static void test_template_output_parsers() {
assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
common_chat_params_init(tmpl, inputs_tools).format); common_chat_params_init(tmpl, inputs_tools).format);
test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_template(tmpl, end_tokens, tool_call_message, tools, test_template(tmpl, end_tokens, tool_call_message, tools,
"<function=special_function>{\"arg1\": 1}</function>"); "<function=special_function>{\"arg1\": 1}</function>");
} }
@ -542,7 +539,8 @@ static void test_template_output_parsers() {
test_template(tmpl, end_tokens, text_message, {}, test_template(tmpl, end_tokens, text_message, {},
"all\n" "all\n"
"Hello, world!", "Hello, world!\n"
"What's up?",
/* expect_grammar_triggered= */ false); /* expect_grammar_triggered= */ false);
test_template(tmpl, end_tokens, tool_call_message, tools, test_template(tmpl, end_tokens, tool_call_message, tools,
"special_function\n" "special_function\n"
@ -555,7 +553,7 @@ static void test_template_output_parsers() {
assert_equals(COMMON_CHAT_FORMAT_FIREFUNCTION_V2, common_chat_params_init(tmpl, inputs_tools).format); assert_equals(COMMON_CHAT_FORMAT_FIREFUNCTION_V2, common_chat_params_init(tmpl, inputs_tools).format);
test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_template(tmpl, end_tokens, tool_call_message, tools, test_template(tmpl, end_tokens, tool_call_message, tools,
" functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]"); " functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]");
} }
@ -566,7 +564,7 @@ static void test_template_output_parsers() {
assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format); assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format);
test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
test_template(tmpl, end_tokens, tool_call_message, tools, test_template(tmpl, end_tokens, tool_call_message, tools,
"<tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>special_function\n" "<tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>special_function\n"
"```json\n" "```json\n"