Merge branch 'ggerganov:master' into token

commit c47c41cd35

124 changed files with 2563 additions and 1611 deletions
.clang-tidy

@@ -17,8 +17,10 @@ Checks: >
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
+    -misc-use-anonymous-namespace,
 FormatStyle: none
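In .clang-tidy, Checks is a single folded string of comma-separated globs, and a leading - disables whatever the glob matches; this change therefore keeps the portability-* and misc-* groups enabled while newly opting out of portability-simd-intrinsics and misc-use-anonymous-namespace. A sketch of how the affected tail of the file plausibly reads after the merge; the entries come from the hunk above, while the indentation and the elided earlier entries are assumed:

Checks: >
    ...,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
    portability-*,
    -portability-simd-intrinsics,
    misc-*,
    -misc-const-correctness,
    -misc-non-private-member-variables-in-classes,
    -misc-no-recursion,
    -misc-use-anonymous-namespace,
FormatStyle: none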

.github/workflows/build.yml (vendored, 9 changes)
@@ -904,6 +904,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: Install Cuda Toolkit 11.7
         if: ${{ matrix.cuda == '11.7' }}
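fetch-depth: 0 tells actions/checkout to fetch the entire history (all branches and tags) rather than its default depth-1 shallow clone; this is typically wanted when a later step derives a version or build number from git history. A minimal sketch of the step in isolation, assuming nothing beyond what the hunk above shows:

      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0   # 0 = full history; the checkout default is a shallow clone of depth 1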
@@ -1119,6 +1121,11 @@ jobs:
         run: |
           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ github.job }}
+
       - name: Build
         id: cmake_build
         run: |
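hendrikmuhs/ccache-action installs ccache and saves and restores its compiler cache between workflow runs; keying the cache on ${{ github.job }} gives each job its own cache instead of sharing one across the workflow. A minimal self-contained sketch of a job wired up the same way; the workflow name, trigger, job id, runner, and build command are illustrative assumptions, and only the checkout and ccache steps mirror this commit:

name: example-ci              # hypothetical workflow name
on: push                      # assumed trigger
jobs:
  example-build:              # hypothetical job id
    runs-on: windows-latest   # assumed runner
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          fetch-depth: 0            # full history, as in this commit's checkout steps

      - name: Install ccache
        uses: hendrikmuhs/ccache-action@v1.2
        with:
          key: ${{ github.job }}    # one ccache cache per job

      - name: Build
        id: cmake_build
        run: |
          echo "configure and build here; ccache is on PATH for compiler wrapping"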
@@ -1139,6 +1146,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: Install
         id: depends

AUTHORS (186 changes)
@@ -1,4 +1,4 @@
-# date: Wed Jun 26 19:36:34 EEST 2024
+# date: Thu Nov 28 20:46:15 EET 2024
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
@@ -7,6 +7,7 @@
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
+65a <10104049+65a@users.noreply.github.com>
 AN Long <aisk@users.noreply.github.com>
 AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
@@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshan.biswas@gmail.com>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Al Mochkin <14274697+amochkin@users.noreply.github.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
+Alberto Cabrera Pérez <alberto.cabrera@intel.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
+Alex O'Connell <35843486+acon96@users.noreply.github.com>
 Alex Petenchea <alex.petenchea@gmail.com>
 Alex Renda <alexrenda@users.noreply.github.com>
+Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
 Alex von Gluck IV <kallisti5@unixzen.com>
 Alexey Parfenov <zxed@alkatrazstudio.net>
 Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
+Andreas (Andi) Kunar <andreask@msn.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
+Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
+Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+Antonis Makropoulos <benuix@gmail.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
+Armen Kaleshian <kriation@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
@@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bert Wagner <github@bertwagner.com>
 Bingan <70050083+binganao@users.noreply.github.com>
+Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
+Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
+Brian Cunnie <brian.cunnie@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
 Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
@@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
 Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
+CarryFun <76023481+CarryFun@users.noreply.github.com>
+Carsten Kragelund Jørgensen <carsten@kragelund.me>
+CarterLi999 <664681047@qq.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
+Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
+Charles Xu <63788048+chaxu01@users.noreply.github.com>
+Charles Xu <charles.xu@arm.com>
+Chen Xi <xi2.chen@intel.com>
+Chen Xi <xixichen08@foxmail.com>
 Cheng Shao <terrorjack@type.dance>
+Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
+Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+Conrad Kramer <conrad@conradkramer.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
+Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
+Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
+Dan Johansson <dan.johansson@arm.com>
 Dane Madsen <dane_madsen@hotmail.com>
 DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
 Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+DavidKorczynski <david@adalogics.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
+Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
+Derrick T. Woolworth <dwoolworth@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
+Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
+Diego Devesa <slarengh@gmail.com>
+Diogo Teles Sant'Anna <diogoteles@google.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Dou Xinpeng <15529241576@163.com>
+Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
 Douglas Hanley <thesecretaryofwar@gmail.com>
 Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
+Echo Nolan <echo@echonolan.net>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
@@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
+Eric Curtin <ecurtin@redhat.com>
+Eric Curtin <ericcurtin17@gmail.com>
 Eric Sommerlade <es0m@users.noreply.github.com>
 Eric Zhang <34133756+EZForever@users.noreply.github.com>
 Erik Garrison <erik.garrison@gmail.com>
 Erik Scholz <Green-Sky@users.noreply.github.com>
+Esko Toivonen <eskot98@gmail.com>
 Ettore Di Giacinto <mudler@users.noreply.github.com>
 Evan Jones <evan.q.jones@gmail.com>
 Evan Miller <emmiller@gmail.com>
@@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
 Fabian <cmdrf@users.noreply.github.com>
 Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
 Faez Shakil <faez.shakil@gmail.com>
+Faisal Zaghloul <faisal.zaghloul@gmail.com>
+Faisal Zaghloul <quic_fzaghlou@quicinc.com>
+Fan Shupei <dymarkfan@outlook.com>
 FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
 Fattire <528174+fat-tire@users.noreply.github.com>
 Felix <stenbackfelix@gmail.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
+FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
+Frankie Robertson <frankier@users.noreply.github.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
+Gabe Goodhart <ghart@us.ibm.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
+Gilad S. <7817232+giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
@@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
+Huang Qi <huangqi3@xiaomi.com>
 Huawei Lin <huaweilin.cs@gmail.com>
 Hugo Roussel <hugo.rous@gmail.com>
+Huifeng Ou <79071290+ho2103@users.noreply.github.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
+Icecream95 <the.real.icecream95@gmail.com>
 Ido S <ido.pluto@gmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
@@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
 Isaac McFadyen <isaac@imcf.me>
 IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ivan <nekotekina@gmail.com>
+Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
+Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
@@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
 Jared Van Bortel <cebtenzzre@gmail.com>
 Jared Van Bortel <jared@nomic.ai>
 Jason McCartney <jmac@theroot.org>
+Jason Stillerman <jason.t.stillerman@gmail.com>
 Jean-Christophe Hoelt <hoelt@fovea.cc>
 Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
+Jeff Bolz <jbolz@nvidia.com>
+Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jeroen Mostert <jeroen.mostert@cm.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
@@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
 Joan Fontanals <joan.fontanals.martinez@jina.ai>
+João Dinis Ferreira <hello@joaof.eu>
+Joe Eli McIlvain <joe.eli.mac@gmail.com>
+Joe Todd <joe.todd@codeplay.com>
 Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
@@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
+Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Junil Kim <logyourself@gmail.com>
 Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
@@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
 Kasumi <90275229+kasumi-1@users.noreply.github.com>
 Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
+Keke Han <hankeke303@163.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
 Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
+Kevin Wang <kevmo314@gmail.com>
 Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
@@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
 Linwei Wang <wanix1988@gmail.com>
+Liu Jia <109258120+Septa2112@users.noreply.github.com>
+Liu Jia <jia3.liu@intel.com>
 LoganDark <github@logandark.mozmail.com>
+Loïc Carrère <loic.carrere@gmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
+M-A <maruel@gmail.com>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+Ma Mingfei <mingfei.ma@intel.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
+Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
 Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
+Mark Zhuang <zhuangqiubin@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
 Markus Tavenrath <mtavenrath@users.noreply.github.com>
 Martin Delille <martin@delille.org>
@@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
+Mathijs Henquet <mathijs.henquet@gmail.com>
 Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
+Matt Stephenson <mstephenson6@users.noreply.github.com>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Matteo Mortari <matteo.mortari@gmail.com>
 Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
@@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
 Meng, Hengyu <hengyu.meng@intel.com>
+Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
+Michael Francis <edude03@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
@@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
+Michał Tuszyński <srgtuszy@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
+Minsoo Cheong <icycle0409@snu.ac.kr>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
+MistApproach <98988043+MistApproach@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
+Molly Sophia <mollysophia379@gmail.com>
+MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
 Nathan Epstein <nate2@umbc.edu>
+Natsu <chino@hotococoa.moe>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
 Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
+Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
+Nicholai Tukanov <nicholaitukanov@gmail.com>
+Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
+NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
+OSecret <135510162+OLSecret@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+PAB <pierreantoine.bannier@gmail.com>
+Pablo Duboue <pablo.duboue@gmail.com>
+Pascal Patry <ppatry@mtacitlabs.com>
 Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
+Pavel Zloi <github.com@drteam.rocks>
 Pavol Rusnak <pavol@rusnak.io>
+Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
 Pedro Cuenca <pedro@huggingface.co>
 Peter Sugihara <peter@campsh.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
@@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
 Phillip Kravtsov <phillip@kravtsov.net>
 Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
 Pierrick Hymbert <pierrick.hymbert@gmail.com>
+Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
+Plamen Minev <pacominev@gmail.com>
+Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
 Przemysław Pawełczyk <przemoc@gmail.com>
 Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
 Qingyou Meng <meng.qingyou@gmail.com>
 Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+R0CKSTAR <xiaodong.ye@mthreads.com>
+R0CKSTAR <yeahdongcn@gmail.com>
 RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
@@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
 Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
+Random Fly <renfei8@live.cn>
 Reinforce-II <fate@eastal.com>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
+Rich Dougherty <rich@rd.nz>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
 Rick G <26732651+TheFlipbook@users.noreply.github.com>
@@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
 Robyn <robyngraf@users.noreply.github.com>
 Roger Meier <r.meier@siemens.com>
 Roland <14355895+rbur0425@users.noreply.github.com>
+Romain Biessy <romain.biessy@codeplay.com>
 Romain D <90720+Artefact2@users.noreply.github.com>
 Romain Neutron <romain@neutron.io>
 Roman Parykin <donderom@gmail.com>
 Ron Evans <ron@hybridgroup.com>
 Ron Jailall <rojailal@gmail.com>
+Roni <sulpher@gmx.net>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
 Rowan Hart <rowanbhart@gmail.com>
+Ruchira Hasaranga <ruchira66@gmail.com>
+Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
+RunningLeon <maningsheng@sensetime.com>
+RunningLeon <mnsheng@yeah.net>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SRHMorris <69468379+SRHMorris@users.noreply.github.com>
+SXX <sxx1136965276@gmail.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
+Salvatore Mesoraca <s.mesoraca16@gmail.com>
 Sam Spilsbury <smspillaz@gmail.com>
 Sami Farin <3876865+Safari77@users.noreply.github.com>
 Samuel Maynard <samwmaynard@gmail.com>
@@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
 SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
+Sergio López <slp@redhat.com>
 Sergio López <slp@sinrega.org>
 Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shane A <shanea@allenai.org>
 Shangning Xu <32517059+xushangning@users.noreply.github.com>
+Shankar <gshankar.87@gmail.com>
+Shanshan Shen <467638484@qq.com>
 Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
 Shuichi Tsutsumi <shuichi0526@gmail.com>
+Shupei Fan <dymarkfan@outlook.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
 Sky Yan <skyan83@gmail.com>
 Slaren <2141330+slaren@users.noreply.github.com>
 Slava Primenko <primenko.s@gmail.com>
+Small Grass Forest <zixuanxcl@gmail.com>
 SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
 Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
@@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
 Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
+Steve Bonds <sbonds@gmail.com>
 Steve Grubb <ausearch.1@gmail.com>
 Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
+StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
 SuperUserNameMan <yoann@terminajones.com>
+Sutou Kouhei <kou@cozmixng.org>
 Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
 Taikono-Himazin <kazu@po.harenet.ne.jp>
 Tameem <113388789+AhmadTameem@users.noreply.github.com>
@@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
 Thérence <13496987+Royalphax@users.noreply.github.com>
 Thibault Terrasson <thibault.terrasson@gmail.com>
 Thomas Klausner <wiz@gatalith.at>
+Thorsten Sommer <SommerEngineering@users.noreply.github.com>
 Tim Miller <drasticactions@users.noreply.github.com>
+Tim Wang <overocean@gmail.com>
 Timmy Knight <r2d2fish@gmail.com>
 Timothy Cronin <40186632+4imothy@users.noreply.github.com>
 Ting Lou <ting.lou@gmail.com>
@@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tony Wasserka <4840017+neobrain@users.noreply.github.com>
 Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
+Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
+Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
 Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Vali Malinoiu <0x4139@gmail.com>
 Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
+Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
+Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
 Vladimir Malyutin <first-leon@yandex.ru>
 Vladimir Zorin <vladimir@deviant.guru>
+VoidIsVoid <343750470@qq.com>
 Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
 WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
 Weird Constructor <weirdconstructor@gmail.com>
@@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
 Xiao-Yong Jin <jinxiaoyong@gmail.com>
 XiaotaoChen <chenxiaotao1234@gmail.com>
 Xiaoyi Chen <cxychina@gmail.com>
+Xie Yanbo <xieyanbo@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
 Xuan Son Nguyen <thichthat@gmail.com>
+Yaiko <elyaiko@hotmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
 Yaroslav <yaroslav.yashin@me.com>
 Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
+Yoshi Suhara <y.suhara@gmail.com>
+Yoshi Suhara <ysuhara@nvidia.com>
+Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
 Yui <dev@sleepyyui.com>
+Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
 Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
 Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
 ZHAOKAI WANG <sanxianwei@163.com>
@@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
 Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
+Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
+Zhiyuan Li <lizhiyuan@uniartisan.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
 alwqx <kenan3015@gmail.com>
 amd-lalithnc <lalithnc@amd.com>
+amritahs-ibm <amritahs@linux.vnet.ibm.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
@@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
 arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
+ardfork <134447697+ardfork@users.noreply.github.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
 automaticcat <daogiatuank54@gmail.com>
+awatuna <23447591+awatuna@users.noreply.github.com>
+b4b4o <zwbao@foxmail.com>
 bandoti <141645996+bandoti@users.noreply.github.com>
 beiller <beiller@gmail.com>
 bhubbb <79117352+bhubbb@users.noreply.github.com>
 bmwl <brian.marshall@tolko.com>
 bobqianic <129547291+bobqianic@users.noreply.github.com>
+brucepro <git@brucepro.net>
 bryanSwk <93190252+bryanSwk@users.noreply.github.com>
 bsilvereagle <bsilvereagle@users.noreply.github.com>
 bssrdf <merlintiger@hotmail.com>
@@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
+daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
+daminho <37615795+daminho@users.noreply.github.com>
 david raistrick <keen99@users.noreply.github.com>
 ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+devojony <61173062+devojony@users.noreply.github.com>
+ditsuke <ditsuke@protonmail.com>
 divinity76 <divinity76@gmail.com>
 dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
@@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
 fairydreaming <166155368+fairydreaming@users.noreply.github.com>
+fengerhu1 <2748250768@qq.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
 goerch <jhr.walter@t-online.de>
 grahameth <96447521+grahameth@users.noreply.github.com>
+gtygo <gtydoit@gmail.com>
 gwjr <502526+gwjr@users.noreply.github.com>
 h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
+haopeng <657407891@qq.com>
+hipudding <huafengchun@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
 hopkins385 <98618192+hopkins385@users.noreply.github.com>
@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
|
||||||
hydai <z54981220@gmail.com>
|
hydai <z54981220@gmail.com>
|
||||||
iSma <ismail.senhaji@gmail.com>
|
iSma <ismail.senhaji@gmail.com>
|
||||||
iacore <74560659+iacore@users.noreply.github.com>
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
|
icppWorld <124377669+icppWorld@users.noreply.github.com>
|
||||||
igarnier <igarnier@protonmail.com>
|
igarnier <igarnier@protonmail.com>
|
||||||
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||||
iohub <rickyang.pro@gmail.com>
|
iohub <rickyang.pro@gmail.com>
|
||||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||||
jameswu2014 <545426914@qq.com>
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jdomke <28772296+jdomke@users.noreply.github.com>
|
||||||
jiez <373447296@qq.com>
|
jiez <373447296@qq.com>
|
||||||
jneem <joeneeman@gmail.com>
|
jneem <joeneeman@gmail.com>
|
||||||
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||||
|
@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
|
||||||
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||||
kunnis <kunnis@users.noreply.github.com>
|
kunnis <kunnis@users.noreply.github.com>
|
||||||
kuronekosaiko <EvanChanJ@163.com>
|
kuronekosaiko <EvanChanJ@163.com>
|
||||||
|
kustaaya <58045274+kustaaya@users.noreply.github.com>
|
||||||
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||||
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||||
l3utterfly <gc.pthzfoldr@gmail.com>
|
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||||
|
laik <laik.lj@me.com>
|
||||||
ldwang <ftgreat@163.com>
|
ldwang <ftgreat@163.com>
|
||||||
le.chang <cljs118@126.com>
|
le.chang <cljs118@126.com>
|
||||||
leejet <leejet714@gmail.com>
|
leejet <leejet714@gmail.com>
|
||||||
|
leo-pony <nengjunma@outlook.com>
|
||||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
||||||
lon <114724657+longregen@users.noreply.github.com>
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
loonerin <132926317+loonerin@users.noreply.github.com>
|
loonerin <132926317+loonerin@users.noreply.github.com>
|
||||||
|
ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
|
||||||
luoyu-intel <yu.luo@intel.com>
|
luoyu-intel <yu.luo@intel.com>
|
||||||
m3ndax <adrian.goessl@outlook.com>
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
makomk <makosoft@googlemail.com>
|
makomk <makosoft@googlemail.com>
|
||||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
||||||
|
matiaslin <45382001+matiaslin@users.noreply.github.com>
|
||||||
|
matteo <matteogeniaccio@yahoo.it>
|
||||||
mdrokz <mohammadmunshi@gmail.com>
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
minarchist <minarchist@users.noreply.github.com>
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||||
mmyjona <jonathan.gonse@gmail.com>
|
mmyjona <jonathan.gonse@gmail.com>
|
||||||
momonga <115213907+mmnga@users.noreply.github.com>
|
momonga <115213907+mmnga@users.noreply.github.com>
|
||||||
|
momonga <146910567+mmngays@users.noreply.github.com>
|
||||||
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||||
mzcu <milos.cubrilo@gmail.com>
|
mzcu <milos.cubrilo@gmail.com>
|
||||||
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||||
|
@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
|
||||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
opparco <parco.opaai@gmail.com>
|
opparco <parco.opaai@gmail.com>
|
||||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
pculliton <phillipculliton@gmail.com>
|
||||||
pengxin99 <pengxin.yuan@intel.com>
|
pengxin99 <pengxin.yuan@intel.com>
|
||||||
perserk <perserk@gmail.com>
|
perserk <perserk@gmail.com>
|
||||||
|
piDack <104877312+piDack@users.noreply.github.com>
|
||||||
pmysl <piotr.myslinski@outlook.com>
|
pmysl <piotr.myslinski@outlook.com>
|
||||||
postmasters <namnguyen@google.com>
|
postmasters <namnguyen@google.com>
|
||||||
pudepiedj <pudepiedj@gmail.com>
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
|
@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
|
||||||
sandyiscool <sandyiscool@gmail.com>
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
sasha0552 <admin@sasha0552.org>
|
sasha0552 <admin@sasha0552.org>
|
||||||
semidark <me@semidark.net>
|
semidark <me@semidark.net>
|
||||||
|
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
|
||||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
shibe2 <shibe@tuta.io>
|
shibe2 <shibe@tuta.io>
|
||||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
|
@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||||
slaren <2141330+slaren@users.noreply.github.com>
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
slaren <slarengh@gmail.com>
|
slaren <slarengh@gmail.com>
|
||||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
|
standby24x7 <standby24x7@gmail.com>
|
||||||
staviq <staviq@gmail.com>
|
staviq <staviq@gmail.com>
|
||||||
stduhpf <stephduh@live.fr>
|
stduhpf <stephduh@live.fr>
|
||||||
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
||||||
swittk <switt1995@gmail.com>
|
swittk <switt1995@gmail.com>
|
||||||
takov751 <40316768+takov751@users.noreply.github.com>
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
tarcey <cey.tarik@gmail.com>
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
tc-mb <157115220+tc-mb@users.noreply.github.com>
|
||||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||||
thement <40525767+thement@users.noreply.github.com>
|
thement <40525767+thement@users.noreply.github.com>
|
||||||
|
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
|
||||||
tjohnman <tjohnman@users.noreply.github.com>
|
tjohnman <tjohnman@users.noreply.github.com>
|
||||||
|
toyer <2042519524@qq.com>
|
||||||
tslmy <tslmy@users.noreply.github.com>
|
tslmy <tslmy@users.noreply.github.com>
|
||||||
ubik2 <ubik2@users.noreply.github.com>
|
ubik2 <ubik2@users.noreply.github.com>
|
||||||
uint256_t <konndennsa@gmail.com>
|
uint256_t <konndennsa@gmail.com>
|
||||||
uint256_t <maekawatoshiki1017@gmail.com>
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
unbounded <haakon@likedan.net>
|
unbounded <haakon@likedan.net>
|
||||||
|
uvos <devnull@uvos.xyz>
|
||||||
valiray <133289098+valiray@users.noreply.github.com>
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vb <vaibhavs10@gmail.com>
|
||||||
vik <vikhyatk@gmail.com>
|
vik <vikhyatk@gmail.com>
|
||||||
viric <viric@viric.name>
|
viric <viric@viric.name>
|
||||||
vodkaslime <646329483@qq.com>
|
vodkaslime <646329483@qq.com>
|
||||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
|
wangshuai09 <391746016@qq.com>
|
||||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
woachk <24752637+woachk@users.noreply.github.com>
|
woachk <24752637+woachk@users.noreply.github.com>
|
||||||
wonjun Jang <strutive07@gmail.com>
|
wonjun Jang <strutive07@gmail.com>
|
||||||
woodx <124784234+woodx9@users.noreply.github.com>
|
woodx <124784234+woodx9@users.noreply.github.com>
|
||||||
|
wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
|
||||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
xaedes <xaedes@gmail.com>
|
xaedes <xaedes@gmail.com>
|
||||||
xaedes <xaedes@googlemail.com>
|
xaedes <xaedes@googlemail.com>
|
||||||
|
xctan <axunlei@gmail.com>
|
||||||
xloem <0xloem@gmail.com>
|
xloem <0xloem@gmail.com>
|
||||||
yangli2 <yangli2@gmail.com>
|
yangli2 <yangli2@gmail.com>
|
||||||
yuiseki <yuiseki@gmail.com>
|
yuiseki <yuiseki@gmail.com>
|
||||||
|
yuri@FreeBSD <yurivict@users.noreply.github.com>
|
||||||
zakkor <edward.partenie@gmail.com>
|
zakkor <edward.partenie@gmail.com>
|
||||||
zhangkaihuo <zhangkaihuo@gmail.com>
|
zhangkaihuo <zhangkaihuo@gmail.com>
|
||||||
|
zhentaoyu <zhentao.yu@intel.com>
|
||||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
zhouwg <zhouwg2000@gmail.com>
|
zhouwg <zhouwg2000@gmail.com>
|
||||||
zrm <trustiosity.zrm@gmail.com>
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
||||||
|
杨朱 · Kiki <baofa.fan@daocloud.io>
|
||||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
|
蕭澧邦 <45505768+shou692199@users.noreply.github.com>
|
||||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
||||||
|
|
9 Makefile
@@ -254,8 +254,8 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
-MK_NVCCFLAGS = -std=c++11
+MK_CXXFLAGS = -std=c++17 -fPIC
+MK_NVCCFLAGS = -std=c++17
 
 ifdef LLAMA_NO_CCACHE
 GGML_NO_CCACHE := 1
@@ -575,9 +575,12 @@ endif
 
 ifndef GGML_NO_AMX
 MK_CPPFLAGS += -DGGML_USE_AMX
-OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
+OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
 endif
 
+# only necessary for the CPU backend files
+MK_CPPFLAGS += -Iggml/src/ggml-cpu
+
 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
 OBJ_GGML_EXT += ggml/src/ggml-rpc.o
@@ -28,13 +28,16 @@ var cSettings: [CSetting] = [
     .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
     .unsafeFlags(["-fno-objc-arc"]),
     .headerSearchPath("ggml/src"),
+    .headerSearchPath("ggml/src/ggml-cpu"),
     // NOTE: NEW_LAPACK will required iOS version 16.4+
     // We should consider add this in the future when we drop support for iOS 14
     // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
     // .define("ACCELERATE_NEW_LAPACK"),
     // .define("ACCELERATE_LAPACK_ILP64")
+
+    .define("GGML_USE_CPU"),
 ]
 
 
 #if canImport(Darwin)
 sources.append("ggml/src/ggml-common.h")
 sources.append("ggml/src/ggml-metal/ggml-metal.m")
@@ -44,7 +47,6 @@ cSettings.append(
     contentsOf: [
         .define("GGML_USE_ACCELERATE"),
         .define("GGML_USE_METAL"),
-        .define("GGML_USE_CPU")
     ]
 )
 #endif
429 README.md
@@ -4,7 +4,6 @@
 
 [](https://opensource.org/licenses/MIT)
 [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
-[](https://conan.io/center/llama-cpp)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 
@@ -26,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Description
 
 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-variety of hardware - locally and in the cloud.
+range of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
@@ -36,14 +35,17 @@ variety of hardware - locally and in the cloud.
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
-Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
-improved significantly thanks to many contributions. It is the main playground for developing new features for the
-[ggml](https://github.com/ggerganov/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
-**Supported models:**
+<details>
+<summary>Models</summary>
 
 Typically finetunes of the base models below are supported as well.
 
+Instructions for adding support for new models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)
+
+**Text-only:**
+
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [x] LLaMA 3 🦙🦙🦙
@@ -97,9 +99,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 
-(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
-
-**Multimodal models:**
+**Multimodal:**
 
 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@@ -111,7 +111,10 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
 
-**Bindings:**
+</details>
+
+<details>
+<summary>Bindings</summary>
 
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
@@ -138,282 +141,74 @@ Typically finetunes of the base models below are supported as well.
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
-**UI:**
-
-Unless otherwise noted these projects are open-source with permissive licensing:
-
-- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
-- [iohub/collama](https://github.com/iohub/coLLaMA)
-- [janhq/jan](https://github.com/janhq/jan) (AGPL)
-- [nat/openplayground](https://github.com/nat/openplayground)
-- [Faraday](https://faraday.dev/) (proprietary)
-- [LMStudio](https://lmstudio.ai/) (proprietary)
-- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
-- [ramalama](https://github.com/containers/ramalama) (MIT)
-- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
-- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
-- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
-- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
-- [ollama/ollama](https://github.com/ollama/ollama)
-- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
-- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
-- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
-- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
-- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
-- [RAGNA Desktop](https://ragna.app/) (proprietary)
-- [RecurseChat](https://recurse.chat/) (proprietary)
-- [semperai/amica](https://github.com/semperai/amica)
-- [withcatai/catai](https://github.com/withcatai/catai)
-- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
-- [Msty](https://msty.app) (proprietary)
-- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
-- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
-- [Dot](https://github.com/alexpinel/Dot) (GPL)
-- [MindMac](https://mindmac.app) (proprietary)
-- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
-- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
-- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
-- [AIKit](https://github.com/sozercan/aikit) (MIT)
-- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
-- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
-- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
-- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+</details>
+
+<details>
+<summary>UIs</summary>
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 
-**Tools:**
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
+- [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
+- [LARS](https://github.com/abgulati/LARS) (AGPL)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
+- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
+- [MindMac](https://mindmac.app) (proprietary)
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
+- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
+- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
+- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
+- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
+- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
+- [semperai/amica](https://github.com/semperai/amica) (MIT)
+- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
+
+</details>
+
+<details>
+<summary>Tools</summary>
 
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
-- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
 
-**Infrastructure:**
+</details>
+
+<details>
+<summary>Infrastructure</summary>
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 
-**Games:**
+</details>
+
+<details>
+<summary>Games</summary>
 
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
 
-## Demo
-
-<details>
-<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
-
-```
-$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
-I llama.cpp build info:
-I UNAME_S: Darwin
-I UNAME_P: arm
-I UNAME_M: arm64
-I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
-I LDFLAGS: -framework Accelerate
-I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-
-make: Nothing to be done for `default'.
-main: build = 1041 (cf658ad)
-main: seed = 1692823051
-llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
-llama_model_loader: - type f32: 81 tensors
-llama_model_loader: - type q4_0: 281 tensors
-llama_model_loader: - type q6_K: 1 tensors
-llm_load_print_meta: format = GGUF V1 (latest)
-llm_load_print_meta: arch = llama
-llm_load_print_meta: vocab type = SPM
-llm_load_print_meta: n_vocab = 32000
-llm_load_print_meta: n_merges = 0
-llm_load_print_meta: n_ctx_train = 4096
-llm_load_print_meta: n_ctx = 512
-llm_load_print_meta: n_embd = 5120
-llm_load_print_meta: n_head = 40
-llm_load_print_meta: n_head_kv = 40
-llm_load_print_meta: n_layer = 40
-llm_load_print_meta: n_rot = 128
-llm_load_print_meta: n_gqa = 1
-llm_load_print_meta: f_norm_eps = 1.0e-05
-llm_load_print_meta: f_norm_rms_eps = 1.0e-05
-llm_load_print_meta: n_ff = 13824
-llm_load_print_meta: freq_base = 10000.0
-llm_load_print_meta: freq_scale = 1
-llm_load_print_meta: model type = 13B
-llm_load_print_meta: model ftype = mostly Q4_0
-llm_load_print_meta: model size = 13.02 B
-llm_load_print_meta: general.name = LLaMA v2
-llm_load_print_meta: BOS token = 1 '<s>'
-llm_load_print_meta: EOS token = 2 '</s>'
-llm_load_print_meta: UNK token = 0 '<unk>'
-llm_load_print_meta: LF token = 13 '<0x0A>'
-llm_load_tensors: ggml ctx size = 0.11 MB
-llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
-...................................................................................................
-llama_new_context_with_model: kv self size = 400.00 MB
-llama_new_context_with_model: compute buffer total size = 75.41 MB
-
-system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
-sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
-
-
-Building a website can be done in 10 simple steps:
-Step 1: Find the right website platform.
-Step 2: Choose your domain name and hosting plan.
-Step 3: Design your website layout.
-Step 4: Write your website content and add images.
-Step 5: Install security features to protect your site from hackers or spammers
-Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
-Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
-Step 8: Start marketing and promoting the website via social media channels or paid ads
-Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
-Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
-How does a Website Work?
-A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
-The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
-How to
-llama_print_timings: load time = 576.45 ms
-llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
-llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
-llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
-llama_print_timings: total time = 25431.49 ms
-```
-
 </details>
 
-<details>
-<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
-
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
-
-https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
-
-</details>
-
-## Usage
-
-Here are the end-to-end binary build and model conversion steps for most supported models.
-
-### Basic usage
-
-Firstly, you need to get the binary. There are different methods that you can follow:
-- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
-- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
-- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
-- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
-
-You can run a basic completion using this command:
-
-```bash
-llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
-
-# Output:
-# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-```
-
-See [this page](./examples/main/README.md) for a full list of parameters.
-
-### Conversation mode
-
-If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
-
-```bash
-llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
-
-# Output:
-# > hi, who are you?
-# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
-#
-# > what is 1+1?
-# Easy peasy! The answer to 1+1 is... 2!
-```
-
-By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-
-```bash
-./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
-```
-
-You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
-
-```bash
-./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
-```
-
-### Web server
-
-[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
-
-Example usage:
-
-```bash
-./llama-server -m your_model.gguf --port 8080
-
-# Basic web UI can be accessed via browser: http://localhost:8080
-# Chat completion endpoint: http://localhost:8080/v1/chat/completions
-```
-
-### Interactive mode
-
-> [!NOTE]
-> If you prefer basic usage, please consider using conversation mode instead of interactive mode
-
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
-
-Here is an example of a few-shot interaction, invoked with the command
-
-```bash
-# default arguments using a 7B model
-./examples/chat.sh
-
-# advanced chat with a 13B model
-./examples/chat-13B.sh
-
-# custom arguments using a 13B model
-./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
-```
-
-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
-
-### Persistent Interaction
-
-The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
-
-```bash
-# Start a new chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Resume that chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Start a different chat with the same prompt/model
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
-
-# Different prompt cache for different prompt/model
-PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
-    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
-```
-
-### Constrained output with grammars
-
-`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
-
-```bash
-./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-```
-
-The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
-
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
-
-## Build
-
-Please refer to [Build llama.cpp locally](./docs/build.md)
+-## Demo
+-
+-[entire Demo section removed: two collapsible demos, including the full "Typical run using LLaMA v2 13B on M2 Ultra" console log shown below]
 
 ## Supported backends
 
@@ -422,23 +217,104 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [Vulkan](./docs/build.md#vulkan) | GPU |
 | [CANN](./docs/build.md#cann) | Ascend NPU |
 
-## Tools
+## Building and usage
 
-### Prepare and Quantize
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
 
-> [!NOTE]
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+- Clone this repository and build locally, see [how to build](./docs/build.md)
+- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](./docs/install.md)
+- Use a Docker image, see [documentation for Docker](./docs/docker.md)
+- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
 
-To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+### Obtaining and quantizing models
 
-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
-It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
 
-To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
+- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
+- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
+
+After downloading a model, use the CLI tools to run it locally - see below.
+
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+
+The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
+
+- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
+
+To learn more about model quantization, [read this documentation](./examples/quantize/README.md)
+
+### Using the `llama-cli` tool
+
+Run a basic text completion:
+
+```bash
+llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+
+# Output:
+# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+```

+See [this page](./examples/main/README.md) for a full list of parameters.
+
+### Conversation mode
+
+Run `llama-cli` in conversation/chat mode by passing the `-cnv` parameter:
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
+
+# Output:
+# > hi, who are you?
+# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+#
+# > what is 1+1?
+# Easy peasy! The answer to 1+1 is... 2!
+```
+
+By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+```
+
+You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
+
+```bash
+llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+```
+
+### Constrained output with grammars
+
+`llama.cpp` can constrain the output of the model via custom grammars. For example, you can force the model to output only JSON:
+
+```bash
+llama-cli -m your_model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
+For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
+
+### Web server (`llama-server`)
+
+The [llama-server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
+
+Example usage:
+
+```bash
+llama-server -m your_model.gguf --port 8080
+
+# Basic web UI can be accessed via browser: http://localhost:8080
+# Chat completion endpoint: http://localhost:8080/v1/chat/completions
+```
 
 ### Perplexity (measuring model quality)
 
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+Use the `llama-perplexity` tool to measure perplexity over a given prompt (lower perplexity is better).
 For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
 
 To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
@@ -464,7 +340,6 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 
 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
-- [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)
 
 **Development documentation**

The removed `## Demo` section consisted of two collapsible blocks. The first contained the full console log of `$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e` on an M2 Ultra (build info, `llama_model_loader`/`llm_load_print_meta` output for the 13B Q4_0 model, the generated "Building a website can be done in 10 simple steps" text, and `llama_print_timings` ending at `total time = 25431.49 ms`). The second contained a demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook (video: https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4). Also removed wholesale were the old `## Usage` section (basic usage, conversation mode, web server, interactive mode with `-r "reverse prompt string"`, persistent interaction via `--prompt-cache`/`--prompt-cache-all` and `./examples/chat-persistent.sh`, and the grammars section pointing at https://grammar.intrinsiclabs.ai/ and [its repo](http://github.com/intrinsiclabsai/gbnfgen)) and the old `## Build` pointer to [Build llama.cpp locally](./docs/build.md); their surviving content was folded into the new `## Building and usage` section shown above.
@@ -88,5 +88,5 @@ if (LLAMA_CURL)
 endif ()
 
 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features (${TARGET} PUBLIC cxx_std_11)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
         }
         params.hf_file = params.model;
     } else if (params.model.empty()) {
-        params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
+        // this is to avoid different repo having same file name, or same file name in different subdirs
+        std::string filename = params.hf_repo + "_" + params.hf_file;
+        // to make sure we don't have any slashes in the filename
+        string_replace_all(filename, "/", "_");
+        params.model = fs_get_cache_file(filename);
     }
 } else if (!params.model_url.empty()) {
     if (params.model.empty()) {
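To make the new cache-file naming concrete, here is a small self-contained C++ sketch of the scheme introduced above. The repo and file names are hypothetical, and a local stand-in replaces the project's `string_replace_all()` helper:

```cpp
#include <iostream>
#include <string>

// local stand-in for the string_replace_all() helper used in the hunk above
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    // hypothetical values for illustration
    const std::string hf_repo = "ggml-org/models";
    const std::string hf_file = "tinyllama/model-q4_0.gguf";

    // repo and file are joined first, so two repos sharing a file name
    // (or one repo reusing a name in different subdirs) cannot collide
    std::string filename = hf_repo + "_" + hf_file;
    replace_all(filename, "/", "_");

    std::cout << filename << "\n"; // ggml-org_models_tinyllama_model-q4_0.gguf
    return 0;
}
```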
@@ -1366,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2100,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
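The check that gates these warnings, `llama_supports_gpu_offload()`, is part of the public `llama.h` API, so applications embedding the library can apply the same fallback logic themselves. A minimal sketch; the layer count and message are illustrative, not code from this commit:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();

    const int requested_layers = 99; // hypothetical user request
    if (llama_supports_gpu_offload()) {
        mparams.n_gpu_layers = requested_layers;
    } else {
        // mirror the CLI behavior: warn, then fall back to CPU-only inference
        fprintf(stderr, "warning: no usable GPU found, running fully on CPU\n");
    }
    return 0;
}
```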
@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {
|
||||||
|
|
||||||
std::u32string filename_utf32;
|
std::u32string filename_utf32;
|
||||||
try {
|
try {
|
||||||
|
#if defined(__clang__)
|
||||||
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||||
|
# pragma clang diagnostic push
|
||||||
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||||
|
#endif
|
||||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
||||||
|
|
||||||
|
#if defined(__clang__)
|
||||||
|
# pragma clang diagnostic pop
|
||||||
|
#endif
|
||||||
|
|
||||||
filename_utf32 = converter.from_bytes(filename);
|
filename_utf32 = converter.from_bytes(filename);
|
||||||
|
|
||||||
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
|
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
|
||||||
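
`std::codecvt_utf8` is deprecated in C++17, hence the clang-only push/pop around its single use. A sketch of the same round-trip idea in isolation; note that `from_bytes` throws on invalid input, which the surrounding `try` block in the real code is there to catch:

```cpp
#include <codecvt>
#include <locale>
#include <string>

// Silence the C++17 deprecation only for clang, then round-trip the bytes to
// detect overlong UTF-8 encodings: re-encoding must reproduce the input.
bool utf8_roundtrips(const std::string & bytes) {
#if defined(__clang__)
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
#if defined(__clang__)
#    pragma clang diagnostic pop
#endif
    std::u32string u32 = conv.from_bytes(bytes); // decode (throws on invalid input)
    return conv.to_bytes(u32) == bytes;          // re-encode and compare
}
```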

@@ -829,9 +839,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
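
The dispatch above encodes a fixed precedence among model sources: a Hugging Face repo+file pair wins, then a direct URL, then a plain local path. A tiny sketch that isolates just that ordering (the `pick_source` helper is illustrative, not part of the codebase):

```cpp
#include <string>

enum class model_source { hf, url, file };

// Same precedence as common_init_from_params: HF repo+file, then URL, then file.
model_source pick_source(const std::string & hf_repo, const std::string & hf_file,
                         const std::string & model_url) {
    if (!hf_repo.empty() && !hf_file.empty()) return model_source::hf;
    if (!model_url.empty())                   return model_source::url;
    return model_source::file;
}
```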

@@ -1342,17 +1352,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
 }
 
 struct llama_model * common_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
-    if (!common_download_file(model_url, path_model, hf_token)) {
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }
 

@@ -1363,9 +1373,9 @@ struct llama_model * common_load_model_from_url(
         /*.no_alloc = */ true,
         /*.ctx      = */ NULL,
     };
-    auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+    auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
     if (!ctx_gguf) {
-        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
         return NULL;
     }
 

@@ -1384,13 +1394,13 @@ struct llama_model * common_load_model_from_url(
     // Verify the first split file format
     // and extract split URL and PATH prefixes
     {
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
             return NULL;
         }
 
-        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
             return NULL;
         }
     }
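
The split verification leans on `llama_split_prefix()`, which recovers the common stem from a shard name such as `model-00001-of-00003.gguf`. A hedged sketch of how it pairs with `llama_split_path()` to enumerate the remaining shards; the buffer sizes and the exact shard-name format are assumptions inferred from the calls visible in this hunk:

```cpp
#include <cstdio>
#include "llama.h"

// Recover the common prefix from the first shard, then rebuild every shard path.
void list_shards(const char * first_shard, int n_split) {
    char prefix[512];
    if (!llama_split_prefix(prefix, sizeof(prefix), first_shard, 0, n_split)) {
        fprintf(stderr, "unexpected shard name: %s\n", first_shard);
        return;
    }
    for (int i = 0; i < n_split; i++) {
        char shard[512];
        llama_split_path(shard, sizeof(shard), prefix, i, n_split);
        printf("%s\n", shard); // e.g. model-00001-of-00003.gguf
    }
}
```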

@@ -1417,14 +1427,14 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(path_model, params);
+    return llama_load_model_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * repo,
-        const char * model,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //

@@ -1438,27 +1448,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url += model;
+    model_url += remote_path;
 
-    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
 #else
 
 struct llama_model * common_load_model_from_url(
-        const char * /*model_url*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * /*repo*/,
-        const char * /*model*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;

@@ -471,8 +471,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
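
With the declarations above, call sites can pass `std::string` values directly instead of juggling `.c_str()`. A hypothetical call site; the repo and file names are placeholders borrowed from test assets elsewhere in this commit, only the signatures come from the hunk:

```cpp
#include "common.h"
#include "llama.h"

llama_model * load_example() {
    llama_model_params mparams = llama_model_default_params();
    return common_load_model_from_hf(
        "ggml-org/models",                  // repo
        "tinyllamas/stories15M-q4_0.gguf",  // remote_path inside the repo
        "stories15M-q4_0.gguf",             // local_path to cache to
        "",                                 // hf_token (none)
        mparams);
}
```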

@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
 Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
 
 ```
-$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
 ```
 
-Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
 
 To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
 ## News
 
+- 2024.11
+  - Support F16 and F32 data type model for Ascend 310P NPU.
 - 2024.8
   - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
 - 2024.7

@@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 ### Ascend NPU
 
 **Verified devices**
 
 | Ascend NPU                    | Status  |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2                 | Support |
+| Atlas 300I Duo                | Support |
 
 *Notes:*

@@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,7 +2,7 @@ set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TEST_TARGET test-eval-callback)
 add_test(NAME ${TEST_TARGET}

@@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
 add_executable(${TARGET} gbnf-validator.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
 add_executable(${TARGET} gen-docs.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
 target_link_libraries(${TARGET} PRIVATE sha256)
 
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -637,10 +637,19 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
-    if (!compute_imatrix(ctx, params)) {
-        return 1;
+    if (params.prompt.empty()) {
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+    } else {
+        if (!compute_imatrix(ctx, params)) {
+            return 1;
+        }
     }
 
     g_collector.save_imatrix();
 
     LOG("\n");
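
The new `main` logic makes an empty prompt legal whenever precomputed matrices are supplied, turning the tool into a pure merger. A condensed sketch of the three-way decision (the `imatrix_mode` enum and `pick_mode` helper are illustrative, not part of the codebase):

```cpp
#include <string>
#include <vector>

// An empty prompt is only an error if there is also nothing to combine;
// otherwise it simply skips computation and merges the --in-file matrices.
enum class imatrix_mode { error, combine_only, compute };

imatrix_mode pick_mode(const std::string & prompt, const std::vector<std::string> & in_files) {
    if (prompt.empty()) {
        return in_files.empty() ? imatrix_mode::error
                                : imatrix_mode::combine_only;
    }
    return imatrix_mode::compute;
}
```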

@@ -2,4 +2,4 @@ set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
 target_include_directories(llava PUBLIC ../..)
 target_include_directories(llava PUBLIC ../../common)
 
-target_compile_features(llava PRIVATE cxx_std_11)
+target_compile_features(llava PRIVATE cxx_std_17)
 
 add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
 if (BUILD_SHARED_LIBS)

@@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-minicpmv-cli)
 add_executable(${TARGET} minicpmv-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -40,10 +40,17 @@
 #include <cinttypes>
 #include <limits>
 
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 //#define CLIP_DEBUG_FUNCTIONS
 
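
The macro block above gates all clip/llava logging behind a compile-time switch. Reduced to its essence (assuming the same `LLAVA_LOG_OFF` define), the pattern looks like this:

```cpp
#include <cstdio>

// With -DLLAVA_LOG_OFF the macro expands to nothing, so the format string and
// the fprintf call vanish at compile time; otherwise it logs to stdout.
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
#else
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif

int main() {
    LOG_INF("loaded %d tensors\n", 42); // compiles to nothing when logging is off
}
```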

@@ -11,13 +11,17 @@
 #include <limits>
 #include <vector>
 
-#define die(msg)          do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 // RGB uint8 image
 struct clip_image_u8 {

@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     errno = 0;
     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
     if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
+        LOG_ERR("read error: %s", strerror(errno));
+        free(buffer);
+        fclose(file);
+        return false;
     }
     if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
+        LOG_ERR("unexpectedly reached end of file");
+        free(buffer);
+        fclose(file);
+        return false;
     }
     fclose(file); // Close the file
 
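
Replacing `die()`/`die_fmt()` with `LOG_ERR` plus cleanup turns a hard `exit(1)` into a recoverable failure the caller can handle. A self-contained sketch of the same pattern — a simplified `read_all`, not the repo's `load_file_to_bytes`:

```cpp
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// On any failure: release the buffer, close the handle, return false.
static bool read_all(const char * path, unsigned char ** out, long * size_out) {
    FILE * f = fopen(path, "rb");
    if (!f) return false;
    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);
    unsigned char * buf = (unsigned char *) malloc(size);
    if (!buf) { fclose(f); return false; }
    if (fread(buf, 1, size, f) != (size_t) size) {
        fprintf(stderr, "read error: %s\n", strerror(errno));
        free(buf);
        fclose(f);
        return false;          // recoverable, unlike exit(1)
    }
    fclose(f);
    *out      = buf;
    *size_out = size;
    return true;
}
```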

@@ -2,4 +2,4 @@ set(TARGET llama-lookahead)
 add_executable(${TARGET} lookahead.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,22 +2,22 @@ set(TARGET llama-lookup)
 add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-create)
 add_executable(${TARGET} lookup-create.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-merge)
 add_executable(${TARGET} lookup-merge.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-stats)
 add_executable(${TARGET} lookup-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
 target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-parallel)
 add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-passkey)
 add_executable(${TARGET} passkey.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-retrieval)
 add_executable(${TARGET} retrieval.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-run)
 add_executable(${TARGET} run.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -50,4 +50,4 @@ if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,6 +2,6 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
-openai~=1.30.3
+openai~=1.55.3
 prometheus-client~=0.20.0
 requests~=2.32.3

@@ -32,3 +32,17 @@ def test_server_models():
     assert res.status_code == 200
     assert len(res.body["data"]) == 1
     assert res.body["data"][0]["id"] == server.model_alias
+
+
+def test_load_split_model():
+    global server
+    server.model_hf_repo = "ggml-org/models"
+    server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
+    server.model_alias = "tinyllama-split"
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 16,
+        "prompt": "Hello",
+        "temperature": 0.0,
+    })
+    assert res.status_code == 200
+    assert match_regex("(little|girl)+", res.body["content"])

@@ -127,3 +127,22 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
     assert res.status_code != 200
     assert "error" in res.body
+
+
+@pytest.mark.parametrize("messages", [
+    None,
+    "string",
+    [123],
+    [{}],
+    [{"role": 123}],
+    [{"role": "system", "content": 123}],
+    # [{"content": "hello"}], # TODO: should not be a valid case
+    [{"role": "system", "content": "test"}, {}],
+])
+def test_invalid_chat_completion_req(messages):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "messages": messages,
+    })
+    assert res.status_code == 400 or res.status_code == 500
+    assert "error" in res.body

@@ -8,6 +8,7 @@ def create_server():
     global server
     server = ServerPreset.tinyllama_infill()
 
+
 def test_infill_without_input_extra():
     global server
     server.start()

@@ -19,6 +20,7 @@ def test_infill_without_input_extra():
     assert res.status_code == 200
     assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
 
+
 def test_infill_with_input_extra():
     global server
     server.start()

@@ -33,3 +35,23 @@ def test_infill_with_input_extra():
     })
     assert res.status_code == 200
     assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
+
+
+@pytest.mark.parametrize("input_extra", [
+    {},
+    {"filename": "ok"},
+    {"filename": 123},
+    {"filename": 123, "text": "abc"},
+    {"filename": 123, "text": 456},
+])
+def test_invalid_input_extra_req(input_extra):
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "prompt": "Complete this",
+        "input_extra": [input_extra],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 400
+    assert "error" in res.body

@@ -36,3 +36,20 @@ def test_rerank():
     assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
     assert most_relevant["index"] == 2
     assert least_relevant["index"] == 3
+
+
+@pytest.mark.parametrize("documents", [
+    [],
+    None,
+    123,
+    [1, 2, 3],
+])
+def test_invalid_rerank_req(documents):
+    global server
+    server.start()
+    res = server.make_request("POST", "/rerank", data={
+        "query": "Machine learning is",
+        "documents": documents,
+    })
+    assert res.status_code == 400
+    assert "error" in res.body

103 examples/server/tests/unit/test_speculative.py (new file)
@@ -0,0 +1,103 @@
+import pytest
+from utils import *
+
+# We use a F16 MOE gguf as main model, and q4_0 as draft model
+server = ServerPreset.stories15m_moe()
+
+MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf"
+
+def create_server():
+    global server
+    server = ServerPreset.stories15m_moe()
+    # download draft model file if needed
+    file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
+    model_draft_file = f'../../../{file_name}'
+    if not os.path.exists(model_draft_file):
+        print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
+        with open(model_draft_file, 'wb') as f:
+            f.write(requests.get(MODEL_DRAFT_FILE_URL).content)
+        print(f"Done downloading draft model file")
+    # set default values
+    server.model_draft = model_draft_file
+    server.draft_min = 4
+    server.draft_max = 8
+
+
+@pytest.fixture(scope="module", autouse=True)
+def fixture_create_server():
+    return create_server()
+
+
+def test_with_and_without_draft():
+    global server
+    server.model_draft = None  # disable draft model
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "temperature": 0.0,
+        "top_k": 1,
+    })
+    assert res.status_code == 200
+    content_no_draft = res.body["content"]
+    server.stop()
+
+    # create new server with draft model
+    create_server()
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "temperature": 0.0,
+        "top_k": 1,
+    })
+    assert res.status_code == 200
+    content_draft = res.body["content"]
+
+    assert content_no_draft == content_draft
+
+
+def test_different_draft_min_draft_max():
+    global server
+    test_values = [
+        (1, 2),
+        (1, 4),
+        (4, 8),
+        (4, 12),
+        (8, 16),
+    ]
+    last_content = None
+    for draft_min, draft_max in test_values:
+        server.stop()
+        server.draft_min = draft_min
+        server.draft_max = draft_max
+        server.start()
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "temperature": 0.0,
+            "top_k": 1,
+        })
+        assert res.status_code == 200
+        if last_content is not None:
+            assert last_content == res.body["content"]
+        last_content = res.body["content"]
+
+
+@pytest.mark.parametrize("n_slots,n_requests", [
+    (1, 2),
+    (2, 2),
+])
+def test_multi_requests_parallel(n_slots: int, n_requests: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    tasks = []
+    for _ in range(n_requests):
+        tasks.append((server.make_request, ("POST", "/completion", {
+            "prompt": "I believe the meaning of life is",
+            "temperature": 0.0,
+            "top_k": 1,
+        })))
+    results = parallel_function_calls(tasks)
+    for res in results:
+        assert res.status_code == 200
+        assert match_regex("(wise|kind|owl|answer)+", res.body["content"])

@@ -8,7 +8,6 @@ import os
 import re
 import json
 import sys
-import threading
 import requests
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -47,6 +46,7 @@ class ServerProcess:
     model_alias: str | None = None
     model_url: str | None = None
     model_file: str | None = None
+    model_draft: str | None = None
     n_threads: int | None = None
     n_gpu_layer: int | None = None
     n_batch: int | None = None

@@ -69,6 +69,8 @@ class ServerProcess:
     response_format: str | None = None
     lora_files: List[str] | None = None
    disable_ctx_shift: int | None = False
+    draft_min: int | None = None
+    draft_max: int | None = None
 
     # session variables
     process: subprocess.Popen | None = None

@@ -103,6 +105,8 @@ class ServerProcess:
             server_args.extend(["--model", self.model_file])
         if self.model_url:
             server_args.extend(["--model-url", self.model_url])
+        if self.model_draft:
+            server_args.extend(["--model-draft", self.model_draft])
         if self.model_hf_repo:
             server_args.extend(["--hf-repo", self.model_hf_repo])
         if self.model_hf_file:

@@ -148,6 +152,10 @@ class ServerProcess:
             server_args.extend(["--no-context-shift"])
         if self.api_key:
             server_args.extend(["--api-key", self.api_key])
+        if self.draft_max:
+            server_args.extend(["--draft-max", self.draft_max])
+        if self.draft_min:
+            server_args.extend(["--draft-min", self.draft_min])
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"bench: starting server with: {' '.join(args)}")

@@ -161,26 +169,12 @@ class ServerProcess:
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"},
         )
         server_instances.add(self)
 
-        def server_log(in_stream, out_stream):
-            for line in iter(in_stream.readline, b""):
-                print(line.decode("utf-8"), end="", file=out_stream)
-
-        thread_stdout = threading.Thread(
-            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
-        )
-        thread_stdout.start()
-
-        thread_stderr = threading.Thread(
-            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
-        )
-        thread_stderr.start()
-
         print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
 
         # wait for server to start

@@ -200,7 +194,8 @@ class ServerProcess:
             raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
 
     def stop(self) -> None:
-        server_instances.remove(self)
+        if self in server_instances:
+            server_instances.remove(self)
         if self.process:
             print(f"Stopping server with pid={self.process.pid}")
             self.process.kill()

@@ -319,7 +314,6 @@ class ServerPreset:
         server.model_hf_repo = "ggml-org/models"
         server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
         server.model_alias = "jina-reranker"
-        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
         server.n_ctx = 512
         server.n_batch = 512
         server.n_slots = 1

@@ -2,4 +2,4 @@ set(TARGET llama-simple-chat)
 add_executable(${TARGET} simple-chat.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -3,7 +3,7 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
 
 ```bash
-./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
 
 ...

@@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple)
 add_executable(${TARGET} speculative-simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -2,4 +2,4 @@ set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -161,7 +161,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set   (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP "ggml: use OpenMP" ON)
 option(GGML_RPC "ggml: use RPC" OFF)
-option(GGML_AMX "ggml: use AMX" OFF)
 option(GGML_SYCL "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING

@@ -1,25 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// buffer_type API
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
-
-GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
-
-GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
-
-#ifdef __cplusplus
-}
-#endif

@@ -91,6 +91,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_neon       (void);
     GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
     GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
     GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
     GGML_BACKEND_API int ggml_cpu_has_sve        (void);
     GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void); // sve vector length in bytes

@@ -389,6 +389,9 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
+        GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_8 = 37,
+        // GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_COUNT,
     };

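New tensor types are appended just before `GGML_TYPE_COUNT`; the numeric ids are serialized into GGUF files, so existing values must never be renumbered (the commented-out 37/38 entries appear to reserve slots). A small sketch that maps a type id to its registered name via `ggml_type_name()`:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // Ids are stable across releases; only names/metadata are looked up at runtime.
    printf("id=%d name=%s\n", GGML_TYPE_TQ2_0, ggml_type_name(GGML_TYPE_TQ2_0));
    printf("total types: %d\n", GGML_TYPE_COUNT);
}
```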
@ -261,21 +261,15 @@ function(ggml_add_backend backend)
|
||||||
if (${backend_id})
|
if (${backend_id})
|
||||||
string(TOLOWER "ggml-${backend}" backend_target)
|
string(TOLOWER "ggml-${backend}" backend_target)
|
||||||
add_subdirectory(${backend_target})
|
add_subdirectory(${backend_target})
|
||||||
# check again in case the backend disabled itself
|
message(STATUS "Including ${backend} backend")
|
||||||
# note that this should NOT be the normal behavior, in case of errors the backend should fail the build
|
if (NOT GGML_BACKEND_DL)
|
||||||
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
|
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
||||||
if (${backend_id})
|
target_compile_definitions(ggml PUBLIC ${backend_use})
|
||||||
message(STATUS "Including ${backend} backend")
|
|
||||||
if (NOT GGML_BACKEND_DL)
|
|
||||||
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
|
||||||
target_compile_definitions(ggml PUBLIC ${backend_use})
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
ggml_add_backend(CPU)
|
ggml_add_backend(CPU)
|
||||||
ggml_add_backend(AMX)
|
|
||||||
ggml_add_backend(BLAS)
|
ggml_add_backend(BLAS)
|
||||||
ggml_add_backend(CANN)
|
ggml_add_backend(CANN)
|
||||||
ggml_add_backend(CUDA)
|
ggml_add_backend(CUDA)
|
||||||
|
@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)
|
||||||
|
|
||||||
foreach (target ggml-base ggml)
|
foreach (target ggml-base ggml)
|
||||||
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||||
target_compile_features (${target} PRIVATE c_std_11) # don't bump
|
target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
target_link_libraries(ggml-base PRIVATE Threads::Threads)
|
target_link_libraries(ggml-base PRIVATE Threads::Threads)
|
||||||
|
|
|
@@ -1,105 +0,0 @@
-if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-        CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
-    CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
-    message(STATUS "Using AMX")
-
-    file(GLOB   GGML_HEADERS_AMX "*.h")
-    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
-
-    file(GLOB   GGML_SOURCES_AMX "*.cpp")
-
-    ggml_add_backend_library(ggml-amx
-                             ${GGML_HEADERS_AMX}
-                             ${GGML_SOURCES_AMX}
-                             )
-
-    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
-    # TODO: integrate AMX backend into the CPU backend
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
-            include(../ggml-cpu/cmake/FindSIMD.cmake)
-        endif ()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-            if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
-            endif()
-            if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
-            endif()
-            if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
-            endif()
-            if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
-            endif()
-        elseif (GGML_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (GGML_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
-        endif()
-    else()
-        if (GGML_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (GGML_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (GGML_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (GGML_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (GGML_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512dq)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (GGML_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (GGML_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-        if (GGML_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
-        if (GGML_AMX_TILE)
-            list(APPEND ARCH_FLAGS -mamx-tile)
-        endif()
-        if (GGML_AMX_INT8)
-            list(APPEND ARCH_FLAGS -mamx-int8)
-        endif()
-        if (GGML_AMX_BF16)
-            list(APPEND ARCH_FLAGS -mamx-bf16)
-        endif()
-    endif()
-
-    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
-else()
-    set(GGML_AMX OFF PARENT_SCOPE)
-    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
-endif()
@@ -1,449 +0,0 @@
-#include "ggml-amx.h"
-#include "ggml-amx/common.h"
-#include "ggml-amx/mmq.h"
-#include "ggml-backend-impl.h"
-#include "ggml-impl.h"
-
-#if defined(__gnu_linux__)
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#if defined(__AMX_INT8__)
-
-// AMX buffer interface
-static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-}
-
-static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)(buffer->context);
-}
-
-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    memset((char *)tensor->data + offset, value, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    if (qtype_has_amx_kernels(tensor->type)) {
-        ggml_backend_amx_convert_weight(tensor, data, offset, size);
-    } else {
-        memcpy((char *)tensor->data + offset, data, size);
-    }
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        if (qtype_has_amx_kernels(src->type)) {
-            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
-        } else {
-            memcpy(dst->data, src->data, ggml_nbytes(src));
-        }
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .free_buffer   = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base      = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor   = */ NULL, // no initialization required
-    /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor    = */ ggml_backend_amx_buffer_set_tensor,
-    /* .get_tensor    = */ ggml_backend_amx_buffer_get_tensor,
-    /* .cpy_tensor    = */ ggml_backend_amx_buffer_cpy_tensor,
-    /* .clear         = */ ggml_backend_amx_buffer_clear,
-    /* .reset         = */ NULL,
-};
-
-static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "AMX";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
-    if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
-    return ggml_backend_amx_get_alloc_size(tensor);
-
-    GGML_UNUSED(buft);
-}
-
-static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return false;
-
-    GGML_UNUSED(buft);
-}
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
-        /* .iface = */ {
-            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
-            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
-            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
-            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
-            /* .is_host        = */ ggml_backend_amx_buffer_type_is_host,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_type_amx;
-}
-
-// backend interface
-
-static const char * ggml_backend_amx_name(ggml_backend_t backend) {
-    return "AMX";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_amx_free(ggml_backend_t backend) {
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
-    delete ctx;
-    delete backend;
-}
-
-static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                ggml_backend_amx_mul_mat(ctx, node);
-                break;
-
-            case GGML_OP_NONE:
-            case GGML_OP_RESHAPE:
-            case GGML_OP_VIEW:
-            case GGML_OP_PERMUTE:
-            case GGML_OP_TRANSPOSE:
-                break;
-
-            default:
-                fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-                GGML_ASSERT(false);
-        }
-    }
-
-    return GGML_STATUS_SUCCESS;
-
-    GGML_UNUSED(backend);
-}
-
-static struct ggml_backend_i ggml_backend_amx_i = {
-    /* .get_name           = */ ggml_backend_amx_name,
-    /* .free               = */ ggml_backend_amx_free,
-    /* .set_tensor_async   = */ NULL,
-    /* .get_tensor_async   = */ NULL,
-    /* .cpy_tensor_async   = */ NULL,
-    /* .synchronize        = */ NULL,
-    /* .graph_plan_create  = */ NULL,
-    /* .graph_plan_free    = */ NULL,
-    /* .graph_plan_update  = */ NULL,
-    /* .graph_plan_compute = */ NULL,
-    /* .graph_compute      = */ ggml_backend_amx_graph_compute,
-    /* .event_record       = */ NULL,
-    /* .event_wait         = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_amx_guid() {
-    static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
-    return &guid;
-}
-
-#define ARCH_GET_XCOMP_PERM 0x1022
-#define ARCH_REQ_XCOMP_PERM 0x1023
-#define XFEATURE_XTILECFG   17
-#define XFEATURE_XTILEDATA  18
-
-static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
-    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
-        fprintf(stderr, "AMX is not ready to be used!\n");
-        return false;
-    }
-    return true;
-#elif defined(_WIN32)
-    return true;
-#endif
-}
-
-ggml_backend_t ggml_backend_amx_init() {
-
-    // invoke a Linux system call to request access to AMX features
-    ggml_amx_init();
-
-    // backend context
-    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
-
-    // ggml amx backend
-    ggml_backend_t backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_amx_guid(),
-        /* .interface = */ ggml_backend_amx_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    return backend;
-}
-
-bool ggml_backend_is_amx(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
-}
-
-void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_amx(backend_amx));
-
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
-    ctx->n_threads = n_threads;
-}
-
-// device interface
-
-static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
-    return "AMX";
-
-    GGML_UNUSED(dev);
-}
-
-static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
-    return "Intel Advanced Matrix Extensions";
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    GGML_UNUSED(dev);
-}
-
-static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_amx_device_get_name(dev);
-    props->description = ggml_backend_amx_device_get_description(dev);
-    props->type        = ggml_backend_amx_device_get_type(dev);
-    ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
-    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
-    props->caps = {
-        /* .async                = */ false,
-        /* .host_buffer          = */ false,
-        /* .buffer_from_host_ptr = */ false,
-        /* .events               = */ false,
-    };
-}
-
-static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
-    return ggml_backend_amx_init();
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(params);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(dev);
-}
-
-static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-
-    // handle only 2d gemm for now
-    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-    };
-
-    switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            return true;
-
-        case GGML_OP_MUL_MAT: {
-            const struct ggml_tensor * src0 = op->src[0];
-            const struct ggml_tensor * src1 = op->src[1];
-
-            const enum ggml_type type = src0->type;
-            const int64_t ne0 = op->ne[0];
-
-            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
-            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
-            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
-
-            bool can_use_amx =
-                is_contiguous_2d(src0) &&       // src0 must be contiguous
-                is_contiguous_2d(src1) &&       // src1 must be contiguous
-                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
-                has_amx_kernels &&              // with amx kernel impls
-                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
-
-            return can_use_amx;
-        }
-        default:
-            return false;
-    }
-
-    GGML_UNUSED(dev);
-}
-
-static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
-
-    GGML_UNUSED(dev);
-}
-
-static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
-    /* .get_name             = */ ggml_backend_amx_device_get_name,
-    /* .get_description      = */ ggml_backend_amx_device_get_description,
-    /* .get_memory           = */ ggml_backend_amx_device_get_memory,
-    /* .get_type             = */ ggml_backend_amx_device_get_type,
-    /* .get_props            = */ ggml_backend_amx_device_get_props,
-    /* .init_backend         = */ ggml_backend_amx_device_init,
-    /* .get_buffer_type      = */ ggml_backend_amx_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ NULL,
-    /* .supports_op          = */ ggml_backend_amx_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_amx_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// backend reg interface
-
-static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
-    return "AMX";
-
-    GGML_UNUSED(reg);
-}
-
-static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
-}
-
-static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    static ggml_backend_device ggml_backend_amx_device = {
-        /* .iface   = */ ggml_backend_amx_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ nullptr,
-    };
-
-    return &ggml_backend_amx_device;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(index);
-}
-
-static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
-        return (void *)ggml_backend_amx_set_n_threads;
-    }
-    return NULL;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(name);
-}
-
-static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
-    /* .get_name         = */ ggml_backend_amx_reg_get_name,
-    /* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_amx_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_amx_get_proc_address,
-};
-
-ggml_backend_reg_t ggml_backend_amx_reg(void) {
-    static struct ggml_backend_reg ggml_backend_amx_reg = {
-        /* .api_version = */ GGML_BACKEND_API_VERSION,
-        /* .iface       = */ ggml_backend_amx_reg_i,
-        /* .context     = */ NULL,
-    };
-
-    return &ggml_backend_amx_reg;
-}
-
-#else // if defined(__AMX_INT8__)
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
-    return nullptr;
-}
-
-bool ggml_backend_is_amx(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return false;
-}
-
-ggml_backend_t ggml_backend_amx_init(void) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-    return nullptr;
-}
-
-void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-
-    GGML_UNUSED(backend_amx);
-    GGML_UNUSED(n_threads);
-}
-
-ggml_backend_reg_t ggml_backend_amx_reg(void) {
-    return nullptr;
-}
-
-#endif
-
-GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
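For orientation, a minimal sketch of the entry points this deletion removes, as they were used while AMX was still a standalone backend (the calls are taken from the deleted file above; the surrounding driver code is hypothetical):

    // How the standalone backend used to be driven (now folded into the CPU backend):
    ggml_backend_t be = ggml_backend_amx_init();
    if (be != NULL && ggml_backend_is_amx(be)) {
        ggml_backend_amx_set_n_threads(be, 8);
        // ... allocate weights in ggml_backend_amx_buffer_type(), build a graph,
        //     then run it with ggml_backend_graph_compute(be, graph) ...
    }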
@@ -49,10 +49,6 @@
 #include "ggml-rpc.h"
 #endif

-#ifdef GGML_USE_AMX
-#  include "ggml-amx.h"
-#endif
-
 #ifdef GGML_USE_CANN
 #include "ggml-cann.h"
 #endif
@@ -92,9 +88,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
-#ifdef GGML_USE_AMX
-        register_backend(ggml_backend_amx_reg());
-#endif
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
@@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
-        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
     }

     // graph input
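The improved abort message first resolves the effective buffer of a view tensor; the same two-step pattern applies anywhere a diagnostic needs the backing buffer (a sketch built from the fields and APIs used in the hunk above):

    // A view tensor has no buffer of its own; look through view_src first.
    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer
                                                    : tensor->buffer;
    fprintf(stderr, "tensor %s: buffer %s, op %s\n",
            tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));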
@@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
    detect_ascend_soc_type(SOC_VERSION)
    set(SOC_TYPE "${SOC_VERSION}")
    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
-else()
-    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 endif()

-# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)

 if (CANN_INSTALL_DIR)
     # Only Support Linux.
@@ -21,22 +21,23 @@
  */

 #include "aclnn_ops.h"
-#include "ggml-impl.h"

+#include <aclnnop/aclnn_addcdiv.h>
 #include <aclnnop/aclnn_avgpool2d.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_cast.h>
 #include <aclnnop/aclnn_constant_pad_nd.h>
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_div.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
-#include <aclnnop/aclnn_mm.h>
-#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
+#include <aclnnop/aclnn_mm.h>
 #include <aclnnop/aclnn_permute.h>
 #include <aclnnop/aclnn_pow_tensor_tensor.h>
 #include <aclnnop/aclnn_reduce_sum.h>
@@ -56,6 +57,7 @@
 #include <exception>
 #include <vector>

+#include "ggml-impl.h"
 #include "kernels/ascendc_kernels.h"

 #define GGML_COMMON_DECL_C
@@ -1103,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
 }

 /**
- * @brief Creates an ACL tensor initialized with ones using a provided buffer.
+ * @brief Creates an ACL tensor initialized with value using a provided buffer.
  *
- * This function initializes a tensor with ones using the specified buffer and
+ * This function initializes a tensor with value using the specified buffer and
  * tensor parameters.
  *
  * @param ctx The context for the CANN backend operations.
@@ -1118,12 +1120,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
  * @param type_size The size of each element in the tensor data type.
  * @param value The value to be used for initializing the tensor (default
  * is 1.0).
- * @return An ACL tensor initialized with ones.
+ * @return An ACL tensor initialized with value.
  */
-static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
-                             size_t n_bytes, int64_t* ne, int64_t dims,
-                             aclDataType type, size_t type_size,
-                             float value = 1.0f) {
+static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
+                               size_t n_bytes, int64_t* ne, int64_t dims,
+                               aclDataType type, size_t type_size,
+                               float value = 1.0f) {
     aclTensor* acl_tensor =
         aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
     float alpha_host = 1.0f;
@@ -1165,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
     ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);

-    aclTensor* acl_gamma = aclnn_ones(
+    aclTensor* acl_gamma = aclnn_values(
         ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
         ggml_cann_type_mapping(src->type), ggml_element_size(src));
@@ -1209,9 +1211,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);

     aclTensor* mask_tensor =
-        aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
-                   GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                   ggml_element_size(src), value);
+        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
+                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
+                     ggml_element_size(src), value);

     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -1768,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }

+/**
+ * @brief Performs element-wise division of tensor1 by tensor2, multiplies the
+ * result by the scalar value and adds it to self, in place.
+ *
+ * The operation is defined as:
+ * \f[
+ *     \text{out}_i = \text{self}_i + \text{value} \times
+ *     \frac{\text{tensor1}_i}{\text{tensor2}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_self The source tensor on which the addcdiv function will be
+ * applied.
+ * @param tensor1 Numerator tensor.
+ * @param tensor2 Denominator tensor.
+ * @param value The value to be used for coefficient.
+ */
+static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
+                                  aclTensor* acl_self, aclTensor* tensor1,
+                                  aclTensor* tensor2, float value) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
+        acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+}
+
+/**
+ * @brief Matrix division, optionally in-place.
+ *
+ * This function divides each element of the source tensor `acl_src` by the
+ * tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * If `inplace` is true, `acl_dst` will not be used and the operation is
+ * performed in-place on `acl_src`. The operation is defined as:
+ * \f[
+ *     \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor.
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                             aclTensor* acl_other, aclTensor* acl_dst,
+                             bool inplace) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    if (inplace) {
+        ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
+                                                  &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+    } else {
+        ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
+                                           &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(
+            aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    }
+}
+
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                                   ggml_tensor* dst) {
     const ggml_tensor* src = dst->src[0];
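A scalar model of the addcdiv helper just added, useful for checking the formula in its comment (plain C++ with no ACL dependency; the values are illustrative only):

    #include <cassert>

    // Reference semantics of aclnn_inplace_addcdiv: self_i += value * t1_i / t2_i.
    static void addcdiv_ref(float * self, const float * t1, const float * t2,
                            float value, int n) {
        for (int i = 0; i < n; ++i) {
            self[i] += value * t1[i] / t2[i];
        }
    }

    static void addcdiv_check(void) {
        float s[3]       = {1.0f, 2.0f, 3.0f};
        const float a[3] = {2.0f, 4.0f, 9.0f};
        const float b[3] = {1.0f, 2.0f, 3.0f};
        addcdiv_ref(s, a, b, 0.5f, 3);
        assert(s[0] == 2.0f && s[1] == 3.0f && s[2] == 4.5f);
    }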
@@ -2311,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                           ctx.stream()));

     switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
+        case GGML_TYPE_F32: {
 #ifdef ASCEND_310P
-            // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            // Special operation for get_row_f32 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
             if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
+                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
                 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
 #endif
@@ -2329,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->nb);
             break;
         }
-        case GGML_TYPE_F16:
-            {
+        case GGML_TYPE_F16: {
 #ifdef ASCEND_310P
-            // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            // Special operation for get_row_f16 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
             if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                size_t dst_len =
+                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
+                    ggml_type_size(
+                        GGML_TYPE_F32);  // out is also f32, even input is f16
                 ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
 #endif
@@ -2459,8 +2551,9 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
  * @param acl_dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
-                             aclTensor* acl_weight, aclTensor* acl_dst) {
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
     int8_t cube_math_type = 2;
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
@@ -2475,8 +2568,7 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
         workspaceAddr = workspace_allocator.get();
     }

-    ACL_CHECK(
-        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }

 /**
@@ -2496,8 +2588,9 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
  * @param acl_dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
-                             aclTensor* acl_weight, aclTensor* acl_dst) {
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
     int8_t cube_math_type = 2;
     uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
@@ -2548,31 +2641,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,

     aclTensor* acl_input_tensor =
         ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
-    int64_t transpose_ne[] = {
-        bcast_weight_ne[1], bcast_weight_ne[0],
-        bcast_weight_ne[2], bcast_weight_ne[3],
-        bcast_weight_ne[4], bcast_weight_ne[5]
-    };
-    size_t transpose_nb[] = {
-        bcast_weight_nb[1], bcast_weight_nb[0],
-        bcast_weight_nb[2], bcast_weight_nb[3],
-        bcast_weight_nb[4], bcast_weight_nb[5]
-    };
+    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
+                              bcast_weight_ne[2], bcast_weight_ne[3],
+                              bcast_weight_ne[4], bcast_weight_ne[5]};
+    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
+                             bcast_weight_nb[2], bcast_weight_nb[3],
+                             bcast_weight_nb[4], bcast_weight_nb[5]};
     aclTensor* acl_weight_tensor =
         ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
     aclTensor* acl_dst =
         ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);

     switch (n_dims) {
         case 2:
             aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
             break;
         case 3:
             aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
             break;
         default:
             aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
             break;
     }

     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
@@ -2594,8 +2683,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
  * multiplication will be stored.
  */
 static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                                     ggml_tensor* dst,
                                     const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
@@ -2617,14 +2706,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

     // scale stored at the end of weight. Also need transpose.
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
+                         scale_elem_size};
     size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;

     // input
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
     size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
     size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
     void* input_buffer = src1->data;
@@ -2632,7 +2722,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
+        input_buffer =
+            input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);

         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2642,9 +2733,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }

         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer,
-            ACL_FLOAT16,
-            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
+            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
+            input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);

         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2655,7 +2745,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     size_t output_elem_size = sizeof(uint16_t);
     size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
     ggml_cann_pool_alloc output_allocator(ctx.pool());
-    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    void* output_buffer =
+        output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
     size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;

     // aclnn
@@ -2679,7 +2770,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

     // first split
     int64_t weight_ne_offset = 0;
-    int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+    int64_t weight_ne[2] = {
+        max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
+        src0->ne[0]};
     int64_t scale_ne_offset = 0;
     int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
     int64_t output_ne_offset = 0;
@@ -2687,24 +2780,21 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

         aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
             (char*)src0->data + batch0 * weight_stride,
-            ggml_cann_type_mapping(type),
-            weight_elem_size, weight_ne, weight_nb, 2,
-            ACL_FORMAT_ND, weight_ne_offset);
+            ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+            weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
         aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-            scale_offset + batch0 * scale_stride,
-            ACL_FLOAT16,
-            scale_elem_size, scale_ne, scale_nb, 2,
-            ACL_FORMAT_ND, scale_ne_offset);
+            scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+            scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+            scale_ne_offset);
         aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-            (char*)output_buffer + batch1 * output_stride,
-            ACL_FLOAT16,
-            output_elem_size, output_ne, output_nb, 2,
-            ACL_FORMAT_ND, output_ne_offset);
+            (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+            output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+            output_ne_offset);

         ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-            acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-            nullptr, nullptr, nullptr, nullptr, QK8_0,
-            acl_output_tensor, &workspaceSize, &executor));
+            acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
+            nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
+            &workspaceSize, &executor));
         if (workspaceAddr == nullptr) {
             workspaceAddr = workspace_allocator.alloc(workspaceSize);
         }
@@ -2717,28 +2807,29 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

         // other splits
         for (int64_t split = 1; split < split_size; split++) {
-            weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
-            weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+            weight_ne_offset +=
+                weight_elem_size * weight_ne[0] * weight_ne[1];
+            weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
+                               ? src0->ne[1] - (max_elem_size * split)
+                               : max_elem_size;
             scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
             scale_ne[0] = weight_ne[0];
-            output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+            output_ne_offset +=
+                output_elem_size * output_ne[0] * output_ne[1];
             output_ne[0] = weight_ne[0];

             acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type),
-                weight_elem_size, weight_ne, weight_nb, 2,
-                ACL_FORMAT_ND, weight_ne_offset);
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
             acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride,
-                ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2,
-                ACL_FORMAT_ND, scale_ne_offset);
+                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                scale_ne_offset);
             acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride,
-                ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2,
-                ACL_FORMAT_ND, output_ne_offset);
+                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                output_ne_offset);

             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                 acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
@@ -2766,11 +2857,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     }

     aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-        output_buffer,
-        ACL_FLOAT16,
-        output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
+        output_cast_nb, GGML_MAX_DIMS);
     aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
+    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
+               ggml_cann_type_mapping(dst->type));

     ACL_CHECK(aclDestroyTensor(acl_output_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
@@ -2873,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              aclTensor* acl_cos_repeat_tensor,
                              aclTensor* acl_sin_repeat_tensor,
-                             float theta_scale, bool is_neox) {
+                             float theta_scale, float freq_scale,
+                             float attn_factor, bool is_neox) {
     // init sin/cos cache; the cache has a different repeat method depending on
     // @param.is_neox

     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src1 = dst->src[1];  // position
+    ggml_tensor* src2 = dst->src[2];  // freq_factors

     // arange, [0,1,...,ne0/2]
     int64_t arange_length = src0->ne[0] / 2;
@@ -2907,11 +3000,26 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
                                                arange_length * sizeof(float_t));
     void* theta_scale_buffer = theta_scale_allocator.get();
-    aclTensor* acl_theta_scale_tensor = aclnn_ones(
+    aclTensor* acl_theta_scale_tensor = aclnn_values(
         ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
         GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
     aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);

+    // freq_scale
+    if (freq_scale != 1) {
+        aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
+    }
+
+    // freq_factors
+    if (src2) {
+        aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
+            src2->data, ggml_cann_type_mapping(src2->type),
+            ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
+        aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
+                         nullptr, true);
+        ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
+    }
+
     // position
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     int64_t position_length = src1->ne[0];
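A scalar model of the per-channel frequencies computed above may help when reading the hunk (a sketch only, not the CANN code path; names are illustrative):

    #include <cmath>
    #include <vector>

    // theta_i = freq_scale * theta_scale^i / freq_factors[i]; positions are
    // later multiplied by theta_i before taking sin/cos for the rope cache.
    std::vector<float> make_theta(int half_dims, float theta_scale, float freq_scale,
                                  const float * freq_factors /* may be null */) {
        std::vector<float> theta(half_dims);
        for (int i = 0; i < half_dims; ++i) {
            float t = freq_scale * std::pow(theta_scale, (float) i);
            if (freq_factors) {
                t /= freq_factors[i];
            }
            theta[i] = t;
        }
        return theta;
    }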
@@ -2975,6 +3083,12 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                             GGML_MAX_DIMS, ACL_FORMAT_ND);
     aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);

+    // attn_factor
+    if (attn_factor != 1) {
+        aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
+        aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
+    }
+
     // repeat
     if (is_neox) {
         int64_t repeatsArray[] = {1, 1, 1, 2};
@@ -3038,19 +3152,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));

-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-    // TODO: attn_factor != 1
-    GGML_ASSERT(attn_factor == 1);
     // TODO: n_dims <= ne0
     GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
-    // TODO: freq_scale != 1
-    GGML_ASSERT(freq_scale == 1);
-    // TODO: type == GGML_TYPE_F16
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);

     const float theta_scale = powf(freq_base, -2.0f / n_dims);
@ -3081,7 +3187,217 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||||
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
|
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
|
||||||
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
||||||
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
|
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
|
||||||
theta_scale, is_neox);
|
theta_scale, freq_scale, attn_factor, is_neox);
|
||||||
|
|
||||||
|
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||||
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||||
|
|
||||||
|
#ifdef ASCEND_310P
|
||||||
|
// Special ROPE operation for 310P
|
||||||
|
|
||||||
|
// roll input
|
||||||
|
void* input_roll_buffer;
|
||||||
|
aclTensor* acl_minus_one_tensor;
|
||||||
|
void* minus_one_scale_buffer = nullptr;
|
||||||
|
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
|
||||||
|
ggml_cann_pool_alloc minus_one_scale_allocator(
|
||||||
|
ctx.pool(), sizeof(float_t) * src0->ne[0]);
|
||||||
|
if (!is_neox) {
|
||||||
|
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
|
||||||
|
input_roll_buffer = roll_allocator.get();
|
||||||
|
int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
|
||||||
|
src0->ne[2], src0->ne[3]};
|
||||||
|
size_t input_roll_nb[GGML_MAX_DIMS];
|
||||||
|
input_roll_nb[0] = ggml_type_size(src0->type);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
|
||||||
|
}
|
||||||
|
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
|
||||||
|
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
|
||||||
|
GGML_MAX_DIMS);
|
||||||
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||||
|
src0->data, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
|
||||||
|
GGML_MAX_DIMS);
|
||||||
|
|
||||||
|
int64_t shifts[] = {1};
|
||||||
|
int64_t dims[] = {3};
|
||||||
|
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||||
|
|
||||||
|
// init [-1, 1, -1, 1, ...]
|
||||||
|
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||||
|
|
||||||
|
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||||
|
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||||
|
minus_one_nb[0] = sizeof(float_t);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||||
|
}
|
||||||
|
acl_minus_one_tensor = aclnn_values(
|
||||||
|
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||||
|
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||||
|
int64_t dim = 3;
|
||||||
|
int64_t* index = new int64_t[src0->ne[0]];
|
||||||
|
for (int i = 0; i < src0->ne[0]; i++) {
|
||||||
|
index[i] = i / 2 * 2;
|
||||||
|
}
|
||||||
|
int64_t index_num = src0->ne[0];
|
||||||
|
float value = -1;
|
||||||
|
aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
|
||||||
|
index_num, value);
|
||||||
|
} else {
|
||||||
|
// roll input: [q0,q1,q2,...] ->
|
||||||
|
// [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
|
||||||
|
input_roll_buffer = roll_allocator.get();
|
||||||
|
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
|
||||||
|
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
|
||||||
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
|
||||||
|
|
||||||
|
int64_t shifts[] = {src0->ne[0] / 2};
|
||||||
|
int64_t dims[] = {3};
|
||||||
|
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||||
|
// init [-1, -1, -1, 1, 1,1,...]
|
||||||
|
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||||
|
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||||
|
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||||
|
minus_one_nb[0] = sizeof(float_t);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||||
|
}
|
||||||
|
acl_minus_one_tensor = aclnn_values(
|
||||||
|
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||||
|
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||||
|
// -1 * first half
|
||||||
|
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
|
||||||
|
size_t first_half_nb[GGML_MAX_DIMS];
|
||||||
|
first_half_nb[0] = sizeof(float_t);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
|
||||||
|
}
|
||||||
|
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
|
||||||
|
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
|
||||||
|
first_half_nb, GGML_MAX_DIMS);
|
||||||
|
bool inplace = true;
|
||||||
|
float scale = -1;
|
||||||
|
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: n_dims < ne0
|
||||||
|
GGML_ASSERT(n_dims == src0->ne[0]);
|
||||||
|
|
||||||
|
// input * scale
|
||||||
|
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
|
||||||
|
ggml_nbytes(src0));
|
||||||
|
void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
|
||||||
|
size_t input_nb[GGML_MAX_DIMS];
|
||||||
|
input_nb[0] = ggml_type_size(src0->type);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
|
||||||
|
}
|
||||||
|
aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
|
||||||
|
input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
||||||
|
aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
|
||||||
|
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
||||||
|
|
||||||
|
aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
|
||||||
|
acl_input_roll_mul_scale_tensor);
|
||||||
|
|
||||||
|
// output
|
||||||
|
void* output_fp32_buffer;
|
||||||
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
|
||||||
|
aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
|
||||||
|
acl_sin_reshape_tensor);
|
||||||
|
aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
|
||||||
|
// TODO: ne0 != n_dims in mode2
|
||||||
|
} else if (src0->type == GGML_TYPE_F16) {
|
||||||
|
size_t input_fp32_nb[GGML_MAX_DIMS];
|
||||||
|
input_fp32_nb[0] = sizeof(float_t);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
|
||||||
|
}
|
||||||
|
ggml_cann_pool_alloc fp32_allocator1(
|
||||||
|
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||||
|
void* input_fp32_buffer1 = fp32_allocator1.get();
|
||||||
|
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
|
||||||
|
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||||
|
input_fp32_nb, GGML_MAX_DIMS);
|
||||||
|
ggml_cann_pool_alloc fp32_allocator2(
|
||||||
|
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||||
|
void* input_fp32_buffer2 = fp32_allocator2.get();
|
||||||
|
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
|
||||||
|
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||||
|
input_fp32_nb, GGML_MAX_DIMS);
|
||||||
|
|
||||||
|
ggml_cann_pool_alloc fp32_allocator(
|
||||||
|
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||||
|
output_fp32_buffer = fp32_allocator.get();
|
||||||
|
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
|
||||||
|
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||||
|
input_fp32_nb, GGML_MAX_DIMS);
|
||||||
|
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
|
||||||
|
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
|
||||||
|
input_fp32_tensor2);
|
||||||
|
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
|
||||||
|
output_fp32_tensor);
|
||||||
|
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
|
||||||
|
ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
|
||||||
|
ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// src0 == GGML_TYPE_F16
|
||||||
|
// TODO: optimization this `if` code
|
||||||
|
if (src0->type == GGML_TYPE_F16) {
|
||||||
|
ggml_cann_pool_alloc sin_final_allocator(
|
||||||
|
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
|
||||||
|
ggml_cann_pool_alloc cos_final_allocator(
|
||||||
|
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
|
||||||
|
void* sin_final_buffer = sin_final_allocator.get();
|
||||||
|
void* cos_final_buffer = cos_final_allocator.get();
|
||||||
|
|
||||||
|
int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
|
||||||
|
size_t sin_final_nb[GGML_MAX_DIMS];
|
||||||
|
sin_final_nb[0] = ggml_type_size(src0->type);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
|
||||||
|
}
|
||||||
|
aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
|
||||||
|
sin_final_buffer, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
|
||||||
|
GGML_MAX_DIMS);
|
||||||
|
aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
|
||||||
|
cos_final_buffer, ggml_cann_type_mapping(src0->type),
|
||||||
|
ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
|
||||||
|
GGML_MAX_DIMS);
|
||||||
|
|
||||||
|
aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
|
||||||
|
ggml_cann_type_mapping(src0->type));
|
||||||
|
aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
|
||||||
|
ggml_cann_type_mapping(src0->type));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||||
|
acl_sin_reshape_tensor = acl_sin_final_tensor;
|
||||||
|
acl_cos_reshape_tensor = acl_cos_final_tensor;
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t workspaceSize = 0;
|
uint64_t workspaceSize = 0;
|
||||||
aclOpExecutor* executor;
|
aclOpExecutor* executor;
|
||||||
|
@ -3093,10 +3409,9 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||||
acl_mode = 1;
|
acl_mode = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
aclTensor* acl_x = ggml_cann_create_tensor(src0);
|
|
||||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
|
||||||
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
||||||
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
|
acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
|
||||||
|
acl_dst, &workspaceSize, &executor));
|
||||||
if (workspaceSize > 0) {
|
if (workspaceSize > 0) {
|
||||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||||
workspaceAddr = workspace_allocator.get();
|
workspaceAddr = workspace_allocator.get();
|
||||||
|
@ -3105,7 +3420,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||||
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
|
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
|
||||||
executor, ctx.stream()));
|
executor, ctx.stream()));
|
||||||
|
|
||||||
ACL_CHECK(aclDestroyTensor(acl_x));
|
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||||
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
|
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
|
||||||
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||||
|
|
|
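A note on what the reworked cache computes (this sketch is mine, not part of the commit): with the new parameters, each cached angle is theta = pos * freq_scale * freq_base^(-2i/n_dims), optionally divided per dimension by the freq_factors tensor (src2), and the cached sin/cos values are scaled by attn_factor. A rough CPU-side reference, assuming ggml's usual RoPE parameterization:

#include <cmath>
#include <vector>

// Builds the per-position sin/cos cache that aclnn_cache_init now produces,
// including the freq_scale, freq_factors and attn_factor handling added above.
// freq_factors may be empty (no src2).
static void rope_cache_reference(int n_dims, int n_pos,
                                 float freq_base, float freq_scale,
                                 float attn_factor,
                                 const std::vector<float> & freq_factors,
                                 std::vector<float> & sin_out,
                                 std::vector<float> & cos_out) {
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
    sin_out.assign((size_t)n_pos * (n_dims / 2), 0.0f);
    cos_out.assign((size_t)n_pos * (n_dims / 2), 0.0f);
    for (int p = 0; p < n_pos; ++p) {
        float theta_pow = 1.0f; // theta_scale^i, built up incrementally
        for (int i = 0; i < n_dims / 2; ++i) {
            float theta = (float)p * freq_scale * theta_pow;
            if (!freq_factors.empty()) {
                theta /= freq_factors[i]; // src2 divides the frequencies
            }
            sin_out[(size_t)p * (n_dims / 2) + i] = std::sin(theta) * attn_factor;
            cos_out[(size_t)p * (n_dims / 2) + i] = std::cos(theta) * attn_factor;
            theta_pow *= theta_scale;
        }
    }
}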
@@ -1738,13 +1738,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
-            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
             float * ext_factor = (float*)((int32_t*)op->op_params + 7);
-            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
-            // TODO: with freq_factors
-            if (op->src[2] != NULL) {
-                return false;
-            }
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;

@@ -1753,21 +1747,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
-            // TODO: freq_scale != 1
-            if (*freq_scale != 1) {
-                return false;
-            }
-            // TODO: attn_factor != 1
-            if (*attn_factor != 1) {
-                return false;
-            }
-            //TODO: type == GGML_TYPE_F16
-            switch (op->src[0]->type) {
-                case GGML_TYPE_F32:
-                    return true;
-                default:
-                    return false;
-            }
+            return true;
         }
         case GGML_OP_UPSCALE: {
             // aclnnUpsampleNearest2dGetWorkspaceSize not support

@@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
 ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
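The checks above read float parameters out of the int32 op_params array with pointer casts; ggml_cann_rope reads the same array with memcpy. A minimal sketch of that idiom (the helper name is mine, not from the commit; memcpy avoids strict-aliasing issues):

#include <cstdint>
#include <cstring>

// Recover a float that was packed into an int32 parameter slot.
static float read_f32_param(const int32_t * op_params, int idx) {
    float v;
    std::memcpy(&v, op_params + idx, sizeof(float));
    return v;
}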
@@ -20,7 +20,6 @@ class DupByRows {
     // Input has four dims.
     int64_t op_block_num = GetBlockNum();
     int64_t op_block_idx = GetBlockIdx();
-    assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
     // param
     num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];

@@ -2,6 +2,15 @@
 
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support 4bit get row
+extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+    // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+    printf("Ascend310P not support 4bit get row.\n");
+}
+#else
 
 #define BUFFER_NUM 2
 

@@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
-#ifdef ASCEND_310P
-        // TODO: 310P support quantification
-#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-#endif
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);

@@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
               indices_nb_ub, output_ne_ub, output_nb_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
@@ -1,6 +1,14 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+    printf("Ascend310P not support f16->8bit quantization.\n");
+}
+#else
 
 #define BUFFER_NUM 2
 #define QK8_0 32

@@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
@@ -1,6 +1,14 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support f32->8bit quantization
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+    printf("Ascend310P not support f32->8bit quantization.\n");
+}
+#else
 
 #define BUFFER_NUM 2
 #define QK8_0 32

@@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
@@ -1,6 +1,21 @@
 #include "kernel_operator.h"
 
 using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support float->4bit quantization
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+    printf("Ascend310P not support f32->4bit quantization.\n");
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+    printf("Ascend310P not support f16->4bit quantization.\n");
+}
+#else
 
 #define BUFFER_NUM 2
 #define Group_Size 32

@@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
     op.calculate();
 }
+
+#endif // #ifdef ASCEND_310P
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+typedef struct {
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
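Size check for the new interleaved block (my sketch, assuming QK4_NL == 32 as elsewhere in ggml-common.h): four fp16 deltas plus 64 bytes of nibbles give 72 bytes per 128 weights, the same 4.5 bits/weight as four plain iq4_nl blocks, just laid out for column-interleaved kernels:

#include <cstdint>
#include <cstdio>

typedef uint16_t ggml_half; // fp16 storage, as in ggml-common.h
#define QK4_NL 32

typedef struct {
    ggml_half d[4];          // deltas for 4 iq4_nl blocks
    uint8_t   qs[QK4_NL*2];  // nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;

int main() {
    // 4 * 2 bytes of scales + 64 bytes of nibbles = 72 bytes for 128 weights
    printf("sizeof(block_iq4_nlx4) = %zu\n", sizeof(block_iq4_nlx4));
    return 0;
}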
@@ -1,12 +1,20 @@
-ggml_add_backend_library(ggml-cpu
-                         ggml-cpu.c
-                         ggml-cpu.cpp
-                         ggml-cpu-aarch64.c
-                         ggml-cpu-aarch64.h
-                         ggml-cpu-quants.c
-                         ggml-cpu-quants.h
-                         )
+ggml_add_backend_library(ggml-cpu)
+
+list (APPEND GGML_CPU_SOURCES
+    ggml-cpu.c
+    ggml-cpu.cpp
+    ggml-cpu-aarch64.c
+    ggml-cpu-aarch64.h
+    ggml-cpu-quants.c
+    ggml-cpu-quants.h
+    amx/amx.cpp
+    amx/amx.h
+    amx/mmq.cpp
+    amx/mmq.h
+    ggml-cpu-impl.h
+    )
+
+target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
 target_include_directories(ggml-cpu PRIVATE .)
 
 if (APPLE AND GGML_ACCELERATE)

@@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
     if (ACCELERATE_FRAMEWORK)
         message(STATUS "Accelerate framework found")
 
-        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
 
         target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
     else()

@@ -29,15 +37,9 @@ if (GGML_OPENMP)
     if (OpenMP_FOUND)
         message(STATUS "OpenMP found")
 
-        add_compile_definitions(GGML_USE_OPENMP)
+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
 
         target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-
-        # FIXME: should be replaced with a compiler id check
-        #if (GGML_MUSA)
-        #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
-        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
-        #endif()
     else()
         message(WARNING "OpenMP not found")
     endif()

@@ -46,11 +48,11 @@ endif()
 if (GGML_LLAMAFILE)
     message(STATUS "Using llamafile")
 
-    add_compile_definitions(GGML_USE_LLAMAFILE)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
 
-    target_sources(ggml-cpu PRIVATE
+    list(APPEND GGML_CPU_SOURCES
                     llamafile/sgemm.cpp
                     llamafile/sgemm.h)
 endif()
 
 if (GGML_CPU_HBM)

@@ -58,7 +60,7 @@ if (GGML_CPU_HBM)
 
     message(STATUS "Using memkind for CPU HBM")
 
-    add_compile_definitions(GGML_USE_CPU_HBM)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
 
     target_link_libraries(ggml-cpu PUBLIC memkind)
 endif()

@@ -72,27 +74,33 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
     message(STATUS "ARM detected")
 
     if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
+        list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
+        list(APPEND ARCH_DEFINITIONS __ARM_NEON)
+        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
 
         set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
+
+            message(STATUS "ARM feature DOTPROD enabled")
         endif ()
 
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
+
+            message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+            message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
         endif ()
 
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})

@@ -112,18 +120,24 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
+
+            message(STATUS "ARM feature DOTPROD enabled")
         endif ()
 
         set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
 
         set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
         set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
 
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
             set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
+
+            message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()
 
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
 
         list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")

@@ -163,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     if (MSVC)
         # instruction set detection for MSVC only
         if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
             include(cmake/FindSIMD.cmake)
         endif ()
         if (GGML_AVX512)

@@ -173,37 +186,31 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             # macros corresponding to the extensions.
             # Do it manually.
             if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512vbmi)
                 endif()
             endif()
             if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512vnni)
                 endif()
             endif()
             if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512bf16)
                 endif()
             endif()
             if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
             endif()
             if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
            endif()
             if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
             endif()
         elseif (GGML_AVX2)
             list(APPEND ARCH_FLAGS /arch:AVX2)

@@ -264,7 +271,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
         list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
     else()
         list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     message(STATUS "loongarch64 detected")

@@ -287,11 +294,12 @@ endif()
 
 if (GGML_CPU_AARCH64)
     message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
-    add_compile_definitions(GGML_USE_CPU_AARCH64)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
 endif()
 
-target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}")
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
 
 if (EMSCRIPTEN)
     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
ggml/src/ggml-cpu/amx/amx.cpp (new file, 196 lines)
@@ -0,0 +1,196 @@
+#include "amx.h"
+#include "common.h"
+#include "mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+// AMX buffer interface
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+}
+
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)(buffer->context);
+}
+
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    if (qtype_has_amx_kernels(tensor->type)) {
+        ggml_backend_amx_convert_weight(tensor, data, offset, size);
+    } else {
+        memcpy((char *)tensor->data + offset, data, size);
+    }
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        if (qtype_has_amx_kernels(src->type)) {
+            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
+        } else {
+            memcpy(dst->data, src->data, ggml_nbytes(src));
+        }
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "AMX";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+    return ggml_backend_amx_get_alloc_size(tensor);
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+
+static bool ggml_amx_init() {
+#if defined(__gnu_linux__)
+    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
+        fprintf(stderr, "AMX is not ready to be used!\n");
+        return false;
+    }
+    return true;
+#elif defined(_WIN32)
+    return true;
+#endif
+}
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    if (!ggml_amx_init()) {
+        return NULL;
+    }
+
+    return &ggml_backend_buffer_type_amx;
+}
+
+bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
+}
+
+bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
+    // handle only 2d gemm for now
+    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
+        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+    };
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT: {
+            const struct ggml_tensor * src0 = op->src[0];
+            const struct ggml_tensor * src1 = op->src[1];
+
+            const enum ggml_type type = src0->type;
+            const int64_t ne0 = op->ne[0];
+
+            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
+            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
+            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
+
+            bool can_use_amx =
+                is_contiguous_2d(src0) &&       // src0 must be contiguous
+                is_contiguous_2d(src1) &&       // src1 must be contiguous
+                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
+                has_amx_kernels &&              // with amx kernel impls
+                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
+
+            return can_use_amx;
+        }
+        default:
+            return false;
+    }
+}
+
+#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

ggml/src/ggml-cpu/amx/amx.h (new file, 20 lines)
@@ -0,0 +1,20 @@
+#include "ggml-backend.h"
+#include "ggml-cpu-impl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
+bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
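The Linux path of ggml_amx_init() above requests XTILEDATA permission via arch_prctl before any tile instruction runs; without it the kernel refuses to save the large tile state on context switch. A standalone probe built from the same constants (my sketch, x86-64 Linux only):

#include <cstdio>
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILEDATA  18

int main() {
#if defined(__gnu_linux__)
    // ask the kernel for permission to use the AMX tile-data state component
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        std::printf("AMX tile data not permitted for this process\n");
        return 1;
    }
    std::printf("AMX tile data enabled\n");
#else
    std::printf("non-Linux: nothing to request\n");
#endif
    return 0;
}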
@@ -1,8 +1,7 @@
 #pragma once
 
 #include "ggml.h"
-// hack until AMX is moved into the CPU backend
-#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
+#include "ggml-cpu-impl.h"
 
 #include <algorithm>
 #include <memory>

@@ -74,16 +73,24 @@ inline void parallel_for(int nth, int n, const func_t& f) {
 #endif
 }
 
+template <typename func_t>
+inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
+    int tbegin, tend;
+    balance211(n, params->nth, params->ith, tbegin, tend);
+    f(tbegin, tend);
+    ggml_barrier(params->threadpool); // TODO: might not always be needed
+}
+
 // quantized types that have AMX support
 inline bool qtype_has_amx_kernels(const enum ggml_type type) {
     // TODO: fix padding for vnni format
     return (type == GGML_TYPE_Q4_0) ||
-        (type == GGML_TYPE_Q4_1);
-        //(type == GGML_TYPE_Q8_0) ||
-        //(type == GGML_TYPE_Q4_K) ||
-        //(type == GGML_TYPE_Q5_K) ||
-        //(type == GGML_TYPE_Q6_K) ||
-        //(type == GGML_TYPE_IQ4_XS);
+        (type == GGML_TYPE_Q4_1) ||
+        (type == GGML_TYPE_Q8_0) ||
+        (type == GGML_TYPE_Q4_K) ||
+        (type == GGML_TYPE_Q5_K) ||
+        (type == GGML_TYPE_Q6_K) ||
+        (type == GGML_TYPE_IQ4_XS);
 }
 
 // ggml backend context
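parallel_for_ggml splits the iteration range with balance211 and then synchronizes with ggml_barrier. balance211 itself is not shown in this diff; a plausible sketch of that kind of even partitioning (the real helper in common.h may differ in detail):

// Split n items across nth threads as evenly as possible; the first
// (n % nth) threads get one extra item. Hypothetical reconstruction.
static void balance211_sketch(int n, int nth, int ith, int & tbegin, int & tend) {
    const int base = n / nth;
    const int rem  = n % nth;
    tbegin = ith * base + (ith < rem ? ith : rem);
    tend   = tbegin + base + (ith < rem ? 1 : 0);
}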
@ -4,8 +4,11 @@
|
||||||
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
|
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "amx.h"
|
||||||
#include "mmq.h"
|
#include "mmq.h"
|
||||||
#include "ggml-impl.h"
|
#include "ggml-impl.h"
|
||||||
|
#include "ggml-cpu-impl.h"
|
||||||
|
#include "ggml-cpu-quants.h"
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
@ -33,7 +36,7 @@
|
||||||
#define ALWAYS_INLINE inline
|
#define ALWAYS_INLINE inline
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AMX_INT8__)
|
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
@ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k);
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
|
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
|
||||||
// FIXME: using unoptimized reference impl until moved to CPU backend
|
quantize_row_q8_0(x, (block_q8_0 *)vy, k);
|
||||||
quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
|
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
|
||||||
quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
|
quantize_row_q8_1(x, (block_q8_1 *)vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
|
@ -950,7 +952,7 @@ template<typename TB, typename packed_B_t = packed_B_type<TB>>
|
||||||
void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
|
void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
|
||||||
GGML_UNUSED(tile);
|
GGML_UNUSED(tile);
|
||||||
GGML_UNUSED(packed_B);
|
GGML_UNUSED(packed_B);
|
||||||
};
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
|
void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
|
||||||
|
@ -2327,9 +2329,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) {
|
||||||
|
|
||||||
// pack weight to vnni format
|
// pack weight to vnni format
|
||||||
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now
|
||||||
size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor);
|
|
||||||
GGML_ASSERT(alloc_size == size);
|
|
||||||
|
|
||||||
const enum ggml_type TYPE = tensor->type;
|
const enum ggml_type TYPE = tensor->type;
|
||||||
|
|
||||||
|
@ -2348,6 +2348,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
|
||||||
|
struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
|
const enum ggml_type TYPE = src0->type;
|
||||||
|
|
||||||
|
const bool is_floating_type = TYPE == GGML_TYPE_F16;
|
||||||
|
if (is_floating_type) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int M = dst->ne[1];
|
||||||
|
const int K = src0->ne[0];
|
||||||
|
|
||||||
|
size_t desired_wsize = 0;
|
||||||
|
|
||||||
|
GGML_DISPATCH_QTYPES(TYPE, [&] {
|
||||||
|
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
|
||||||
|
desired_wsize = M * row_size_A;
|
||||||
|
});
|
||||||
|
|
||||||
|
return desired_wsize;
|
||||||
|
}
|
||||||
|
|
||||||
// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
|
// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
|
||||||
//
|
//
|
||||||
// src0: weight in shape of {N, K}, quantized
|
// src0: weight in shape of {N, K}, quantized
|
||||||
|
@ -2356,14 +2379,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
|
||||||
//
|
//
|
||||||
// the function performs: dst = src1 @ src0.T
|
// the function performs: dst = src1 @ src0.T
|
||||||
//
|
//
|
||||||
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
|
void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||||
struct ggml_tensor * src0 = dst->src[0];
|
struct ggml_tensor * src0 = dst->src[0];
|
||||||
struct ggml_tensor * src1 = dst->src[1];
|
struct ggml_tensor * src1 = dst->src[1];
|
||||||
|
|
||||||
const enum ggml_type TYPE = src0->type;
|
const enum ggml_type TYPE = src0->type;
|
||||||
|
|
||||||
const int n_threads = ctx->n_threads;
|
|
||||||
|
|
||||||
// f16 only has avx512 kernels for now,
|
// f16 only has avx512 kernels for now,
|
||||||
// amx kernels will be added once 6th gen xeon is released.
|
// amx kernels will be added once 6th gen xeon is released.
|
||||||
const bool is_floating_type = TYPE == GGML_TYPE_F16;
|
const bool is_floating_type = TYPE == GGML_TYPE_F16;
|
||||||
|
@ -2379,7 +2400,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor
|
||||||
const int MB = div_up(M, BLOCK_M);
|
const int MB = div_up(M, BLOCK_M);
|
||||||
const int NB = div_up(N, BLOCK_N);
|
const int NB = div_up(N, BLOCK_N);
|
||||||
|
|
||||||
parallel_for(n_threads, MB * NB, [&](int begin, int end) {
|
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
|
||||||
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
|
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
|
||||||
for (int i = begin; i < end; ++i) {
|
for (int i = begin; i < end; ++i) {
|
||||||
int mb = i / NB;
|
int mb = i / NB;
|
||||||
@@ -2412,27 +2433,29 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
     }
 
     // pointer to work space, used convert A from float to quantized type
-    void * wdata = nullptr;
+    void * wdata = params->wdata;
 
     //TODO: performance improvement: merge quant A
-    GGML_DISPATCH_QTYPES(TYPE, [&] {
-        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
-        const size_t desired_wsize = M * row_size_A;
-        if (ctx->work_size < desired_wsize) {
-            ctx->work_data.reset(new char[desired_wsize]);
-            ctx->work_size = desired_wsize;
+    if (params->ith == 0) {
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+            const size_t desired_wsize = M * row_size_A;
+            if (params->wsize < desired_wsize) {
+                GGML_ABORT("insufficient work space size");
+            }
         }
-        wdata = ctx->work_data.get();
 
         // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
         // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
         GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
 
         const float * A_data = static_cast<const float *>(src1->data);
         for (int m = 0; m < M; ++m) {
             from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
         }
     });
+    }
 
+    ggml_barrier(params->threadpool);
 
     if (M == 1) {
         // MB = 1 and handle 8 tiles in each block
@@ -2440,7 +2463,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
         constexpr int BLOCK_N = TILE_N * kTilesN;
         const int NB = div_up(N, BLOCK_N);
 
-        parallel_for(n_threads, NB, [&](int begin, int end) {
+        parallel_for_ggml(params, NB, [&](int begin, int end) {
             GGML_DISPATCH_QTYPES(TYPE, [&] {
                 const int KB = K / blck_size;
                 const int TILE_SIZE = get_tile_size<type>();
@@ -2470,7 +2493,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
     const int MB = div_up(M, BLOCK_M);
     const int NB = div_up(N, BLOCK_N);
 
-    parallel_for(n_threads, MB * NB, [&](int begin, int end) {
+    parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
         // init tile config for each thread
         ggml_tile_config_init();
 
@@ -2498,13 +2521,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
     });
 }
 
-#else // if defined(__AMX_INT8__)
-
-void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(dst);
-}
-
-#endif // if defined(__AMX_INT8__)
+#endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
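Reviewer note: the hunks above drop the AMX backend's private scratch buffer and thread count in favour of the shared `wdata`/`wsize`/`threadpool` carried by `ggml_compute_params` — thread 0 quantizes A into the shared buffer, then every thread waits on the threadpool barrier before consuming it. A minimal sketch of that synchronization pattern, using C++20 `std::barrier` as a stand-in for `ggml_barrier` (everything below is illustrative, not ggml API):

    #include <barrier>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Sketch: one thread fills a shared work buffer, all threads sync,
    // then each thread consumes its own slice. Mirrors the ith==0 +
    // ggml_barrier() structure of the patched AMX mul_mat path.
    int main() {
        const int nth = 4;               // number of threads (params->nth)
        const int M   = 16;              // rows to "quantize"
        std::vector<float> wdata(M);     // shared work buffer (params->wdata)
        std::barrier sync(nth);          // stand-in for ggml_barrier(threadpool)

        std::vector<std::thread> workers;
        for (int ith = 0; ith < nth; ++ith) {
            workers.emplace_back([&, ith] {
                if (ith == 0) {                 // only thread 0 prepares the buffer
                    for (int m = 0; m < M; ++m) {
                        wdata[m] = 0.5f * m;    // pretend quantization of row m
                    }
                }
                sync.arrive_and_wait();         // everyone waits for the buffer
                for (int m = ith; m < M; m += nth) {
                    std::printf("thread %d consumes row %d -> %.1f\n", ith, m, wdata[m]);
                }
            });
        }
        for (auto & w : workers) w.join();
    }

The design keeps allocation out of the hot path: the planner sizes the buffer once up front (see the `ggml_graph_plan` hunk further down), so the kernel may only abort on a short buffer, never allocate.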
@@ -1,6 +1,5 @@
 #pragma once
 #include "common.h"
-#include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
 
 void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
 
-void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
@@ -1,7 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"
 
@@ -187,6 +183,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
 }
 #endif
 
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
 static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -528,7 +526,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     UNUSED(blocklen);
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -996,6 +994,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
         }
     }
 }
+
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float * res_ptr = s;
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            float32x4_t sumf = vdupq_n_f32(0);
+            for (int l = 0; l < nb; l++) {
+                uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+                uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+                uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+                uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+                int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+                int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+                int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+                int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+                int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+                int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+                int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+                int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+                int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+                int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+                int32x4_t sumi = vdupq_n_s32(0);
+                sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+                sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+                sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+                sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+                sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+                sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+                sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+                sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+                float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+                float32x4_t d = a_d * b_d;
+
+                sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
+            }
+
+            vst1q_f32(res_ptr + x * 4, sumf);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
 
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
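The new kernels decode IQ4_NL through a 16-entry signed codebook: each byte of `qs` packs two 4-bit indices, and `vqtbl1q_s8` performs sixteen lookups per instruction. A small standalone sketch of the scalar decode, reusing the same table values (the `packed` byte is an arbitrary example):

    #include <cstdint>
    #include <cstdio>

    // The IQ4_NL codebook from the patch: 4-bit index -> int8 value.
    static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10,
                                                1,   13,  25,  38,  53,  69,  89, 113};

    int main() {
        const uint8_t packed = 0xC3;                      // low nibble 0x3, high nibble 0xC
        const int8_t v_lo = kvalues_iq4nl[packed & 0x0F]; // -65
        const int8_t v_hi = kvalues_iq4nl[packed >> 4];   //  53
        std::printf("lo=%d hi=%d\n", v_lo, v_hi);
        // The NEON kernels do the same lookup 16 lanes at a time:
        // vqtbl1q_s8(kvalues, b & 0x0F) and vqtbl1q_s8(kvalues, b >> 4).
    }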
@@ -1017,7 +1111,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     UNUSED(blocklen);
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -3386,6 +3480,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
         }
     }
 }
+
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+                float32x4_t sumf[4];
+                for (int m = 0; m < 4; m++) {
+                    sumf[m] = vdupq_n_f32(0);
+                }
+
+                for (int l = 0; l < nb; l++) {
+                    float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+                    float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+
+                    int32x4_t sumi_0 = vdupq_n_s32(0);
+                    int32x4_t sumi_1 = vdupq_n_s32(0);
+                    int32x4_t sumi_2 = vdupq_n_s32(0);
+                    int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                    for (int k = 0; k < 4; k++) {
+                        int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+                        int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
+                        uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+                        int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+                        int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
+                    }
+
+                    sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                    sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                    sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                    sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+                }
+
+                for (int m = 0; m < 4; m++) {
+                    vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+                }
+            }
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
 
 // FIXME: this code is duplicated from ggml-aarch64.c
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
@@ -3518,6 +3723,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) {
 
     GGML_UNUSED(data_size);
 }
+
+static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 2 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nl dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
 
 // Prepare for optimized kernels if applicable
 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
     if (cur->type == repack_type) {
@@ -3525,20 +3794,30 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
         return;
     }
 
-    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-
-    switch (repack_type) {
-        case GGML_TYPE_Q4_0_8_8:
-            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_8:
-            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_4:
-            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-            break;
-        default:
-            GGML_ABORT("Unsupported type");
+    if (cur->type == GGML_TYPE_Q4_0) {
+        switch (repack_type) {
+            case GGML_TYPE_Q4_0_8_8:
+                repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_8:
+                repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_4:
+                repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        switch (repack_type) {
+            case GGML_TYPE_IQ4_NL_4_4:
+                repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else {
+        GGML_ABORT("Unsupported type");
     }
 }
@@ -3551,9 +3830,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             return GGML_TYPE_Q4_0_4_8;
         }
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             return GGML_TYPE_Q4_0_4_4;
         }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            return GGML_TYPE_IQ4_NL_4_4;
+        }
     }
 
     return cur->type;
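For reference, `make_block_iq4_nlx4` above interleaves four rows' quant bytes in fixed-size groups so a single contiguous load feeds four output columns. A minimal sketch of the same round-robin interleave on plain byte arrays (`interleave4` is a hypothetical helper, not part of ggml):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Interleave 4 rows of `row_bytes` bytes in groups of `group` bytes:
    // output group i comes from row (i % 4), group (i / 4) of that row.
    // This mirrors the memcpy loop in make_block_iq4_nlx4.
    void interleave4(const uint8_t rows[4][16], uint8_t * out, int row_bytes, int group) {
        const int ngroups = 4 * row_bytes / group;
        for (int i = 0; i < ngroups; ++i) {
            const int src_row = i % 4;
            const int src_off = (i / 4) * group;
            std::memcpy(out + i * group, &rows[src_row][src_off], group);
        }
    }

    int main() {
        uint8_t rows[4][16];
        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 16; ++c)
                rows[r][c] = (uint8_t)(r * 16 + c);

        uint8_t out[64];
        interleave4(rows, out, 16, 4); // 4-byte groups, as in the NEON 4x4 layout
        for (int i = 0; i < 16; ++i) std::printf("%3u ", out[i]);
        std::printf("\n"); // 0 1 2 3 | 16 17 18 19 | 32 33 34 35 | 48 49 50 51 ...
    }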
@@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
 enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
@@ -15,6 +15,18 @@
 extern "C" {
 #endif
 
+struct ggml_compute_params {
+    // ith = thread index, nth = number of threads
+    int ith, nth;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+
 #if defined(_MSC_VER)
 
 #define m512bh(p) p
@@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
 }
 #endif
 
+// TODO: move to ggml-threading
+void ggml_barrier(struct ggml_threadpool * tp);
+
 #ifdef __cplusplus
 }
 #endif
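With `ggml_compute_params` now visible to every CPU source file, kernels can share one convention for splitting work: derive a per-thread slice from `ith`/`nth`. A tiny sketch of that partitioning, with `compute_params` as a stripped-down stand-in for the real struct:

    #include <algorithm>
    #include <cstdio>

    // Minimal stand-in for the fields a kernel actually reads.
    struct compute_params { int ith, nth; };

    // Give thread `ith` a contiguous slice of `nrows`, rounding the
    // per-thread count up so the last thread takes the remainder.
    void process_rows(const compute_params & p, int nrows) {
        const int dr  = (nrows + p.nth - 1) / p.nth;
        const int ir0 = dr * p.ith;
        const int ir1 = std::min(ir0 + dr, nrows);
        std::printf("thread %d/%d: rows [%d, %d)\n", p.ith, p.nth, ir0, ir1);
    }

    int main() {
        for (int ith = 0; ith < 3; ++ith) {
            process_rows({ith, 3}, 10); // [0,4) [4,8) [8,10)
        }
    }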
@@ -1791,11 +1791,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
-            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
             float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -1811,13 +1812,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
 
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                        l1, r1)), l2, r2)), l3, r3))), scale);
         }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
 
-        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
 
         return;
     }
 #endif
@@ -2345,10 +2348,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             const block_q8_1 * restrict b_y0 = &vy0[i];
             const block_q8_1 * restrict b_y1 = &vy1[i];
 
-            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                    GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
-                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+            float32_t summs_t[4] = {
+                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
+            };
             summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
 
             const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -2369,10 +2374,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
             // mmla into int32x4_t
-            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
-                                   GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
-                                   GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
-                                   GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
             float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -2387,15 +2394,17 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
             int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                        l1, r1)), l2, r2)), l3, r3))), scale);
         }
 
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
 
         sumv2 = vaddq_f32(sumv2, summs0);
 
         vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
 
         return;
     }
 #endif
@@ -3372,10 +3381,12 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
-            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                   GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
             float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3391,13 +3402,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
 
             sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                        l1, r1)), l2, r2)), l3, r3))), scale);
         }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
 
-        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
 
         return;
     }
 #endif
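The q4_1 scale hunk above is more than a reflow: `b_y0->d` is a `ggml_half`, so the old code multiplied by the raw 16-bit pattern instead of the decoded value. A standalone sketch of the difference, with `fp16_to_fp32` as a simplified stand-in for `GGML_FP16_TO_FP32` (no NaN/Inf handling):

    #include <cstdint>
    #include <cstdio>
    #include <cmath>

    // Minimal IEEE fp16 -> fp32 decode, just to contrast raw bits vs value.
    float fp16_to_fp32(uint16_t h) {
        const int sign = (h >> 15) & 1;
        const int exp  = (h >> 10) & 0x1F;
        const int mant =  h        & 0x3FF;
        if (exp == 0) return (sign ? -1.f : 1.f) * std::ldexp((float)mant, -24);
        return (sign ? -1.f : 1.f) * std::ldexp(1.f + mant / 1024.f, exp - 15);
    }

    int main() {
        const uint16_t d = 0x3C00; // fp16 bit pattern for 1.0f
        std::printf("raw bits as a number: %u\n", (unsigned)d);      // 15360 -- wrong scale
        std::printf("decoded value:        %f\n", fp16_to_fp32(d));  // 1.0   -- correct
    }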
@@ -10,6 +10,7 @@
 #include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
+#include "amx/amx.h"
 #include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -109,10 +110,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
+    int has_dotprod;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
 #endif
@@ -446,6 +448,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .from_float = NULL,
+        .vec_dot = NULL,
+        .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
+        .ncols = 4,
+        .gemv = ggml_gemv_iq4_nl_4x4_q8_0,
+        .gemm = ggml_gemm_iq4_nl_4x4_q8_0,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -614,7 +625,7 @@ do { \
     for (int i = 0; i < offset; ++i) { \
         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
     } \
-    res = _mm512_reduce_add_ps(x[0]); \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
 } while (0)
 
 // TODO: is this optimal ?
@@ -664,7 +675,7 @@ do { \
     for (int i = 0; i < offset; ++i) { \
         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
     } \
-    res = _mm512_reduce_add_ps(x[0]); \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
 } while (0)
 
 #define GGML_F16_VEC                GGML_F32Cx16
@@ -675,8 +686,8 @@ do { \
 #define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
 #define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
 #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
 #elif defined(__AVX__)
 
 #define GGML_SIMD
@@ -1168,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     int offset = GGML_F32_ARR >> 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
-    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
+    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
+    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
     tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
     const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i)t0, 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
+    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
+    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
     tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
     res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
@@ -1357,31 +1368,15 @@ struct ggml_compute_state {
     int ith;
 };
 
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_threadpool * threadpool;
-};
-
 //
 // fundamental operations
 //
 
 inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v;    }
 inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];       }
@@ -2276,7 +2271,7 @@ struct ggml_state {
 
 static struct ggml_state g_state = {0};
 
-static void ggml_barrier(struct ggml_threadpool * tp) {
+void ggml_barrier(struct ggml_threadpool * tp) {
     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
         return;
@@ -2439,6 +2434,7 @@ static void ggml_init_arm_arch_features(void) {
     uint32_t hwcap2 = getauxval(AT_HWCAP2);
 
     ggml_arm_arch_features.has_neon    = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
     ggml_arm_arch_features.has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
     ggml_arm_arch_features.has_sve     = !!(hwcap & HWCAP_SVE);
 
@@ -2453,6 +2449,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_neon = oldp;
 
+    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_dotprod = oldp;
+
     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
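Runtime detection keeps a single binary correct across cores with and without the dotprod extension: Linux exposes it as the `HWCAP_ASIMDDP` bit of `getauxval(AT_HWCAP)` — which must be tested with bitwise `&`, since a logical `&&` degenerates into "hwcap is non-zero" — and macOS exposes it via the `hw.optional.arm.FEAT_DotProd` sysctl. A standalone sketch of the Linux path, independent of the ggml sources:

    #include <cstdio>

    #if defined(__linux__) && defined(__aarch64__)
    #include <sys/auxv.h>
    #include <asm/hwcap.h>

    int main() {
        const unsigned long hwcap = getauxval(AT_HWCAP);
        // Bitwise AND against the feature mask; `hwcap && HWCAP_ASIMDDP`
        // would collapse to "hwcap is non-zero", which is always true here.
        const bool has_dotprod = (hwcap & HWCAP_ASIMDDP) != 0;
        std::printf("dotprod: %s\n", has_dotprod ? "yes" : "no");
    }
    #else
    int main() { std::printf("not a linux/aarch64 build\n"); }
    #endif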
@@ -7439,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
         type = (enum ggml_type)(intptr_t)src0->extra;
     }
 
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        ggml_backend_amx_mul_mat(params, dst);
+        return;
+    }
+#endif
+
     enum ggml_type           const vec_dot_type      = type_traits_cpu[type].vec_dot_type;
     ggml_from_float_t        const from_float        = type_traits_cpu[vec_dot_type].from_float;
     ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@@ -7560,14 +7568,6 @@ UseGgmlGemm2:;
     // This is the size of the rest of the dimensions of the result
     const int64_t nr1 = ne1 * ne2 * ne3;
 
-    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
-    // TODO: currently the mmla kernels support only even numbered rows/cols.
-    // this check can be removed once they are extended to support odd numbered rows/cols too
-    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        num_rows_per_vec_dot = 1;
-    }
-
     // Now select a reasonable chunk size.
     int chunk_size = 16;
 
@@ -7630,6 +7630,15 @@ UseGgmlGemm2:;
         const int64_t ir1_start = dr1 * ith1;
         const int64_t ir1_end   = MIN(ir1_start + dr1, nr1);
 
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int64_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // these checks are needed to avoid crossing dim1 boundaries
+        // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+
         ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
 
         if (nth >= nchunk0 * nchunk1) {
@@ -9133,6 +9142,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_IQ4_NL_4_4:
        case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -13276,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
                 } break;
             case GGML_OP_MUL_MAT:
                 {
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+                    if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
+                        cur = ggml_backend_amx_desired_wsize(node);
+                    }
+#endif
                     const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
 
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = MAX(cur, cur2);
                     }
                 } break;
             case GGML_OP_MUL_MAT_ID:
@@ -13880,6 +13896,14 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 
+int ggml_cpu_has_dotprod(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+    return ggml_arm_arch_features.has_dotprod;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.has_sve;
@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-cpu-aarch64.h"
 #include "ggml-impl.h"
+#include "amx/amx.h"
 #include <cctype>
 #include <string>
 #include <vector>
@@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
     static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
 
-#ifdef GGML_USE_CPU_HBM
-        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+        if (ggml_backend_amx_buffer_type()) {
+            bufts.push_back(ggml_backend_amx_buffer_type());
+        }
 #endif
 
 #ifdef GGML_USE_CPU_AARCH64
-        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type())
+        if (ggml_backend_cpu_aarch64_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        }
 #endif
 
         bufts.push_back(NULL);
@@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
 
+    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
+        return true;
+    }
+
     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
             return false;
         }
     }
 
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        return ggml_backend_amx_device_supports_op(op);
+    }
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+#endif
+
     for (int i = 1; i < GGML_MAX_SRC; i++) {
         if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
             return false;
@@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
 }
 
 static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
+    bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    supported = supported || ggml_backend_amx_buft_is_amx(buft);
+#endif
+
+    return supported;
 
     GGML_UNUSED(dev);
 }
@@ -50,8 +50,7 @@
 
 #include "sgemm.h"
 #include "ggml-impl.h"
-// hack until moved into the CPU backend
-#include "../ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
 #ifdef _MSC_VER
@@ -47,9 +47,20 @@
 #define CC_TURING     750
 #define CC_AMPERE     800
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
+
+// GCN/CDNA, wave size is 64
+#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renaming
+#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+
+// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
 
 #define CC_QY1        210
 #define CC_QY2        220
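These defines keep NVIDIA and AMD targets in one integer space: AMD gfx versions are shifted up by `CC_OFFSET_AMD`, so plain comparisons such as `cc < CC_VOLTA` can never match an AMD device by accident. A small sketch of the encoding and of the kind of predicate the CDNA/Vega20 exceptions below rely on (`prefer_mmq` is illustrative, mirroring `ggml_cuda_should_use_mmq`):

    #include <cstdio>

    // Same scheme as common.cuh: NVIDIA uses its native compute capability,
    // AMD archs live at CC_OFFSET_AMD + gfx version.
    constexpr int CC_OFFSET_AMD = 1000000;
    constexpr int CC_VOLTA      = 700;
    constexpr int CC_VEGA20     = CC_OFFSET_AMD + 906;
    constexpr int CC_CDNA       = CC_OFFSET_AMD + 908;
    constexpr int CC_RDNA3      = CC_OFFSET_AMD + 1100;

    constexpr bool is_amd(int cc) { return cc >= CC_OFFSET_AMD; }

    // Prefer the integer MMQ kernels except on RDNA3/CDNA/Vega20,
    // where the BLAS-style paths win for large batches.
    bool prefer_mmq(int cc, long ne11, long max_batch) {
        return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < max_batch;
    }

    int main() {
        std::printf("VEGA20 is AMD: %d\n", is_amd(CC_VEGA20));
        std::printf("prefer_mmq(CC_VOLTA, 128, 64) = %d\n", prefer_mmq(CC_VOLTA, 128, 64));
        std::printf("prefer_mmq(CC_CDNA,  128, 64) = %d\n", prefer_mmq(CC_CDNA,  128, 64));
    }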
@@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half alpha_f16 = 1.0f;
         const half beta_f16  = 0.0f;
 
+        cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+        if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+            cu_compute_type = CUBLAS_COMPUTE_32F;
+        }
+
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
         CUBLAS_CHECK(
             cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
@@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas(
                     &alpha_f16, src0_ptr,      CUDA_R_16F, ne00,
                                 src1_ptr,      CUDA_R_16F, ne10,
                     &beta_f16,  dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
+                    cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t      cu_data_type    = CUDA_R_16F;
 
+    if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+    }
+
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
@@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;
 
-    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;
ggml/src/ggml-cuda/vendors/hip.h:
@@ -95,6 +95,14 @@
 
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
+#define CDNA
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
@@ -30,11 +30,13 @@
 extern "C" {
 #endif
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#ifndef MIN
+#    define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef MAX
+#    define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
 
 // required for mmap as gguf only guarantees 32-byte alignment
 #define TENSOR_ALIGNMENT 32
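
For illustration (not part of the patch): the change above stops the header from clobbering MIN/MAX with #undef and instead defines them only when absent, so a definition pulled in earlier (for example by a system header) survives and redefinition warnings are avoided. A self-contained sketch of the guard pattern:

    #include <cstdio>

    #define MIN(a, b) ((a) < (b) ? (a) : (b)) // pre-existing definition from an earlier header

    #ifndef MIN
    #    define MIN(a, b) ((a) < (b) ? (a) : (b)) // skipped: MIN is already defined
    #endif

    #ifndef MAX
    #    define MAX(a, b) ((a) > (b) ? (a) : (b)) // taken: first definition of MAX
    #endif

    int main() {
        printf("%d %d\n", MIN(2, 3), MAX(2, 3)); // prints: 2 3
        return 0;
    }
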
@@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         kompute-shaders/op_getrows_q4_0.comp
         kompute-shaders/op_getrows_q4_1.comp
         kompute-shaders/op_getrows_q6_k.comp
-        kompute-shaders/op_rope_f16.comp
-        kompute-shaders/op_rope_f32.comp
+        kompute-shaders/op_rope_norm_f16.comp
+        kompute-shaders/op_rope_norm_f32.comp
+        kompute-shaders/op_rope_neox_f16.comp
+        kompute-shaders/op_rope_neox_f32.comp
         kompute-shaders/op_cpy_f16_f16.comp
         kompute-shaders/op_cpy_f16_f32.comp
         kompute-shaders/op_cpy_f32_f16.comp
@@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         shaderop_getrows_q4_0.h
         shaderop_getrows_q4_1.h
         shaderop_getrows_q6_k.h
-        shaderop_rope_f16.h
-        shaderop_rope_f32.h
+        shaderop_rope_norm_f16.h
+        shaderop_rope_norm_f32.h
+        shaderop_rope_neox_f16.h
+        shaderop_rope_neox_f32.h
         shaderop_cpy_f16_f16.h
         shaderop_cpy_f16_f32.h
         shaderop_cpy_f32_f16.h
@@ -28,8 +28,10 @@
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
 #include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope_f16.h"
-#include "shaderop_rope_f32.h"
+#include "shaderop_rope_norm_f16.h"
+#include "shaderop_rope_norm_f32.h"
+#include "shaderop_rope_neox_f16.h"
+#include "shaderop_rope_neox_f32.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
 #include "shaderop_cpy_f32_f16.h"
@@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
         vk::DescriptorPoolSize(
             vk::DescriptorType::eStorageBuffer,
-            3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+            4 * size // Descriptor count is number of possible tensors to pass into an algorithm
         )
     };
 
@@ -788,7 +790,8 @@ static void ggml_vk_soft_max(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
-    float scale
+    float scale, float max_bias, float m0, float m1,
+    uint32_t n_head_log2
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
                                              kp::shader_data::op_softmax_comp_spv_len);
@@ -796,12 +799,14 @@ static void ggml_vk_soft_max(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        float scale;
+        float scale, max_bias, m0, m1;
+        uint32_t n_head_log2;
         int32_t mask;
     } pushConsts {
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        scale,
+        scale, max_bias, m0, m1,
+        n_head_log2,
         bool(inB)
     };
 
@@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02,
-    uint32_t nb00, uint32_t nb01, uint32_t nb02,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
-    uint32_t nb10, uint32_t nb11, uint32_t nb12,
+    uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
     int32_t ne0, int32_t ne1,
     uint32_t r2, uint32_t r3
 ) {
@@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        uint32_t nb00, nb01, nb02;
+        uint32_t nb00, nb01, nb02, nb03;
         int32_t ne10, ne11, ne12;
-        uint32_t nb10, nb11, nb12;
+        uint32_t nb10, nb11, nb12, nb13;
         int32_t ne0, ne1;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        nb00, nb01, nb02,
+        nb00, nb01, nb02, nb03,
         ne10, ne11, ne12,
-        nb10, nb11, nb12,
+        nb10, nb11, nb12, nb13,
         ne0, ne1,
         r2, r3
     };
@@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
     int32_t ne00, int32_t ne01, int32_t ne02,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
     int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
     uint32_t r2, uint32_t r3
 ) {
     struct PushConstants {
@@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
         int32_t ne00, ne01, ne02;
         int32_t ne10, ne12;
         int32_t ne0, ne1;
+        uint32_t nb01, nb02, nb03;
+        uint32_t nb11, nb12, nb13;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
         ne10, ne12,
         ne0, ne1,
+        nb01, nb02, nb03,
+        nb11, nb12, nb13,
         r2, r3
     };
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
@@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10,
-    int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0,
-    int32_t ne1, int32_t r2, int32_t r3
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
                                              kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
-        0, 0, 0,
-        ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
                                              kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
         inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        const uint32_t local_x = 2;
+        const uint32_t local_y = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1217,10 +1244,11 @@ static void ggml_vk_rope(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& inC,
     const std::shared_ptr<kp::Tensor>& out,
-    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
     ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
-    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
+    float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
     int32_t ne01, int32_t ne02, int32_t ne03,
     uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne0,
@@ -1228,11 +1256,17 @@ static void ggml_vk_rope(
 ) {
     GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
 
-    static const auto spirv_f16 = getSpirvShader(
-        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
-    );
-    static const auto spirv_f32 = getSpirvShader(
-        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
+    static const auto spirv_norm_f16 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
+    );
+    static const auto spirv_norm_f32 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
+    );
+    static const auto spirv_neox_f16 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
+    );
+    static const auto spirv_neox_f32 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
     );
 
     int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@@ -1247,32 +1281,40 @@ static void ggml_vk_rope(
     GGML_ASSERT(nb0 % type_size == 0);
 
     struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
+        uint32_t inAOff, inBOff, inCOff, outOff;
         int32_t n_dims, mode, n_ctx_orig;
-        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+        float freq_base, freq_scale;
+        bool has_freq_factors;
+        float ext_factor, attn_factor, beta_fast, beta_slow;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
-        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
+        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
         n_dims, mode, n_ctx_orig,
-        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+        freq_base, freq_scale,
+        has_freq_factors,
+        ext_factor, attn_factor, beta_fast, beta_slow,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
     };
 
-    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
+    auto & inC_ = inC ? inC : inA;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_f16 = src0t == GGML_TYPE_F16;
+
+    auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
+        auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
         s_algo = komputeManager()->algorithm<float, PushConstants>(
-            name, s_kompute_context->pool.get(), {inA, inB, out},
-            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
+            name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
             {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
         );
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
-        s_algo->setTensors({inA, inB, out});
+        s_algo->setTensors({inA, inB, inC_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
 }
 
 static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    int64_t n = ggml_nelements(op);
     switch (op->op) {
         case GGML_OP_UNARY:
+            if (n % 4 != 0) return false;
             switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU:
+                    if (n % 8 != 0) return false;
+                    // fall through
+                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_SILU:
                     return ggml_is_contiguous(op->src[0]);
                 default:
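
For illustration (not part of the patch): the supports_op change above rejects unary ops whose element count is not a multiple of 4 (8 for GELU), presumably because the corresponding Kompute shaders process elements in fixed-width groups; that reading is an assumption, not stated in the diff. The gate restated as a standalone, hypothetical supports_unary() helper:

    #include <cstdint>
    #include <cstdio>

    enum class UnaryOp { RELU, GELU, SILU };

    // Same decision logic as the patched switch: every unary op needs
    // n % 4 == 0, GELU additionally needs n % 8 == 0, and the input
    // tensor must be contiguous.
    static bool supports_unary(UnaryOp op, int64_t n, bool contiguous) {
        if (n % 4 != 0) return false;
        if (op == UnaryOp::GELU && n % 8 != 0) return false;
        return contiguous;
    }

    int main() {
        printf("%d\n", (int) supports_unary(UnaryOp::GELU, 12, true)); // 0: 12 % 8 != 0
        printf("%d\n", (int) supports_unary(UnaryOp::RELU, 12, true)); // 1
        return 0;
    }
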
@@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
 
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q6_K:
                     return op->ne[3] == 1;
+                case GGML_TYPE_Q6_K:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
@@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
     const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
     uint32_t off_src0 = 0;
     uint32_t off_src1 = 0;
+    uint32_t off_src2 = 0;
     uint32_t off_dst  = 0;
     const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
     const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
+    const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
     const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst, &off_dst)  : nullTensor;
 
     switch (dst->op) {
@@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                 GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
 
-#pragma message("TODO: add ALiBi support")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
-                GGML_ASSERT(max_bias == 0.0f);
+                const int64_t nrows_x = ggml_nrows(src0);
+                const int64_t nrows_y = src0->ne[1];
 
-                ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
+                const uint32_t n_head      = nrows_x/nrows_y;
+                const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
             } break;
             case GGML_OP_DIAG_MASK_INF:
             {
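
For illustration (not part of the patch): the soft_max hunk above computes the two ALiBi base slopes m0 and m1 from max_bias and the head count. Other ggml backends expand that pair into one slope per head, as m0^(h+1) for the first n_head_log2 heads and m1^(2(h-n_head_log2)+1) for the rest; a host-side sketch of that expansion, assuming example values:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const float    max_bias    = 8.0f; // example value; 0.0f disables ALiBi
        const uint32_t n_head      = 32;   // example head count
        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

        // identical to the m0/m1 computation in the hunk above
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

        for (uint32_t h = 0; h < n_head; ++h) {
            const float slope = h < n_head_log2 ? powf(m0, h + 1)
                                                : powf(m1, 2*(h - n_head_log2) + 1);
            printf("head %2u: slope %.6f\n", h, slope);
        }
        return 0;
    }
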
@@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     case GGML_TYPE_F16:
                         ggml_vk_mul_mat_f16(
                             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                            ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
+                            ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                            ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
                             ne0, ne1, r2, r3
                         );
                         break;
                     case GGML_TYPE_Q8_0:
                         ggml_vk_mul_mat_q8_0(
                             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                            nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                         );
                         break;
                     case GGML_TYPE_Q4_0:
                         ggml_vk_mul_mat_q4_0(
                             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                            nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                         );
                         break;
                     case GGML_TYPE_Q4_1:
                         ggml_vk_mul_mat_q4_1(
                             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                            nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                         );
                         break;
                     case GGML_TYPE_Q4_K:
                         ggml_vk_mul_mat_q4_k(
                             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03
+                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                            nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                         );
                         break;
                     case GGML_TYPE_Q6_K:
                         ggml_vk_mul_mat_q6_k(
                             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                            ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
+                            ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                            nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                         );
                         break;
                     default: {
@@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             } break;
             case GGML_OP_ROPE:
             {
-#pragma message("TODO: implement phi3 frequency factors support")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
-                GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
                 GGML_ASSERT(ne10 == ne02);
                 GGML_ASSERT(src0t == dstt);
                 // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
                 const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
+                const bool has_freq_factors = dst->src[2] != nullptr;
+
                 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                 memcpy(&freq_base,  (int32_t *) dst->op_params + 5, sizeof(float));
                 memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 memcpy(&beta_fast, (int32_t *) dst->op_params + 9,  sizeof(float));
                 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
                 ggml_vk_rope(
-                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+                    seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
+                    freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
                     ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                 );
             } break;
@@ -3,6 +3,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 #extension GL_EXT_control_flow_attributes: enable
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
Some files were not shown because too many files have changed in this diff.