From f702a90e245499283d6de0b287701c723cda2a87 Mon Sep 17 00:00:00 2001 From: HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:44:48 +0200 Subject: [PATCH 01/50] Update control vector help (#8104) --- common/common.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1dc532651..0ca7b4430 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1538,9 +1538,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" + "note: this argument can be repeated to add multiple control vectors" }); options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE" }); + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors" }); options.push_back({ "*", " --control-vector-layer-range START END", "layer range to apply the control vector(s) to, start and end inclusive" }); options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" From 3791ad219323389106dc3fd80814eb5bbb7b80de Mon Sep 17 00:00:00 2001 From: HanishKVC Date: Tue, 25 Jun 2024 16:57:35 +0530 Subject: [PATCH 02/50] SimpleChat v3.1: Boolean chat request options in Settings UI, cache_prompt (#7950) * SimpleChat: Allow for chat req bool options to be user controlled * SimpleChat: Allow user to 
control cache_prompt flag in request * SimpleChat: Add sample GUI images to readme file Show the chat screen and the settings screen * SimpleChat:Readme: Add quickstart block, title to image, cleanup * SimpleChat: RePosition contents of the Info and Settings UI Make it more logically structured and flow through. * SimpleChat: Rename to apiRequestOptions from chatRequestOptions So that it is not wrongly assumed that these request options are used only for chat/completions endpoint. Rather these are used for both the end points, so rename to match semantic better. * SimpleChat: Update image included with readme wrt settings ui * SimpleChat:ReadMe: Switch to webp screen image to reduce size --- examples/server/public_simplechat/readme.md | 37 +++++--- .../server/public_simplechat/simplechat.js | 87 +++++++++--------- .../public_simplechat/simplechat_screens.webp | Bin 0 -> 21376 bytes 3 files changed, 72 insertions(+), 52 deletions(-) create mode 100644 examples/server/public_simplechat/simplechat_screens.webp diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md index 2dc177825..21410199f 100644 --- a/examples/server/public_simplechat/readme.md +++ b/examples/server/public_simplechat/readme.md @@ -3,6 +3,13 @@ by Humans for All. +## quickstart + +To run from the build dir + +bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat + +Continue reading for the details. ## overview @@ -14,6 +21,8 @@ own system prompts. This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated, or potentially as it is being generated, in a streamed manner from the server/ai-model. +![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens") + Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you open SimpleChat, option is provided to restore the old chat session, if a matching one exists. 
@@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t The histogram/freq based trimming logic is currently tuned for english language wrt its is-it-a-alpabetic|numeral-char regex match logic. - chatRequestOptions - maintains the list of options/fields to send along with chat request, + apiRequestOptions - maintains the list of options/fields to send along with api request, irrespective of whether /chat/completions or /completions endpoint. If you want to add additional options/fields to send to the server/ai-model, and or modify the existing options value or remove them, for now you can update this global var using browser's development-tools/console. - For string and numeric fields in chatRequestOptions, including even those added by a user - at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto + For string, numeric and boolean fields in apiRequestOptions, including even those added by a + user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto created. + cache_prompt option supported by example/server is allowed to be controlled by user, so that + any caching supported wrt system-prompt and chat history, if usable can get used. When chat + history sliding window is enabled, cache_prompt logic may or may not kick in at the backend + wrt same, based on aspects related to model, positional encoding, attention mechanism etal. + However system prompt should ideally get the benefit of caching. + headers - maintains the list of http headers sent when request is made to the server. By default Content-Type is set to application/json. Additionally Authorization entry is provided, which can be set if needed using the settings ui. @@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t >0 : Send the latest chat history from the latest system prompt, limited to specified cnt. 
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the -implications of loading of the ai-model's context window by chat history, wrt chat response to -some extent in a simple crude way. You may also want to control the context size enabled when -the server loads ai-model, on the server end. +By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control +the implications of loading of the ai-model's context window by chat history, wrt chat response to +some extent in a simple crude way. You may also want to control the context size enabled when the +server loads ai-model, on the server end. Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js @@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side. internal n_predict, for now add the same here on the client side, maybe later add max_tokens to /completions endpoint handling code on server side. -NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions -wrt the set of fields sent to server along with the user query. To check how the model behaves +NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions +wrt the set of fields sent to server along with the user query, to check how the model behaves wrt repeatations in general in the generated text response. A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by -using the providing settings ui. +using the provided settings ui (for settings exposed through the ui). ### OpenAi / Equivalent API WebService @@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below. 
* the baseUrl in settings ui * https://api.openai.com/v1 or similar -* Wrt request body - gMe.chatRequestOptions +* Wrt request body - gMe.apiRequestOptions * model (settings ui) * any additional fields if required in future diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 25afb2564..8e0df3b61 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -222,8 +222,8 @@ class SimpleChat { * @param {Object} obj */ request_jsonstr_extend(obj) { - for(let k in gMe.chatRequestOptions) { - obj[k] = gMe.chatRequestOptions[k]; + for(let k in gMe.apiRequestOptions) { + obj[k] = gMe.apiRequestOptions[k]; } if (gMe.bStream) { obj["stream"] = true; @@ -740,11 +740,12 @@ class Me { "Authorization": "", // Authorization: Bearer OPENAI_API_KEY } // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint. - this.chatRequestOptions = { + this.apiRequestOptions = { "model": "gpt-3.5-turbo", "temperature": 0.7, "max_tokens": 1024, "n_predict": 1024, + "cache_prompt": false, //"frequency_penalty": 1.2, //"presence_penalty": 1.2, }; @@ -800,51 +801,55 @@ class Me { ui.el_create_append_p(`bStream:${this.bStream}`, elDiv); + ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv); + + ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv); + + ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv); + ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv); ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv); - ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv); - - ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv); - - ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv); - } - 
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv); + ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv); ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv); } /** - * Auto create ui input elements for fields in ChatRequestOptions + * Auto create ui input elements for fields in apiRequestOptions * Currently supports text and number field types. * @param {HTMLDivElement} elDiv */ - show_settings_chatrequestoptions(elDiv) { + show_settings_apirequestoptions(elDiv) { let typeDict = { "string": "text", "number": "number", }; let fs = document.createElement("fieldset"); let legend = document.createElement("legend"); - legend.innerText = "ChatRequestOptions"; + legend.innerText = "ApiRequestOptions"; fs.appendChild(legend); elDiv.appendChild(fs); - for(const k in this.chatRequestOptions) { - let val = this.chatRequestOptions[k]; + for(const k in this.apiRequestOptions) { + let val = this.apiRequestOptions[k]; let type = typeof(val); - if (!((type == "string") || (type == "number"))) { - continue; + if (((type == "string") || (type == "number"))) { + let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{ + if (type == "number") { + val = Number(val); + } + this.apiRequestOptions[k] = val; + }); + fs.appendChild(inp.div); + } else if (type == "boolean") { + let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{ + this.apiRequestOptions[k] = userVal; + }); + fs.appendChild(bbtn.div); } - let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{ - if (type == "number") { - val = Number(val); - } - this.chatRequestOptions[k] = val; - }); - fs.appendChild(inp.div); } } @@ -870,6 +875,23 @@ class Me { }); elDiv.appendChild(bb.div); + bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] 
yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{ + this.bTrimGarbage = val; + }); + elDiv.appendChild(bb.div); + + this.show_settings_apirequestoptions(elDiv); + + let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{ + this.apiEP = ApiEP.Type[val]; + }); + elDiv.appendChild(sel.div); + + sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{ + this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val]; + }); + elDiv.appendChild(sel.div); + bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{ this.bCompletionFreshChatAlways = val; }); @@ -880,23 +902,6 @@ class Me { }); elDiv.appendChild(bb.div); - bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{ - this.bTrimGarbage = val; - }); - elDiv.appendChild(bb.div); - - let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{ - this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val]; - }); - elDiv.appendChild(sel.div); - - sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{ - this.apiEP = ApiEP.Type[val]; - }); - elDiv.appendChild(sel.div); - - this.show_settings_chatrequestoptions(elDiv); - } } diff --git a/examples/server/public_simplechat/simplechat_screens.webp b/examples/server/public_simplechat/simplechat_screens.webp new file mode 100644 index 0000000000000000000000000000000000000000..ccea44396051686b97220b0f5b6b9beb63706114 GIT binary patch literal 21376 zcmd?PW3wZp9Y2a^I$( z`ybnv!*kR3*>B&I-jxrnZ_KaKpWDZuzusBi*59*V?CbL1-(S`~`R#x3-3Rs8&iD6s z7VoWDagpp<-8MBF*nLj*{|y1v<5(wIC}OAKR>4IbvQjH1y}Hh*X9BwRy)()L8~%Dp 
zF{O>buw4#_CQXDck%u>S&Nft^j(|meh;+5ovb+6kc`qiYDOW`Gt+SQz((DUwLzB`C zV~fG)TXIeQSAJC+@vXB7S!e1wD2;3mw$T&bcKea~v5Lu$D)2RgF=08th|IRLU+0ww zz@VVS?-6aoBBrmb9CQQtZq*MZx0fC`<_=0ns02Ev7s7$LtEZq@_E6O|nWW_8ZJSrs zg3{9$c^z|!VfloLU{edq*-d{eUvYII6fsz0VDVYbm@&`ti5kC4y zR{UNnpk_UTISkI|NGOh%q5?A8s%;6FWQ6=7TDl*5Tz~p>h=rTNE%3l-k<6W*?Tfkzi>mzheD$E8>NExS+Vnk*wn9pxb$Oz zHBi)bO;el*Qd4cVbOkZFZ868j{Y&2}nx1`Ot}qJyH?o%+&7UK=X-}9wcd|~;XMCPc zPb~yTzyL3yvXi2Fb9&PzZ#^7yOW8jcbF<4q(J;Up?ka1yST&WybH)~A7e*0YRD^TY zJ-zZFS0+$p9>>2*CMJe0z$x*Hk>Njeaeb?`1X8081kle}z9EIlrfAdvLoZ|`R_eXfw0jdbD%W7OQCJIQfG9Xg}Auy@8weBDoXp#R`%p?mniFLf$EgXG{bkiG&)Xp;5v_xh zbx0I+M_}H^E?|b1u!D;U7UzwOK0O6Gm`R`6T6hv6cTdPu$sh)m;OiJgEw<5#jq>N!?uL;e?(lY2xoP zsVNcfeDwkiE7dYuW{(D+1v@uQ53@yeKFW{lU~qGMM***sAdLf!sj0X1v2dY<2-9eF zxZ-Y>=x>X!VayQEV7U+)gxp&`maiBw$&?ufOS=I;(#zDT0?o0ywe+o65>aiN08-xx zd3NSF_{W*DYcdL*gJ^5oJI@SKdH`U9JTmOq!QIi>ztsO3pESo)R+`azkAh;))tdGlu(q znD()4IS%y77Y5~V)+e@x8l!iCCVs|&@rD!8qCauC+PY*TL0GC!ue;V(uneAa2V%MM zKrB*s{ITvtR#sAX7YRsn5e~s~o%#L9PNO-%tOdjZ$utIWq1907GHhucfg^smXWvjovbh#=&j0&YJ<_k079oD$ZL@LOR|W2o*HXbEGVs* z4J+2jDI{iJIn4>r1ScW0%$*1xv#W1r=N|s0DUX`5E#gQ%WFo-;{42Nv%P#K-D@j+| zsK;&N>#Ne<@CLuJ$Cudl|p!9j9Me{H{bfG zCNEPdsg<$0Nl)mvf*&UCunN2{0!=%<%eBLxv6Ygx^&T%XL_O?}=Lb_xNzz<*aQ3*? 
zp`=UgUku1VV1ma=BsW_;`P62*viYS52S=BWU%zymKFl1}E;;9i)(V|y1&wB%;y<%v zd`2~7d_8HHpRohBOa<~%b27V6@1&H#en!RWM2mAPD(0Wug*m47SQJ_~czXXQ+SHxh zyaQUs4FamebLeJ>)~wa18L2ozbEtHu7vv_D^piND-I2$+x&7ShF>0UM|MgiJH6d`| zAU_be9erob?%2rA6vAI^ISH&r$_Q9R$pwnQ>7xZ9%ZkG=pVmMKY`^StQ0+fRtF($$ zQ8e-G>H5T>wvm;yYgyw?d%aGX(_+D^i;HE9jT?3Dby-!f!|!xKFIEZ1!nuY0(a|K|E%O7zfl(fB)DM@!TK9p;VL*}2DmIQyx;$APAc6a{<$y>jFJ8WCRUFrFVfZhd|9iyqCPT(O z-4tsOpbB*D1PrV`5px^o_x;qPe1&W!7~SM2cgQQu!n6N>j#3DRL3&mB{%4zgLrXs= zm$3Rwk+rco6>QwVk_(!eeS)VHc3Uaa0wi2`KKwiTKBAV>nYL9PK)7OZ2{MSd_$<5u z`e%Aittwc^NUN-ssL(8eJ*^{1-KQyj7cP9gT^&zJ;0Zdcb*ulX^yTGC`wVODaf!!j zp*6$3Ip|=v64w<_X@1VfZDq{1`O#Y7k8Zv(I&`fs%8-G`vGGUqZ5X&Y$P+$eQK0*T z6n=~RR<{je$vz-8S6~nXo&nT~Chs*}fO!X9?@jP#NXcwW@0A93^%Tq zfZP2`2 z7o}X)+AXJau~qV=HvKWs7Se#NHEk=V+@TuqU2k zfh-0HM6tT_&2gdUBx?%E$D0K9=@36)#TiL({CBc>vGTrHee`DYA@YSidpIX^#ekKz zZxYYHmKtzP>f{NG6xvpHb@R}~$~hE5N@&S*q0td5_PVrX1{JxA#)L1Ch!%Ou;hc(LUFh+F7Fs%Y<;ME z{h4{a*o9RN4+Zpp>QMWR#}@O}T_4FycGJVT9_L(9xIm48%=k$dT+uPEbnV-F6*v43 zC+&bvB8p0)?ErddAS*?9&pmZoSo+zDK1P>{JywuOG$HNVA>$2C%XX|hn!3Eq&HSk^uf^&%NtdC(;AnI}9Lft`NPtnQlA-W)f0kl9%DUH5 z+Kri13S2p{;{tvsr21`sa#9LBBbayiilp#9T9id86!!9ZLZq*0L=qF69O?<1orB6j zw;e9R{rL4WIZ>e(P*|vf3fXQ^A4M6w+gakH(NEoYey{MIIW-9n@vt+)1X*h&%^2;m z+yn9ft#y?1+X;{r!w13Q8z_D(*^m_lUT*Y>!8};7r$x4Oif7QI zn*ck!KZjzHsgTwqXFbon7KPHo0c;noSkK=0YCo2C9|!O$^+mW((fQsmA>-wXMFjOA3ua`MNisL@EMo3Rw6wr3Fm%Yu53H`YS>sPSIo;4!M_?=cFPc@{Jxg?dwGZKV~&DfHO{Pb`-$xZkX|}O&(NAhytPFijjQv5p8ObQ5nkxObIGkMy z<>lLdjVx+kf=RY*og&?@MIAPMac?`5)v6uLwhL{0nm(^2Ln&t^?*W66or0NX2*m2a zj}aX47Hgw|PP26WjBQVLRpbrH%&|B(yg_ev=W@cwLh~U^B;wDi*mFVD4_3zJch3nx zg(#}mhby8D1`>K>H}1cq_XB!yZPp6%Z6o^Mv31y%l$6N`i->7w%wV%7_q8RK-^i=J z!RTDC*uN7=gwflEmXqUj$91-i_V>r){zdHwZ&+-(8h_RZ*RZ9ZGE51Bit%2ZQPNRg zmxYb%+o92A@(HkUZQ!DJw&%OO_gR7Oj9WsfzY5w=2q)C^&dW)>!j_y?EvZ-PHS&Ftn175 z_aW@>hn1mLy)^>!IaA!4>NU;`_4QK}3?z1jHS0=rTr)@JaG=(1oz9FR5t7YCorI>! 
znpN7}uw)y<&H_peF}sn z&s(EH$Ie@VZ#s|igWWmsHJuqU9FzlLDjIWK&XNb9vAZ}tvvooM%1>IcGEf>}*c;mw zwR-arvlG~bX}P>|z3_w5WtUB){U+_aM-9<>-e+JUIWCZl*D)P2&q?vHtlIY#p4zL_ zLaaEXD1tEVbGr91sdo{VplvnDu z{b{`6{aq#pTT;gmtL}|S6saQFg1S4e^pM0 z=b`k<&Yu7=bA&o>7h+Dex4D_;`L6ogQ~lJ=<<~%^_JNBXCX zr3r>ifa6f^8(r9Y@H!aD5x3U|dMEX{A?bNAt!ZTZj5Rw+vy+m2XhmsqjW&5F3yhyg z1OI$yqFvl6j?(P$jrFeT#}_G@+JJS%!vepW4|OO&+4yR}bvd}pdJ>{HyDK0J$OVqj zw8Y9L1d!I5W{|J>L85<8wux0GUJVUNzjLJ!oSvc+flAyJMO9pu>Dt@TuB|)10o8S4 zuN6s8gQT`_!l6ZY%A{O!^7z42`zrlbIv$*bFeo{F02p-|NO%hVRF4JA(q=+4xyZs+ zkI>vc7u^Y0H(`j}GJ#wsh{yYAvYF5f9|{Q8hy1a$RRa=R#i6ru1;H{7r~ zzC*>TuK65r`i37FV32Lb)4M@ik+N%HEEl3LIS>9Gq}BsLZ`v@CS0G!O?_LqM zcGnlF5%`|yS0^95z)dgCNMs0-TtL7wP=~bH=zB>luHxroq*yj5be`$R1;Y?Bg^KIz zicRkdq4+Gu-LJ-)Z#ro$8oVZ}1@sX)-Zl^uU%w1DZ^L;yX`1jtS^v10o0g!tShO!@ zX88BSi&KP!vf5Zz-_r*NRp@!x+l^Swc8G2u%75W`zv*ZJu^OLrFeF3MFVX&93>ESF zuR(_{V8Bo2uh+p)mHH{i|>yJ9*Tt5wz4j zU9rfa#@8C6m6cE^-Z8A>JSdcy3z=jQiht$`f0{Y6crYc=o~?g3nqH z)jSuFWJWEDP`O&jIQW4G4txzJ7q+du^*R!y^g))Y6wJEo!9;wGG|t2`MJVah{duz-Fs)U3b@lQ8j!7pNZU z^RX4Tx^zRplx#l5Qyu#Sx77_{@{bi~H(pyA6A@dtNiPT+YTyqpDFrd4)TT>@pPJNj z?p9;kW>Jyn!_$%ep}uGKrbP__I`+PUQe}%{X4W#6rmX>e<7_lv?rl zG6bwr(3wunsBjyOTE2!Yv~apWE4@x}-C?64yc-iiq*D@yX`@+2Tr@uFd|g_4k=pEj zUfsPbmHc)ztdNf)BNYd4kJ1hvix)b_*%3AOXBBq; zNyYO9VRv?hOl3ImoDit3T~^7dm}6bddTNP)=k5=uFZujacOj-Pf<;5^8}(>23SURH z?6b-$JkA)w;c9EV_{lN0)ez&qn}&b3q_S7aTQC(Ldvz9UKS}#lZSqLKQ9*r{c_!*F z3nUz(Eyx8vXPA->>^%mM2I)5;XFuW|$^h0~Um#Tc%>ziMdx@2CYD%xqWaMytzVea1 zuH->Ksfb?<(~SnaCDAE|TvpkSHK(i~cfkpPyWr&1G*xMzge&#%n{I>k`Zz}Mh4z+X z*&8Hlf0wSeVx2H2o^=`C%BFj%lNyz2gs_R2Fd1VGXm`vJuqhL_v%PxqPdkzFp1{wg zBw8-*BB?nE0xUQHtu2)c77DFVV-a)M7Phz$n;Xf5DRn{m+f zneCu2=t&MCxWq=4goWEqbE$Kr*GfibWqb@IZDkUJ@$NHOYh4cMaVR^G*azM1nHI}}c(WEbM@pS_3qao$xE4u{5D_!D4&Xapi#@>gBo&zl!ki)Q+Wx*M3jLo8 zNe!(hU^4*($+MU(#`QB{wm2D3%l$Ol=DYmAkow2=I%Wt4Y9Di1K1Vn&PF(_uFN%fT zG_Y|v^&#JhP9{JGM3{P*|Af(?_^Xfcm6!#&)#_Cq^khYwMuMj>aGEcH@;IMjlSX?N zG38d9ss^q+ofwI|Mz!{Z9MtI+78MielY7cE(q5eVByPGWd@4EF>Tq2mcg>sAiOdrs 
zx%h-pBQ&|yBkIvS%x)WV(pbS_A`H&lCs6ltbxp^3=s@j)4K^r8$yT=d`Cr&jQm}D6 z>2yM|9T-L3aN>kJTsEr_I=2I^f^#tRVr>0zvs*l+WBC17*m+$xw~eJ7Mm#}N_nRex zv5i1CC4Z^N2;Q6+A<8(IQmx25(%%Ojbf7*wB3Zm|AwU1X4O?Kx`B(Ko{>ABi6%2E_K2F2u2y&!j?@~Qa(x+x<|-Sy;WYDP60s*Hk~ho!xu5A(Pd_?! zjom0ecOop6(QEFO$_{%o?XwM4 zGiU&!kV-RTl2zMRi*d7E%2gkd9#pZ>Ogx%Ba>+`7u~j*`NBu$VHhrWTIF(4G{5u0@*FEHqg`Fw55LUUMJx%^yOhOc~V}F?>>hYXD6)OhtIK1Xvl| zA|nu=nn-qh^T$4fOR;+&xDC0huOinfDhp10fsk1P$P2!*fBzHh$#h*h@at5$25sd# zI4Q29r#`wdzaqepXfOKlJ*NZ*UBw zpp0BrjaWhH`YVoV`kv9%2ba+4tk@TghtKAL z`3SQhk2~d^wQi95EyGoi@2JwuBt@wn7w+Tiji!eq?cA;6y!x>wiX zMAjp?%^}{qJllEVs7|&>00xSY7)tD$NW%6 zQ%r?NL>zDj(;K0wtJE3kqA0E|4+rv~=H|^DR1)M!CrsIwC$0OhC+_0W2$F8I>;+~> zE@!psQTiFhyrDIxrlh0N;|sw2X*yUeut-KPWjnqp;5uMmtCD=9>;P(EF_TbV{5qfYp=5=B zRLPGshU$^NY3&G|XX{dKZzG1}@E9mXK#%0_3QK^WvBJ7=vyhR<_Gds5tg>0toi0#U z-$@v?^UZ|a11ZudC>#P0V-AoZKhKb{toc%hQvSmalX(YTX#@g~GQ0)vmr(T| zPsTW$gba~#J}2PuZ9_wo*b=v2Vu5ir;w%zwkRA;VyUALlgN{Fr)cP{LXJMoshWp}DSR-@Berzz@>gs@ zkN-%+W-mV1vZtoK;V-k_pR)Q@-ETi_T@CAHM>b0No>gYs>$Q+2kc5LKf;*&OZL(mJ zvC|jn`5!3u_21jyF+BpF!lK#Z_1Yz5088?3@MHhc&^ZNd!Rk0BA22 z?%87$X57;r3J$N=b3TcHD%rs8^U%G{r;k?dBw&>;;=@1Q$+1|9UjZ`woyTy^I$=x6 zj5^_AlWrPI%voa5z!(*7Rr` z{!8+8T_Bqjxmzz*2;HWT88+gf(l0==L)XR~AHZ~n6x!d@C};5RSp^$4c#zmO4}nSH zOmQssKZ0n1EZ6GXig2`n_2<*x$Z&3Cp^GrxM*XCTDsN;a3S^k?pfioggfK2uqCJPM z(;NN5_L}BwG-CNK{Oh{y8V7mb{EDWtT)f|rI$V@LS6BnfbD#-_&&t0@V5&J+u zXx#$No?TSgZQvj{6U^u7LFFvvj7+ALPZ1IBHt45c$6kR?nBBlFw^x7(ti=>2NTBc^ z54n=Wn4oxm0Fr`9o{D{rD2KYMoskS<=IEe6RJFuSMbBN!6Y_y~q>``9yFeQvk2+Zcsry7S^%4jLz?*Cm;lqP z|1javJY?6g;a@IakE{XY)c9PZV`*^U;I(x+;Ha&nv2a_oJhZN z@6$Gb&hlB%1t;e14oN|lODxI$NH92DKt8Z4xDsg#{POTqDHGgPeRD*lD~9k?4qohN zF0o;CzHX7i#4~j&UW~})a8U4D>u<12Ku3b+SPwpOYixAtzp(j(bX129e)uVoH^?2P z?o;}L952bqQ85vGT1x98h3NcoLZ6K~9L26Qq&@Xu%A7Z*?j83(nEF6$k!hM$TU}G5 zbbB=BvvCaIiX(V+w`0-Rt&Xvu7lWh`O!^9ZCHDzdCfAjip(> zg%JNeTpXBoi~I|u3rbMsp-cnJt>-J4o4cvrS)W`t1DR;?@JLgUMoRUB`1?Vg5pYHI 
z@8t-zJ6?C1Vf+~O?&+oM&w1X5_&kpEfo7o6W!U_wxq=D@HlBGeYtPAPJTp4)5d8mybL6OYG#c0J&G7L&Q)KtF$N0wg|N4nOh?)Q)KpduBLYw3eg8ON zNa+)an^fYDZWDzUOXEprGiHKiOzpA#V9FULsyuU57q&(1YTTgk6Pj@hO5bvBhI+4{E_Q8EPkdlp4Z9+32C@Z^e&&_)O=`N*Vs5(=F zsMmKUrOR{J@@+^jFdJp(xUAR=okW|R9Rtx=U4;W99u*b4r_3gszkr|i4_!*e837}c zLr&T7Q4iZGfFG5tu)9JScuGPq^^FNXeE|;Q+~S^rZNzf;3?&(o6|JMo*_cpNzdVr19m-Y0sfFa z7NwE-RUYPL0PbXUS!ll-2>t8=O{PkKH$Pqchr)%PeaPXt<_gAE{t#)&yaAF&pz!xC&36Q^;CiER*T_#n7dB_hZx z`!G>+ud7lrgMpj%m&%3LwTa7mMam4*eYr%u5kpxFWh(*>?*~7#$vkZ#N-&}ee(0{b zy8{((=#iaQUGsJ+3v7ZOqwW1gS}+aqYxCv)Q5@VfCXcgB2V^dD$7GyFaZ;_DSXN^v z@ZjghK{_lEK`d7@w^m{gR?VG^Bf-oLVm$=bqLY!6!)wwk$pRW^4_IiY+VV9Dip&fO zr1~8(Wlbwn65g~PCyTaovc~anHmx_U>>oL$TOHsJtd_^EmbnCj{MCDbq7>0Wg?R=s6nDl$2Ls6m`~ zMEU#=rYr!+?bM6QC-ShM;K!KEe8o@7-KnWTch6041ug`0aqAQ}q^x*9w+oTGD1fJw zteo}x261{>kxo)rSdx)eNH*9#0N9i9?GOCE!m930KPTk0r3rN>OmO|~Zb{AFtByMz z4b%`HQ;6}(H`UP{BzQA((3N_Kzj?Rx<1X(!G34-GAj6uYw)=~}oLcv{j{C~$x314Y z?w*)l=w3uPx5oV(a{U|G4y4Fg*p9gTs+%;%yn9iv0==l=B`s~fc$g#%-FDDGAeLBW z*m$WCi=6pGCK@xuN5A5iI(w}cIM?mbAzS2BNwm-h=^CB6nMt4G(qUTNY);t62q zl@yTwsTJ6vBzE~emQSNA41IV&sK1)kMU>+GAl~@Jo^6H=?|4|C6r2!$-c4^*Ozm)m z?k{z|C{qvgE5Mg@{a6?TaODIhK~) zx#oF@2)H}{kyB8Wdz{|k+jUTno_#0fprkmUhwOXiT-sc*4UM<(X`v5y1>aV2M*@S0js^VNdz+goevmFz{SGc9OLkfCD?B;{U z_Ts>az{v{L(*scoyCnIoZjfZMO%igc_; zHR-P(`u97wO)|RKlkc07up@mI34|2+c$r@KZAnBF)SCSx?4LQ$tw2cTltB#&84A5lH=3p&y$dRFs7U{(jlFoZ>CwGLm`umLbkfN7NS} zvc)_}`5RC*dLxf_MwBmpEK28EQR}XLDn(h%{Ct=zD5Kh?TVU5G-YE59SOtXqtkX72 z=-(c|3TfEUoD-;`CE=Y{|oWA5>1#qdel!PyFiqY88*<^ix5tI&(?(~x=|<6XYL@Ap>f6Tr{gk7 z6Jf$oFaqrN91yuoiaaqt6}t;<7NmG$z*zL{5|+OBKNBzVWAkfO zzUi%xAbc(yt=pOzTD*PkMk|;lqdistub4W^Jya6NEpy;NT-f#fcU;ylw;1``R#Cbf zJ6<>Z+2m69kfGT}qQ~F#!EAH)V!o3rab^P-C4gc4cK zI)cW@F2@AkDv68Lgra+DD?Jn`AuWHlKS+d6&BPHWwQQ>p5eXh z&kqF_TRSmUK)g8d$cn~p$<4(?v=~v}2>79A(mIvmwC&?hZa(#eC*F#o+)B5H{P=u` z6ceXsL@Scae~>7kxae0f{iBsJmb?^Alui)3WJXAD45I1@gw+~ap12+!qr@}RHsR?D zvbez|OI|)?paP2c=DYByZb^%bZbk*E>Lx%l=UOHbL0SK(7H&nwZv8p2hOuH!megq*6j@fyHP 
zm$7Jc$k1;R5x*XpRn~$rLd>w*P?-Rwq1Gtbi;kFon88W-8buQmGnuK%vpL91$5N?#^G!wfn+;PG;{9m@~m|Sntg~mt|ah2ti`0v5=nHv zc$rw?14zW_R1F!m^{NAlWG#kJGE{(D(+~p<2s`=f2etfH$jOIetqWs}l-a6tTH~-u zo+Wqh*+2_C_}M~HoBUcjn5&%=2&~+Dm8P-(5D< z#0Z{bNl>Ced>WgJ@e_zI=bAc4Y;4l*pH*$>V_|FuVl2L*|p*m&q0=6lz2=eCV! zKzt26jEaLX;(mWolB_hksvpLp9s;Znv&;bJf;EeO}*0-XoU9uRQQPG68eUs#Gp19Dyr`5yk1Fw-ocVxC^UMZC*6A??6KV_XGc-QLDTK{T6nt}rx?lpK1@HezmM|LD^} zlWNN)fF4}h8_Ogu1h8(g5(9E60oO&sh&1=Q_FRm==yLpaPOT*;9u89*2Ym?u)>!?q zfn4o~W9uK(HexQm*_#XWYSJt^31D;$(C2)az!-?OxwVH{w03jfgy|`HBg<-oZ`U|g z`DUfG^wB`kU1+`G&2Y5}0+1aA-h<{mG9AFKT~2?@elK>pC@{r8Tb{{2jZHQhmx?5{BfBv#;~+zX2tdVyjgA zPc&DDP7pF1>_1u$SBw7Hf9yMD&jFq9qwi!W&|Nfe?_5ZUY4N2V4B2Yhu(NrMkq_5; zhgnfBlL&~9ixHnj59UCK2(i8+M&J8vG&WGv06^FXZx7PRLGd0^6+eUO#){({p^TDE z*S=DGRp7jX@W@O}1d?}pQk6m})thLrC|Ha&rZYQssivyQ?deK6Gji$WK%(p|#ITrxi?KgMO~ane_%Ehf=MW%9T)uz|*@;sEG$ zlpj&vyZR#Md_2{aK`cDH?L9Xo7YSRfm@yd>fFi$PQ%rW@MEth5Yx%T>ps`BG;B(4r zC$KSyZT4MVB-|q_-$%jBirHD(`L?wKUq!-~CIee~nmJ3I3~C~)H(Tgqq+5$Q8(pz* zBkY4GTZ`NoC^Z%XzY;0bo`UM%jpF?gR>z@Kns-qbfJ2TpJ~ z^H2vuO*23>IJjgFmK?%JV3(UDq-zboo=NYj%y$q`?bthSPaaGblb*I$E2j@Y+M>98 zJ72Mjl5uPXH?{4ovKYy$xB#?zN%|UE_m4`C&MFe5Lo|}wl2gm-NJb8M7jc@%b2WZq zenLZyIR`cdXv;R->9#8c9A!6B{3+t7FNtRyNrWeF+h1lxCd-NKrAn?Z%Ah&#@J8shryg;mggGNi}!{8Wwa z7FRv?g*$&I<8R$&P?{?gdif@F{E>yf{G?MBRWY$=)A#eq-$K#CJgJf|=qdUf9&1;ey{OrY}CVvP1J8}8%Qd{X5x^+6HO?E&#w%c9!qA+swAV*UN4 zU}r-2YZ}wok2J9a=$8 zkmVB;Rz-mAM+3-+-5 z+yRM=B-_bab5l6b+{JYH!RfKWIjjN{mihB?#zklcZu^~NTl z5>(4It?sq0?@$`BU3x<{JolhOT*qah6CbZkAlU$E=d{& zDlopv>2@`au(`1AumYEIEo?S=8)8n;2SPZBC{RdRV$h0NH7GNiyM5UqwEfjtBJfvXC-?9q{>Z103;&o zL+6cxqB$}3mW^%l_QJb_^{|$jg>=^y@WIz(mzV=c93K-^DrIXc;KWgGRJ#v~LFDRx{&wRvNo(iFMW$tnN_7>~ zn*U1Q+U5LVtP14;;q^}RhFYc79=(l{9)e!}UxQtOd6d6xpnRtHh!yKcCi z8w^mOhmO(V_Y>93{J7wI@;qG}xY4(G=BoW+BUT#WQs7XWDtf#RK?X&F+Gh!S$78|` z%Q^Vy)SGWCn)WO{yU`tjPWQygmF@Cr#4$hF$&S|k$P3Mx2r=l`1;HTZT<@%rbw_(Y z|5a3mQCsLuA0$>v`TfYm9g z#B@@4qdgH?OP*l>BQt-H+YAMdR?cyQXMKp652kQi5nIcW#>s{9YN+lScx 
zFlQGe@T^rpoeKR6F2@w~I+9fyJ|ksQEJL|51Uzek;70z1uMv$$9n6=&?e0H|Cvf>sJ;bMV|nxp#!0 z@MQSBkpZB>r^l9XTGq^p{MtH{V%t!BWhS&-{kVH`DWSkil})$d3qMKkBJj~J%E56k z9=Xy|!ZS~$herwu z)kWoXx0X~}=t*PD60p2iTs$a}2+GZHH;7wNj87D4Y!U^9ZH36QL^=%eS-|dc?{PSS zn`C5M)USKM3<+9reW$6HASSNIP>}#&KZ;t?DT@5?5HT^tk6I@e;}PANKQIF){yaTi zPe>T#Y@Q34N?Gcy6K!9Rj{uU1h6WbpsBJ~9MJNCl8xC5_Xa7yf7Q$i`paZMfSkiLJ zqJk%?&_YcJo{t{?xzH8r$q$ulcF~-$SwK%g4e;u-g^EU*corQ-+~Gy9mD~~C5#Kwp?tmB(b?aK`vqDI<2+7%Dz7DGpNIRX)}c*Ad}d{F66;1B#( z2^ya$%^6<2GC{RE9lyyGraRWXJvZ<&mz}ew&@&niu;rrfs%U_Uz;M+p1S&2Q(ve)C z;JtJ{=BQcG_WYkFCSDaaZ*!2ehw|*|GMV81&sb5o8JWCwqwFXwUay*IPXRcRr)))Q z%Q`BnzH<6aiG@S|Rf=*uW6w5D^!gSCBG|GWWQIvzrsmOaZlvpk_!);O)XQ=n`;m4S4~e|K_}e6be!`MPm}Nfwbu68GR6*h=fx z-{0F(cD8^`kk)QOV+RcJFGQ4H0+zCWcrA-=4el+XP<zJEd7=TmW1MVLS!CDnUQ zRHce4t(B-hZvO-TMiHNtWl1+TVt&7C_Lf%L6#9vIJ(C+3S#-2<=Ru~e!RK#ow6I{W zh{Q6e`?r{R4B)ds+e)YmHMMwE>`lNQ$ zqouowX!Y7J`-)3;sGPRg%9p_Ie0)c=+ro?3PlG7$A9+ShV-?;NwmaOBaW@rcQ){Y3 z#gbNI{$>MTU%Xhz@Aj1UOv6AwK2XpZ8(TI5$3&^}J1~zx)A=KA|x=RZlif}O~gFBzrL4X=uIwxks z*!{gRmfoMP5$@axZCe9n#7ch1=(uU*w}bNKSi?_V-=nx1kW5}>k1^H9?mRU%UdAZ( z2%$4@q*F~T?(V^3Z%R&!&4&Tl@-e|&5kp5g;Ks2}m8=Yi<8Prj35ZadDA-6$OJ6&_ z%vU*})Th4RPYi{-Hnrz5yhXRJ-w${mHoznS&}W3liAXOg2moYP5YpWlFW6Z^6y!~F zCK_^Ps|^FFb)cvHz($rU61O`hP1Qm1O*6fL#X3>UBtZrne%m@5$mVkT3G9S2GSLZA z8_=ya1nVz0uLvzE@W=EsWI?wi!+p$>LpTJzbx-0F>aeo%O*G^XOQ85#DP z6=1|FIN7LEAplwo*K0sSnuo7v@%3rA>R>X9QygnRYg*h&vI`g(Vkihc!xt|IfIa_f zAKO}I#RHYdj_{`gy@}^#csA9WrzglUx0GU@-4}Fjg6R+yODJ~7z8459Q#o*b*-tGW zTPvVmmyOuS1Q;hlQc&kudmHgtLh;=fFfk#3w|LR;(e)#FuT8EGtWjk8 zivIYUTSdb*3WMo4*n!GQo~uXFdw>#2xg_vY$OXgKS5Y0_&uYN7(5ng3w(tMBBOho1 z&YBXX{K6Z10cFJ^?LJHOHpox6Ir+=+)MApE7cgtidF{OBa^fTK?f_iqMNeQC@q!xw zRY3sHNb`N!JwZH3w~(C^AYpSGGtIsxN^BWV0H7undGL)uIaD|2YrDzVsN=daCpXJlwTZTX zNWZz|{eiU3z0KAV)q3E8U}HvwD^LDW90#DccJ$Wo zn!c)k%vR7G;NdTjUI+Eq8WXKj>I|eo@qXZECoI$xa*qp&-#}p9lI-k!fa+?(%>`8!E!jhBrJS*-xGdyWF#Z2i@;HRzV~1D)ajmN zfnYXDD-uW2Y}ox^bkl5mir!2o{QK-e9}#H~>~gEkw1-q=c{qdvgfWj|_y@WEFOOIH&+oVYt(yzej?bjMoC+ec!5HHrw9RFljd!ck 
zGoD`s&vi7I1d$zT7j2=g(Df~D+||A)Bprqhsk{CySsNCfnjFaUJ??k?4s$f*6n0}K z@~0mRZ*6fULGnuo@qv# zmE_i~5mG(udTF{0u>8fYDeb%Iy zUS0}EfsSXrm=F)0MdAn3Ki^s4bRMhBUk);i5zt^rF{g61+E^xz^1|*)`v#zU^G&Q& zd$<*4AZ8dE7^9?V5fVetwVe(Q=D7KlamB8Z3P%AS%#+@Br;u|QXAubslFW`RgyBlV zv~=_A?slkg1c&0jO;jeH%}fXkKWou0fRQy+402t40+Xgvmj0boA99DTnhlqf%i<5k z+LWLYq^VO3o}~<bT1wWU&y$=cCGe%hc5upW!`uRoplZ0C zC?+G-e#NBUJ;y&Az3PZ_tE_@CCrnvayo>Cl`Zl#qVa2Uru9;DWao9i9=(LoBg$(xQ zso&_>i&gFC)G2lkf=r)2yk-@nf?moW30ut5Pw|mSIQZ_~m13T8F_>~VR!RbTDgJ*tp$P9gS9?)0Y<83P$*ziXTZqnR*I!L&-UHeW~#WV)I z7y+GyTF;78X^zEDEjwBK6}0q!e*R_A74^tgOE*9&xRGtBbWoFSM;8~$+s7v=?}ePHE48=Jct5Bm(& ze~s6v@;8G1bdQ~ASQ2s2(P;PP3{V9|!o$ z2zPU^>0LNoO=TCBe<{`HMMh)37Pwa~Y-I|l0Pz8|OzO6QdktRWY{I!kr0}*7T_NU} znZQOVKd>V_zyUd2`OQL+xRb&%Js{NbX<`ihHtma7M-OFsCtdXe*OQfsW-LLPIB_0v zhv@;9Or0jGK|pl*4YFfSQSe9f`EG5!b`@i&S8rzQd5nT&CV%+K^6xi>lDOjS8Dl^R zQwD=S?0jl#s!P44QSe9f`EF^Fwx&rcrk8;@q@K>OQBr3(Jz?uM<@UQX)SZvv^Gjr+ z!pm;Q2_|}uR@F`{TiWM_wQnGiQ)+hxr0plMv-zarS`!%0s3AlfTo|(5MZ~U2oAU|m z7yN+aFX4XrKff}t5w7$R_5ms$WU>~d9%S9%A0Qoe3bL)NA1RX^TU2m$jVJS96JGtZ z9|9SR4!2bnv%~^mSaAMEIi{S4h9nTY3!7ZmqL{66cpOLLml6E!wL%jZO5_E)?~?#-F`jAyfRSE4+|{`{I-xS4)m!%{g(d3- zk9?YQz()YHedI8vOQ`4dNdNYj5ZFf(@PbmWY69$`e^}er8#%*6_k8RzG}*i`S+?3I znW9l|EG7+9f0Um_vQm3D=Tq0 z;yz$;K3v+7g(IKnwevSE4w&_sVXAm2yTRaJUh&w{r=cGz|CampKAKgAOi$C#32J(uqBXtXmUlNU@>6;y;IcE@57@ zN6-9=TS{v_;bt_QUD?Ca zRkJ|SeX`#=D4LpbdsWC=A!l4n+K4(FGDI2Wtm!l&XhUEvOc07ouj`}qhuk<1Vbnja zcAv+-LoY5j;ksY^OFDIy6oT2;l*cr$dr6*yW^5!hkO}-2U{UGYH;tRa0ks%l3oSni zgj8key5+t* zzQ_JZyNMQFpMD&hjF8YadFCv0dP;KJZ%z*|l`GEYgWZlmxRVQ7A%k^!vadm1(dr>V zVHoLEiFjZThHKws`_qmpF@R+DQAwt;q*S_JR9bJvkMCW3*wS~#i3P;~CiMADayMSC z`^U4LVhXH6?2P`uARg8Ao?Z?k%r&(MAU(82+ZPylJZ~8`xvu^!&icB>hvYt#vxBKb zov!A}BGso_fCuUwLt>e|b4oIim>E;*aLh__^bc1e4y$b`Ate^p*CiUsMl`N98}1+N z%8A)NHn-dZR-?ccL4aB0=|qdRh9YpLlhYf>)F?Yb&kQ=wjgisb-7S4bNr2EIUyLWp zC0-f40OxVz$K=3lzCR8rIShGK9pq~t#77Y~3wkwYrVBPuFvMYOzLR<{9=~kz=_-*2 zte%C)iAF~3_rMDokFk<(B7o7>335RWKqN0$2F<6Cj_czvZI5!sto?TBLsJWZ_JYOb 
zzw)qd4T7MVLUi+N0lgobrfD4|`~OU<{So;TG-?owke<;2QYh7(buiSZY>%nJUeGiS z9sgN4r$_TJvXHvkv6Z~q;yOm#untqMOqn7pZ{Hl;6#Du6f6VfakCQP?je-C86}T%b zD2pXB2eC=0ICU=FP&w#(r~~WQ4RCxaI|7b2ncGr(8Lshe@HV9&NQmIu=0miGDfEw) z!{UyAJZ?JwHvy>B_Cx}xN(U8#+)of`nkG*g=0;Dkb0ZPFjz}z@39qN&HnYoZEGerV z*8PX2AW9Fw?|Hd@2~GFW#~n0GmliggPU}YNdz8QQkiwao)1uW9p`Vqe{{8f}tc=cX z)PaF``_xs!2st6%_o4gjl=wL}mRuniREBsZ%(yO-)%k)3Tym!z6juc`6` zfv+O}C^-;-b6=ih6{-SkljO%c9+Y?n`6z~VI%gP}mS?aX|E5YX9gA(TaLGvIpUF%3 F008Q#p-=z- literal 0 HcmV?d00001 From 48e6b92cc378c937e59719f2c0f482bf76c9ca81 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 25 Jun 2024 13:56:49 +0200 Subject: [PATCH 03/50] Add chat template support for llama-cli (#8068) * add chat template support for llama-cli * add help message * server: simplify format_chat * more consistent naming * improve * add llama_chat_format_example * fix server * code style * code style * Update examples/main/main.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 60 +++++++++++++++++++++++++++++++++++- common/common.h | 23 ++++++++++++++ examples/main/main.cpp | 55 +++++++++++++++++++++++++-------- examples/server/server.cpp | 12 ++------ examples/server/utils.hpp | 29 +++-------------- llama.cpp | 4 +-- tests/test-chat-template.cpp | 20 ++++++++++++ 7 files changed, 154 insertions(+), 49 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 0ca7b4430..da6db4dc6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1444,7 +1444,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main", " --cfg-negative-prompt-file FNAME", "negative prompt file to use for guidance" }); options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - + options.push_back({ "main", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's 
metadata)\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "grammar" }); options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); @@ -2604,12 +2607,67 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +// +// Chat template utils +// + bool llama_chat_verify_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector & msgs, + bool add_ass) { + int alloc_size = 0; + std::vector chat; + for (auto & msg : msgs) { + chat.push_back({msg.role.c_str(), msg.content.c_str()}); + alloc_size += (msg.role.size() + msg.content.size()) * 1.25; + } + + const char * ptr_tmpl = tmpl.empty() ? 
nullptr : tmpl.c_str(); + std::vector buf(alloc_size); + + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + } + + std::string formatted_chat(buf.data(), res); + return formatted_chat; +} + +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass) { + auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); + std::vector chat_new(past_msg); + chat_new.push_back(new_msg); + auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); + auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return formatted; +} + +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl) { + std::vector msgs = { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "How are you?"}, + }; + return llama_chat_apply_template(model, tmpl, msgs, true); +} + // // KV cache utils // diff --git a/common/common.h b/common/common.h index a5c738f8b..de90eec51 100644 --- a/common/common.h +++ b/common/common.h @@ -365,9 +365,32 @@ bool llama_should_add_bos_token(const llama_model * model); // Chat template utils // +// same with llama_chat_message, but uses std::string +struct llama_chat_msg { + std::string role; + std::string content; +}; + // Check if the template supplied via "--chat-template" is supported or not. 
Returns true if it's valid bool llama_chat_verify_template(const std::string & tmpl); +// CPP wrapper for llama_chat_apply_template +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector & chat, + bool add_ass); + +// Format single message, while taking into account the position of that message in chat history +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass); + +// Returns an example of formatted chat +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl); + // // KV cache utils // diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b97b7b793..cfaf6a6e8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -39,12 +39,12 @@ static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; -static bool file_exists(const std::string &path) { +static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); } -static bool file_is_empty(const std::string &path) { +static bool file_is_empty(const std::string & path) { std::ifstream f; f.exceptions(std::ifstream::failbit | std::ifstream::badbit); f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); @@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single( + model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); + return formatted; +} + int main(int argc, char ** argv) { gpt_params params; g_params = ¶ms; @@ -190,6 
+198,7 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; llama_context * ctx_guidance = NULL; + std::vector chat_msgs; g_model = &model; g_ctx = &ctx; @@ -215,6 +224,8 @@ int main(int argc, char ** argv) { __func__, n_ctx_train, n_ctx); } + LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + // print system information { LOG_TEE("\n"); @@ -249,16 +260,21 @@ int main(int argc, char ** argv) { std::vector embd_inp; - if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); - } else { - LOG("use session tokens\n"); - embd_inp = session_tokens; - } + { + auto prompt = params.conversation + ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + : params.prompt; + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { + LOG("tokenize the prompt\n"); + embd_inp = ::llama_tokenize(ctx, prompt, true, true); + } else { + LOG("use session tokens\n"); + embd_inp = session_tokens; + } - LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG("prompt: \"%s\"\n", log_tostr(prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } // Should not run without any tokens if (embd_inp.empty()) { @@ -478,6 +494,7 @@ int main(int argc, char ** argv) { std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; std::ostringstream output_ss; g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); @@ -793,11 +810,18 @@ int main(int argc, char 
** argv) { is_antiprompt = true; } + chat_add_and_format(model, chat_msgs, "system", assistant_ss.str()); is_interacting = true; printf("\n"); } } + // if current token is not EOG, we add it to current assistant message + if (params.conversation) { + auto id = llama_sampling_last(ctx_sampling); + assistant_ss << llama_token_to_piece(ctx, id, false); + } + if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); @@ -848,8 +872,12 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } + std::string user_inp = params.conversation + ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); @@ -864,6 +892,9 @@ int main(int argc, char ** argv) { output_ss << llama_token_to_piece(ctx, token); } + // reset assistant message + assistant_ss.str(""); + n_remain -= line_inp.size(); LOG("n_remain: %d\n", n_remain); } else { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f9a86961f..ae768097b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2606,17 +2606,9 @@ int main(int argc, char ** argv) { // print sample chat example to make it clear which template is used { - json chat; - chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}}); - chat.push_back({{"role", "user"}, {"content", "Hello"}}); - chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); - chat.push_back({{"role", "user"}, {"content", "How 
are you?"}}); - - const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat); - LOG_INFO("chat template", { - {"chat_example", chat_example}, - {"built_in", params.chat_template.empty()}, + {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, + {"built_in", params.chat_template.empty()}, }); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 63fde9c9f..7ef2a519a 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - size_t alloc_size = 0; - // vector holding all allocated string to be passed to llama_chat_apply_template - std::vector str(messages.size() * 2); - std::vector chat(messages.size()); + std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; - str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); - str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); - alloc_size += str[i*2 + 1].length(); - chat[i].role = str[i*2 + 0].c_str(); - chat[i].content = str[i*2 + 1].c_str(); + std::string role = json_value(curr_msg, "role", std::string("")); + std::string content = json_value(curr_msg, "content", std::string("")); + chat.push_back({role, content}); } - const char * ptr_tmpl = tmpl.empty() ? 
nullptr : tmpl.c_str(); - std::vector buf(alloc_size * 2); - - // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - - // if it turns out that our buffer is too small, we resize it - if ((size_t) res > buf.size()) { - buf.resize(res); - res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - } - - const std::string formatted_chat(buf.data(), res); - + auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); - return formatted_chat; } diff --git a/llama.cpp b/llama.cpp index 49bc93c02..33e6cb722 100644 --- a/llama.cpp +++ b/llama.cpp @@ -18818,10 +18818,10 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) { + } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) { // llama2 template and its variants // [variant] support system message - bool support_system_message = tmpl.find("<>") != std::string::npos; + bool support_system_message = tmpl.find("<>") != std::string::npos || tmpl == "mistral"; // [variant] space before + after response bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; // [variant] add BOS inside history diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index cef9a650b..d19ba8633 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -7,6 +7,7 @@ #include #include "llama.h" +#include "common.h" int main(void) { llama_chat_message conversation[] = { @@ -119,5 +120,24 @@ int main(void) { std::cout << output << "\n-------------------------\n"; assert(output == expected); } + + // test llama_chat_format_single + std::cout << "\n\n=== llama_chat_format_single 
===\n\n"; + std::vector chat2; + chat2.push_back({"system", "You are a helpful assistant"}); + chat2.push_back({"user", "Hello"}); + chat2.push_back({"assistant", "I am assistant"}); + llama_chat_msg new_msg{"user", "How are you"}; + + auto fmt_single = [&](std::string tmpl) { + auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); + std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n"; + return output; + }; + assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); + assert(fmt_single("llama2") == "[INST] How are you [/INST]"); + assert(fmt_single("gemma") == "user\nHow are you\nmodel\n"); + assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + return 0; } From 49c03c79cda17913b72260acdc8157b742cee41c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 25 Jun 2024 13:59:54 +0200 Subject: [PATCH 04/50] cvector: better prompt handling, add "mean vector" method (#8069) * remove completions file * fix inverted vector * add mean method * code style * remove inverted pca hotfix --- common/common.cpp | 22 +++--- common/common.h | 17 +++-- examples/cvector-generator/README.md | 17 ++++- .../cvector-generator/cvector-generator.cpp | 74 ++++++++++--------- examples/cvector-generator/mean.hpp | 48 ++++++++++++ examples/cvector-generator/negative.txt | 5 +- examples/cvector-generator/pca.hpp | 5 +- examples/cvector-generator/positive.txt | 5 +- 8 files changed, 133 insertions(+), 60 deletions(-) create mode 100644 examples/cvector-generator/mean.hpp diff --git a/common/common.cpp b/common/common.cpp index da6db4dc6..c76d0e2c3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1263,11 +1263,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } // cvector params - if (arg == "--completions-file") { - CHECK_ARG - 
params.cvector_completions_file = argv[i]; - return true; - } if (arg == "--positive-file") { CHECK_ARG params.cvector_positive_file = argv[i]; @@ -1278,11 +1273,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cvector_negative_file = argv[i]; return true; } - if (arg == "--completions") { - CHECK_ARG - params.n_completions = std::stoi(argv[i]); - return true; - } if (arg == "--pca-batch") { CHECK_ARG params.n_pca_batch = std::stoi(argv[i]); @@ -1293,6 +1283,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.n_pca_iterations = std::stoi(argv[i]); return true; } + if (arg == "--method") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { invalid_param = true; } + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1626,11 +1624,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --completions-file FNAME", - "completions file (default: '%s')", params.cvector_completions_file.c_str() }); - options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); printf("usage: %s [options]\n", argv[0]); diff --git a/common/common.h b/common/common.h index de90eec51..c541204f6 100644 --- a/common/common.h +++ b/common/common.h @@ -52,6 +52,12 @@ int32_t cpu_get_num_math(); // CLI argument parsing // +// dimensionality reduction methods, used by cvector-generator +enum dimre_method { + DIMRE_METHOD_PCA, + DIMRE_METHOD_MEAN, +}; + struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed @@ -238,13 +244,12 @@ struct gpt_params { bool compute_ppl = true; // whether to compute perplexity // cvector-generator params - int n_completions = 64; - int n_pca_batch = 20; + int n_pca_batch = 100; int n_pca_iterations = 1000; - std::string cvector_outfile = "control_vector.gguf"; - std::string cvector_completions_file = "examples/cvector-generator/completions.txt"; - std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; - std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + dimre_method cvector_dimre_method = DIMRE_METHOD_PCA; + std::string cvector_outfile = "control_vector.gguf"; + std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; + std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index 5182e906d..be4dd5250 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -11,13 +11,16 @@ Related PRs: ```sh # CPU only -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf +./cvector-generator -m 
./llama-3.Q4_K_M.gguf # With GPU -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 # With advanced options -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100 +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 + +# Using mean value instead of PCA +./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean # To see help message ./cvector-generator -h @@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha <|im_start|>system\nAct like a person who is extremely happy.<|im_end|> <|im_start|>system\nYou are in a very good mood today<|im_end|> ``` + +Example to use output file with `llama-cli`: + +(Tips: The control vector works better when apply to layers higher than 10) + +```sh +./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 +``` diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 355905cb0..d4e126ac2 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -2,6 +2,7 @@ #include "llama.h" #include "ggml.h" #include "pca.hpp" +#include "mean.hpp" #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { gpt_params_print_usage(argc, argv, params); printf("\nexample usage:\n"); - printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]); - printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]); - printf("\n 
advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); printf("\n"); } @@ -223,23 +225,30 @@ struct train_context { // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method - void build_v_diff() { + void build_v_diff(bool transpose) { printf("build_v_diff\n"); for (int il = 0; il < n_layers - 1; il++) { auto & diff_tmp = v_diff_tmp[il]; int n_elem = diff_tmp.size() / sizeof(float); GGML_ASSERT(n_elem % n_embd == 0); int n_rows = n_elem / n_embd; - struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd); + struct ggml_tensor * diff = transpose + ? 
ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) + : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); - // copy data & transpose diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible - float * arr = (float *) diff_tmp.data(); - for (int ir = 0; ir < n_rows; ++ir) { - for (int ic = 0; ic < n_embd; ++ic) { - float f = arr[ir*n_embd + ic]; - ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + if (transpose) { + // copy data & transpose + float * arr = (float *) diff_tmp.data(); + for (int ir = 0; ir < n_rows; ++ir) { + for (int ic = 0; ic < n_embd; ++ic) { + float f = arr[ir*n_embd + ic]; + ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + } } + } else { + // only copy + memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); } v_diff.push_back(diff); print_debug_tensor(diff); @@ -263,8 +272,8 @@ struct tokenized_prompt { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - tokens_pos = ::llama_tokenize(ctx, pos, add_bos); - tokens_neg = ::llama_tokenize(ctx, neg, add_bos); + tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); + tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); @@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { fprintf(stderr, "must provide at least one prompt pair\n"); return 1; } - - // create templated prompts - std::vector completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false); - auto format_template = [](std::string persona, std::string suffix) { - // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST] " - return persona + suffix; - }; - for (size_t i = 0; i < positive_prompts.size(); ++i) { - for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) { - // TODO replicate the truncations done by the python implementation - ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); - ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); - } - } + ctx_train.positive_entries = positive_prompts; + ctx_train.negative_entries = negative_prompts; return 0; } @@ -480,15 +477,22 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - // prepare ctx_train for PCA - ctx_train.build_v_diff(); + bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; - // run PCA - PCA::pca_params pca_params; - pca_params.n_threads = params.n_threads; - pca_params.n_batch = params.n_pca_batch; - pca_params.n_iterations = params.n_pca_iterations; - PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + // prepare ctx_train for PCA + ctx_train.build_v_diff(use_pca); + + if (use_pca) { + // run PCA + PCA::pca_params pca_params; + pca_params.n_threads = params.n_threads; + pca_params.n_batch = params.n_pca_batch; + pca_params.n_iterations = params.n_pca_iterations; + PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + } else { + // run mean + mean::run(ctx_train.v_diff, ctx_train.v_final); + } // write output vectors to gguf export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp new file mode 100644 index 000000000..16be5ce3e --- /dev/null +++ b/examples/cvector-generator/mean.hpp @@ -0,0 +1,48 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include + +namespace mean { + +static void run( + const std::vector & v_input, // shape of v_input[0]: [n_embd, n_samples] + const std::vector & v_output) { + 
printf("%s: Running mean...\n", __func__); + for (size_t il = 0; il < v_input.size(); ++il) { + // prepare output vector + struct ggml_tensor * ctrl_out = v_output[il]; + ggml_format_name(ctrl_out, "direction.%ld", il+1); + + // calculate mean vector + struct ggml_tensor * t_layer = v_input[il]; + GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd + for (int ic = 0; ic < t_layer->ne[0]; ic++) { + float f = 0.0; + for (int ir = 0; ir < t_layer->ne[1]; ir++) { + f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0); + } + f /= t_layer->ne[1]; + ggml_set_f32_1d(ctrl_out, ic, f); + } + + // normalize output vector + float norm = 0.0; + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + norm += f*f; + } + norm = sqrt(norm); + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + ggml_set_f32_1d(ctrl_out, i, f / norm); + } + + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); + } +} + +} diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt index 3e9951752..45b9384b3 100644 --- a/examples/cvector-generator/negative.txt +++ b/examples/cvector-generator/negative.txt @@ -1 +1,4 @@ -[INST] Act like a person who is extremely sad. 
[/INST] +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow +<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me +<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow \ No newline at end of file diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 36eadaac2..6ec3141af 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -290,7 +290,7 @@ static void power_iteration( } printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", - __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch); + __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch); } // get output tensor @@ -298,6 +298,9 @@ static void power_iteration( ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); //print_debug_tensor(output); ggml_gallocr_free(allocr); + + // TODO @ngxson : The output vector is randomly inverted + // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171 } static void run_pca( diff --git a/examples/cvector-generator/positive.txt 
b/examples/cvector-generator/positive.txt index 880236787..fea736225 100644 --- a/examples/cvector-generator/positive.txt +++ b/examples/cvector-generator/positive.txt @@ -1 +1,4 @@ -[INST] Act like a person who is extremely happy. [/INST] +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever! +<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you +<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now! 
\ No newline at end of file From c8ad35955ad2c68db172dcd0e857423ab128518d Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 25 Jun 2024 22:03:25 +1000 Subject: [PATCH 05/50] Gguf dump start data offset via --data-offset and some extra refactor (#8054) * gguf-dump: add --data-offset * gguf-dump: add tensor data offset table * gguf-dump: refactor GGUFReader for clarity * gguf-dump: add --data-alignment * gguf-dump.py: Rename variables and adjust comments start_data_offset --> data_offset _build_tensors_info_fields --> _build_tensor_info --- gguf-py/gguf/gguf_reader.py | 29 +++++++++++++++++++++++++---- gguf-py/scripts/gguf-dump.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index e48bc00c3..20432bd25 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -69,6 +69,7 @@ class GGUFReader: # I - same as host, S - swapped byte_order: Literal['I'] | Literal['S'] = 'I' alignment: int = GGUF_DEFAULT_ALIGNMENT + data_offset: int # Note: Internal helper, API may change. 
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = { @@ -88,9 +89,13 @@ class GGUFReader: def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'): self.data = np.memmap(path, mode = mode) offs = 0 + + # Check for GGUF magic if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: raise ValueError('GGUF magic invalid') offs += 4 + + # Check GGUF version temp_version = self._get(offs, np.uint32) if temp_version[0] & 65535 == 0: # If we get 0 here that means it's (probably) a GGUF file created for @@ -103,12 +108,16 @@ class GGUFReader: self.fields: OrderedDict[str, ReaderField] = OrderedDict() self.tensors: list[ReaderTensor] = [] offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) + + # Check tensor count and kv count temp_counts = self._get(offs, np.uint64, 2) offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64])) offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64])) tensor_count, kv_count = temp_counts offs = self._build_fields(offs, kv_count) - offs, tensors_fields = self._build_tensors_fields(offs, tensor_count) + + # Build Tensor Info Fields + offs, tensors_fields = self._build_tensor_info(offs, tensor_count) new_align = self.fields.get('general.alignment') if new_align is not None: if new_align.types != [GGUFValueType.UINT32]: @@ -117,6 +126,7 @@ class GGUFReader: padding = offs % self.alignment if padding != 0: offs += self.alignment - padding + self.data_offset = offs self._build_tensors(offs, tensors_fields) _DT = TypeVar('_DT', bound = npt.DTypeLike) @@ -193,18 +203,29 @@ class GGUFReader: # We can't deal with this one. 
raise ValueError('Unknown/unhandled field type {gtype}') - def _get_tensor(self, orig_offs: int) -> ReaderField: + def _get_tensor_info_field(self, orig_offs: int) -> ReaderField: offs = orig_offs + + # Get Tensor Name name_len, name_data = self._get_str(offs) offs += int(name_len.nbytes + name_data.nbytes) + + # Get Tensor Dimensions Count n_dims = self._get(offs, np.uint32) offs += int(n_dims.nbytes) + + # Get Tensor Dimension Array dims = self._get(offs, np.uint64, n_dims[0]) offs += int(dims.nbytes) + + # Get Tensor Encoding Scheme Type raw_dtype = self._get(offs, np.uint32) offs += int(raw_dtype.nbytes) + + # Get Tensor Offset offset_tensor = self._get(offs, np.uint64) offs += int(offset_tensor.nbytes) + return ReaderField( orig_offs, str(bytes(name_data), encoding = 'utf-8'), @@ -233,10 +254,10 @@ class GGUFReader: offs += field_size return offs - def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: + def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: tensor_fields = [] for _ in range(count): - field = self._get_tensor(offs) + field = self._get_tensor_info_field(offs) offs += sum(int(part.nbytes) for part in field.parts) tensor_fields.append(field) return offs, tensor_fields diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py index 508ca8f0a..a73ca2776 100755 --- a/gguf-py/scripts/gguf-dump.py +++ b/gguf-py/scripts/gguf-dump.py @@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None markdown_content += "\n" + markdown_content += "### Tensor Data Offset\n" + markdown_content += '\n' + markdown_content += 'This table contains the offset and data segment relative to start of file\n' + markdown_content += '\n' + + tensor_mapping_table: list[dict[str, str | int]] = [] + for key, tensor in enumerate(reader.tensors): + data_offset_pretty = '{0:#16x}'.format(tensor.data_offset) + data_size_pretty = '{0:#16x}'.format(tensor.n_bytes) 
+ tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty}) + + tensors_mapping_table_header_map = [ + {'key_name':'t_id', 'header_name':'T_ID', 'align':'right'}, + {'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'}, + {'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'}, + {'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'}, + ] + + markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table) + markdown_content += "\n" + for group in tensor_prefix_order: tensors = tensor_groups[group] group_elements = sum(tensor.n_elements for tensor in tensors) @@ -370,6 +391,8 @@ def main() -> None: parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") parser.add_argument("--json", action="store_true", help="Produce JSON output") parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") + parser.add_argument("--data-offset", action="store_true", help="Start of data offset") + parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field") parser.add_argument("--markdown", action="store_true", help="Produce markdown output") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") @@ -377,7 +400,7 @@ def main() -> None: logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - if not args.json and not args.markdown: + if not args.json and not args.markdown and not args.data_offset and not args.data_alignment: logger.info(f'* Loading: {args.model}') reader = GGUFReader(args.model, 'r') @@ -386,6 +409,10 @@ def main() -> None: dump_metadata_json(reader, args) elif args.markdown: dump_markdown_metadata(reader, args) + elif args.data_offset: + print(reader.data_offset) # noqa: NP100 + elif 
args.data_alignment: + print(reader.alignment) # noqa: NP100 else: dump_metadata(reader, args) From 925c30956dd17723c3a25297bcd0a609aec60663 Mon Sep 17 00:00:00 2001 From: joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> Date: Tue, 25 Jun 2024 08:13:27 -0700 Subject: [PATCH 06/50] Add healthchecks to llama-server containers (#8081) * added healthcheck * added healthcheck * added healthcheck * added healthcheck * added healthcheck * moved curl to base * moved curl to base --- .devops/llama-server-cuda.Dockerfile | 4 +++- .devops/llama-server-intel.Dockerfile | 4 +++- .devops/llama-server-rocm.Dockerfile | 4 +++- .devops/llama-server-vulkan.Dockerfile | 10 ++++------ .devops/llama-server.Dockerfile | 4 +++- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index 0010ffd4c..7bef07a05 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server FROM ${BASE_CUDA_RUN_CONTAINER} as runtime RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 + apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/llama-server /llama-server +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index cec436452..3bf1670ec 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev + apt-get install -y libcurl4-openssl-dev curl COPY --from=build /app/build/bin/llama-server /llama-server ENV LC_ALL=C.utf8 +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + ENTRYPOINT [ "/llama-server" ] diff --git 
a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index f88cf20e5..4b1cdc320 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -43,8 +43,10 @@ ENV CXX=/opt/rocm/llvm/bin/clang++ # Enable cURL ENV LLAMA_CURL=1 RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev + apt-get install -y libcurl4-openssl-dev curl RUN make -j$(nproc) llama-server +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index b0fa0b8e6..2bc2e45d3 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -5,15 +5,11 @@ FROM ubuntu:$UBUNTU_VERSION as build # Install build tools RUN apt update && apt install -y git build-essential cmake wget -# Install Vulkan SDK +# Install Vulkan SDK and cURL RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ apt update -y && \ - apt-get install -y vulkan-sdk - -# Install cURL -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev + apt-get install -y vulkan-sdk libcurl4-openssl-dev curl # Build it WORKDIR /app @@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \ ENV LC_ALL=C.utf8 +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index aa93369be..a53a5c999 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential git libcurl4-openssl-dev + apt-get install -y build-essential git libcurl4-openssl-dev curl WORKDIR /app @@ 
-22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server ENV LC_ALL=C.utf8 +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + ENTRYPOINT [ "/llama-server" ] From dd047b476c8b904e0c25e5dbc5bee6ffde2f6e17 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 25 Jun 2024 19:20:06 +0200 Subject: [PATCH 07/50] disable docker CI on pull requests (#8110) --- .github/workflows/docker.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index b3efe0084..01f1a4522 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -10,7 +10,7 @@ name: Publish Docker image on: - pull_request: + #pull_request: push: branches: - master @@ -22,7 +22,7 @@ concurrency: jobs: push_to_registry: name: Push Docker image to Docker Hub - if: github.event.pull_request.draft == false + #if: github.event.pull_request.draft == false runs-on: ubuntu-latest env: From 84631fe1504de40427dc4b4cdac92fa7ebf65955 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 25 Jun 2024 20:06:20 +0100 Subject: [PATCH 08/50] `json`: support integer minimum, maximum, exclusiveMinimum, exclusiveMaximum (#7797) * json: support minimum for positive integer values * json: fix min 0 * json: min + max integer constraints * json: handle negative min / max integer bounds * json: fix missing paren min/max bug * json: proper paren fix * json: integration test for schemas * json: fix bounds tests * Update json-schema-to-grammar.cpp * json: fix negative max * json: fix negative min (w/ more than 1 digit) * Update test-grammar-integration.cpp * json: nit: move string rules together * json: port min/max integer support to Python & JS * nit: move + rename _build_min_max_int * fix min in [1, 9] * Update test-grammar-integration.cpp * add C++11-compatible replacement for std::string_view * add min/max constrained int field to pydantic json schema example * fix merge * json: add integration tests for min/max 
bounds * reshuffle/merge min/max integ test cases * nits / cleanups * defensive code against string out of bounds (apparently different behaviour of libstdc++ vs. clang's libc++, can't read final NULL char w/ former) --- common/json-schema-to-grammar.cpp | 246 +++++++++++++++- examples/json-schema-pydantic-example.py | 1 + examples/json_schema_to_grammar.py | 184 +++++++++++- .../server/public/json-schema-to-grammar.mjs | 213 ++++++++++++++ tests/test-grammar-integration.cpp | 245 +++++++++++++++- tests/test-json-schema-to-grammar.cpp | 264 ++++++++++++++++++ 6 files changed, 1150 insertions(+), 3 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 10b9b3d1d..07d0e952d 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items return result; } +/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */ +class string_view { + const std::string & _str; + const size_t _start; + const size_t _end; +public: + string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {} + + size_t size() const { + return _end - _start; + } + + size_t length() const { + return size(); + } + + operator std::string() const { + return str(); + } + + std::string str() const { + return _str.substr(_start, _end - _start); + } + + string_view substr(size_t pos, size_t len = std::string::npos) const { + return string_view(_str, _start + pos, len == std::string::npos ? 
_end : _start + pos + len); + } + + char operator[](size_t pos) const { + auto index = _start + pos; + if (index >= _end) { + throw std::out_of_range("string_view index out of range"); + } + return _str[_start + pos]; + } + + bool operator==(const string_view & other) const { + std::string this_str = *this; + std::string other_str = other; + return this_str == other_str; + } +}; + +static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { + auto has_min = min_value != std::numeric_limits::min(); + auto has_max = max_value != std::numeric_limits::max(); + + auto digit_range = [&](char from, char to) { + out << "["; + if (from == to) { + out << from; + } else { + out << from << "-" << to; + } + out << "]"; + }; + auto more_digits = [&](int min_digits, int max_digits) { + out << "[0-9]"; + if (min_digits == max_digits && min_digits == 1) { + return; + } + out << "{"; + out << min_digits; + if (max_digits != min_digits) { + out << ","; + if (max_digits != std::numeric_limits::max()) { + out << max_digits; + } + } + out << "}"; + }; + std::function uniform_range = + [&](const string_view & from, const string_view & to) { + size_t i = 0; + while (i < from.length() && i < to.length() && from[i] == to[i]) { + i++; + } + if (i > 0) { + out << "\"" << from.substr(0, i).str() << "\""; + } + if (i < from.length() && i < to.length()) { + if (i > 0) { + out << " "; + } + auto sub_len = from.length() - i - 1; + if (sub_len > 0) { + auto from_sub = from.substr(i + 1); + auto to_sub = to.substr(i + 1); + auto sub_zeros = repeat("0", sub_len); + auto sub_nines = repeat("9", sub_len); + + auto to_reached = false; + out << "("; + if (from_sub == sub_zeros) { + digit_range(from[i], to[i] - 1); + out << " "; + more_digits(sub_len, sub_len); + } else { + out << "[" << from[i] << "] "; + out << "("; + uniform_range(from_sub, sub_nines); + out << ")"; + if (from[i] < to[i] - 1) { + out << " | "; + if (to_sub == 
sub_nines) { + digit_range(from[i] + 1, to[i]); + to_reached = true; + } else { + digit_range(from[i] + 1, to[i] - 1); + } + out << " "; + more_digits(sub_len, sub_len); + } + } + if (!to_reached) { + out << " | "; + digit_range(to[i], to[i]); + out << " "; + uniform_range(sub_zeros, to_sub); + } + out << ")"; + } else { + out << "[" << from[i] << "-" << to[i] << "]"; + } + } + }; + + if (has_min && has_max) { + if (min_value < 0 && max_value < 0) { + out << "\"-\" ("; + _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true); + out << ")"; + return; + } + + if (min_value < 0) { + out << "\"-\" ("; + _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true); + out << ") | "; + min_value = 0; + } + + auto min_s = std::to_string(min_value); + auto max_s = std::to_string(max_value); + auto min_digits = min_s.length(); + auto max_digits = max_s.length(); + + for (auto digits = min_digits; digits < max_digits; digits++) { + uniform_range(min_s, repeat("9", digits)); + min_s = "1" + repeat("0", digits); + out << " | "; + } + uniform_range(min_s, max_s); + return; + } + + auto less_decimals = std::max(decimals_left - 1, 1); + + if (has_min) { + if (min_value < 0) { + out << "\"-\" ("; + _build_min_max_int(std::numeric_limits::min(), -min_value, out, decimals_left, /* top_level= */ false); + out << ") | [0] | [1-9] "; + more_digits(0, decimals_left - 1); + } else if (min_value == 0) { + if (top_level) { + out << "[0] | [1-9] "; + more_digits(0, less_decimals); + } else { + more_digits(1, decimals_left); + } + } else if (min_value <= 9) { + char c = '0' + min_value; + auto range_start = top_level ? 
'1' : '0'; + if (c > range_start) { + digit_range(range_start, c - 1); + out << " "; + more_digits(1, less_decimals); + out << " | "; + } + digit_range(c, '9'); + out << " "; + more_digits(0, less_decimals); + } else { + auto min_s = std::to_string(min_value); + auto len = min_s.length(); + auto c = min_s[0]; + + if (c > '1') { + digit_range(top_level ? '1' : '0', c - 1); + out << " "; + more_digits(len, less_decimals); + out << " | "; + } + digit_range(c, c); + out << " ("; + _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits::max(), out, less_decimals, /* top_level= */ false); + out << ")"; + if (c < '9') { + out << " | "; + digit_range(c + 1, '9'); + out << " "; + more_digits(len - 1, less_decimals); + } + } + return; + } + + if (has_max) { + if (max_value >= 0) { + if (top_level) { + out << "\"-\" [1-9] "; + more_digits(0, less_decimals); + out << " | "; + } + _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true); + } else { + out << "\"-\" ("; + _build_min_max_int(-max_value, std::numeric_limits::max(), out, decimals_left, /* top_level= */ false); + out << ")"; + } + return; + } + + throw std::runtime_error("At least one of min_value or max_value must be set"); +} + const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}"; struct BuiltinRule { @@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) { return "\"" + escaped + "\""; } - class SchemaConverter { private: std::function _fetch_json; @@ -686,6 +912,24 @@ public: int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; int max_len = schema.contains("maxLength") ? 
schema["maxLength"].get() : std::numeric_limits::max(); return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); + } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) { + int min_value = std::numeric_limits::min(); + int max_value = std::numeric_limits::max(); + if (schema.contains("minimum")) { + min_value = schema["minimum"].get(); + } else if (schema.contains("exclusiveMinimum")) { + min_value = schema["exclusiveMinimum"].get() + 1; + } + if (schema.contains("maximum")) { + max_value = schema["maximum"].get(); + } else if (schema.contains("exclusiveMaximum")) { + max_value = schema["exclusiveMaximum"].get() - 1; + } + std::stringstream out; + out << "("; + _build_min_max_int(min_value, max_value, out); + out << ") space"; + return _add_rule(rule_name, out.str()); } else if (schema.empty() || schema_type == "object") { return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); } else { diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py index cc64e572b..2240188cd 100644 --- a/examples/json-schema-pydantic-example.py +++ b/examples/json-schema-pydantic-example.py @@ -53,6 +53,7 @@ if __name__ == '__main__': question: str concise_answer: str justification: str + stars: Annotated[int, Field(ge=1, le=5)] class PyramidalSummary(BaseModel): title: str diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index b588497b9..86500a8c3 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -4,7 +4,7 @@ import itertools import json import re import sys -from typing import Any, Dict, List, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union def _build_repetition(item_rule, min_items, max_items, separator_rule=None): @@ -23,6 
+23,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) return f'({result})?' if min_items == 0 else result +def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): + has_min = min_value != None + has_max = max_value != None + + def digit_range(from_char: str, to_char: str): + out.append("[") + if from_char == to_char: + out.append(from_char) + else: + out.append(from_char) + out.append("-") + out.append(to_char) + out.append("]") + + def more_digits(min_digits: int, max_digits: int): + out.append("[0-9]") + if min_digits == max_digits and min_digits == 1: + return + out.append("{") + out.append(str(min_digits)) + if max_digits != min_digits: + out.append(",") + if max_digits != sys.maxsize: + out.append(str(max_digits)) + out.append("}") + + def uniform_range(from_str: str, to_str: str): + i = 0 + while i < len(from_str) and from_str[i] == to_str[i]: + i += 1 + if i > 0: + out.append("\"") + out.append(from_str[:i]) + out.append("\"") + if i < len(from_str): + if i > 0: + out.append(" ") + sub_len = len(from_str) - i - 1 + if sub_len > 0: + from_sub = from_str[i+1:] + to_sub = to_str[i+1:] + sub_zeros = "0" * sub_len + sub_nines = "9" * sub_len + + to_reached = False + out.append("(") + if from_sub == sub_zeros: + digit_range(from_str[i], chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + else: + out.append("[") + out.append(from_str[i]) + out.append("] ") + out.append("(") + uniform_range(from_sub, sub_nines) + out.append(")") + if ord(from_str[i]) < ord(to_str[i]) - 1: + out.append(" | ") + if to_sub == sub_nines: + digit_range(chr(ord(from_str[i]) + 1), to_str[i]) + to_reached = True + else: + digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1)) 
+ out.append(" ") + more_digits(sub_len, sub_len) + if not to_reached: + out.append(" | ") + digit_range(to_str[i], to_str[i]) + out.append(" ") + uniform_range(sub_zeros, to_sub) + out.append(")") + else: + out.append("[") + out.append(from_str[i]) + out.append("-") + out.append(to_str[i]) + out.append("]") + + if has_min and has_max: + if min_value < 0 and max_value < 0: + out.append("\"-\" (") + _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) + out.append(")") + return + + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True) + out.append(") | ") + min_value = 0 + + min_s = str(min_value) + max_s = str(max_value) + min_digits = len(min_s) + max_digits = len(max_s) + + for digits in range(min_digits, max_digits): + uniform_range(min_s, "9" * digits) + min_s = "1" + "0" * digits + out.append(" | ") + uniform_range(min_s, max_s) + return + + less_decimals = max(decimals_left - 1, 1) + + if has_min: + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) + out.append(") | [0] | [1-9] ") + more_digits(0, decimals_left - 1) + elif min_value == 0: + if top_level: + out.append("[0] | [1-9] ") + more_digits(0, less_decimals) + else: + more_digits(1, decimals_left) + elif min_value <= 9: + c = str(min_value) + range_start = '1' if top_level else '0' + if c > range_start: + digit_range(range_start, chr(ord(c) - 1)) + out.append(" ") + more_digits(1, less_decimals) + out.append(" | ") + digit_range(c, "9") + out.append(" ") + more_digits(0, less_decimals) + else: + min_s = str(min_value) + length = len(min_s) + c = min_s[0] + + if c > "1": + digit_range("1" if top_level else "0", chr(ord(c) - 1)) + out.append(" ") + more_digits(length, less_decimals) + out.append(" | ") + digit_range(c, c) + out.append(" (") + _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False) + out.append(")") + if 
c < "9": + out.append(" | ") + digit_range(chr(ord(c) + 1), "9") + out.append(" ") + more_digits(length - 1, less_decimals) + return + + if has_max: + if max_value >= 0: + if top_level: + out.append("\"-\" [1-9] ") + more_digits(0, less_decimals) + out.append(" | ") + _generate_min_max_int(0, max_value, out, decimals_left, top_level=True) + else: + out.append("\"-\" (") + _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False) + out.append(")") + return + + raise RuntimeError("At least one of min_value or max_value must be set") class BuiltinRule: def __init__(self, content: str, deps: list = None): @@ -432,6 +596,24 @@ class SchemaConverter: return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + elif schema_type in (None, 'integer') and \ + ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): + min_value = None + max_value = None + if 'minimum' in schema: + min_value = schema['minimum'] + elif 'exclusiveMinimum' in schema: + min_value = schema['exclusiveMinimum'] + 1 + if 'maximum' in schema: + max_value = schema['maximum'] + elif 'exclusiveMaximum' in schema: + max_value = schema['exclusiveMaximum'] - 1 + + out = ["("] + _generate_min_max_int(min_value, max_value, out) + out.append(") space") + return self._add_rule(rule_name, ''.join(out)) + elif (schema_type == 'object') or (len(schema) == 0): return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index faed6a32c..f340f94bd 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) { return minItems === 0 ? 
`(${result})?` : result; } +function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) { + const hasMin = minValue !== null; + const hasMax = maxValue !== null; + + function digitRange(fromChar, toChar) { + out.push("["); + if (fromChar === toChar) { + out.push(fromChar); + } else { + out.push(fromChar); + out.push("-"); + out.push(toChar); + } + out.push("]"); + } + + function moreDigits(minDigits, maxDigits) { + out.push("[0-9]"); + if (minDigits === maxDigits && minDigits === 1) { + return; + } + out.push("{"); + out.push(minDigits.toString()); + if (maxDigits !== minDigits) { + out.push(","); + if (maxDigits !== Number.MAX_SAFE_INTEGER) { + out.push(maxDigits.toString()); + } + } + out.push("}"); + } + + function uniformRange(fromStr, toStr) { + let i = 0; + while (i < fromStr.length && fromStr[i] === toStr[i]) { + i++; + } + if (i > 0) { + out.push("\""); + out.push(fromStr.slice(0, i)); + out.push("\""); + } + if (i < fromStr.length) { + if (i > 0) { + out.push(" "); + } + const subLen = fromStr.length - i - 1; + if (subLen > 0) { + const fromSub = fromStr.slice(i + 1); + const toSub = toStr.slice(i + 1); + const subZeros = "0".repeat(subLen); + const subNines = "9".repeat(subLen); + + let toReached = false; + out.push("("); + if (fromSub === subZeros) { + digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1)); + out.push(" "); + moreDigits(subLen, subLen); + } else { + out.push("["); + out.push(fromStr[i]); + out.push("] "); + out.push("("); + uniformRange(fromSub, subNines); + out.push(")"); + if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) { + out.push(" | "); + if (toSub === subNines) { + digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]); + toReached = true; + } else { + digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1)); + } + out.push(" "); + moreDigits(subLen, subLen); + } + } + if (!toReached) { + out.push(" | "); + 
digitRange(toStr[i], toStr[i]); + out.push(" "); + uniformRange(subZeros, toSub); + } + out.push(")"); + } else { + out.push("["); + out.push(fromStr[i]); + out.push("-"); + out.push(toStr[i]); + out.push("]"); + } + } + } + + if (hasMin && hasMax) { + if (minValue < 0 && maxValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true); + out.push(")"); + return; + } + + if (minValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(0, -minValue, out, decimalsLeft, true); + out.push(") | "); + minValue = 0; + } + + let minS = minValue.toString(); + const maxS = maxValue.toString(); + const minDigits = minS.length; + const maxDigits = maxS.length; + + for (let digits = minDigits; digits < maxDigits; digits++) { + uniformRange(minS, "9".repeat(digits)); + minS = "1" + "0".repeat(digits); + out.push(" | "); + } + uniformRange(minS, maxS); + return; + } + + const lessDecimals = Math.max(decimalsLeft - 1, 1); + + if (hasMin) { + if (minValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(null, -minValue, out, decimalsLeft, false); + out.push(") | [0] | [1-9] "); + moreDigits(0, decimalsLeft - 1); + } else if (minValue === 0) { + if (topLevel) { + out.push("[0] | [1-9] "); + moreDigits(0, lessDecimals); + } else { + moreDigits(1, decimalsLeft); + } + } else if (minValue <= 9) { + const c = minValue.toString(); + const range_start = topLevel ? '1' : '0'; + if (c > range_start) { + digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1)); + out.push(" "); + moreDigits(1, lessDecimals); + out.push(" | "); + } + digitRange(c, "9"); + out.push(" "); + moreDigits(0, lessDecimals); + } else { + const minS = minValue.toString(); + const length = minS.length; + const c = minS[0]; + + if (c > "1") { + digitRange(topLevel ? 
"1" : "0", String.fromCharCode(c.charCodeAt(0) - 1)); + out.push(" "); + moreDigits(length, lessDecimals); + out.push(" | "); + } + digitRange(c, c); + out.push(" ("); + _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false); + out.push(")"); + if (c < "9") { + out.push(" | "); + digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9"); + out.push(" "); + moreDigits(length - 1, lessDecimals); + } + } + return; + } + + if (hasMax) { + if (maxValue >= 0) { + if (topLevel) { + out.push("\"-\" [1-9] "); + moreDigits(0, lessDecimals); + out.push(" | "); + } + _generateMinMaxInt(0, maxValue, out, decimalsLeft, true); + } else { + out.push("\"-\" ("); + _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false); + out.push(")"); + } + return; + } + + throw new Error("At least one of minValue or maxValue must be set"); +} + class BuiltinRule { constructor(content, deps) { this.content = content; @@ -435,6 +630,24 @@ export class SchemaConverter { const minLen = schema.minLength || 0; const maxLen = schema.maxLength; return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space'); + } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) { + let minValue = null; + let maxValue = null; + if ('minimum' in schema) { + minValue = schema.minimum; + } else if ('exclusiveMinimum' in schema) { + minValue = schema.exclusiveMinimum + 1; + } + if ('maximum' in schema) { + maxValue = schema.maximum; + } else if ('exclusiveMaximum' in schema) { + maxValue = schema.exclusiveMaximum - 1; + } + + const out = ["("]; + _generateMinMaxInt(minValue, maxValue, out); + out.push(") space"); + return this._addRule(ruleName, out.join('')); } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) { return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object'])); } else { diff --git 
a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 96f90c01e..5b3992236 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -148,6 +148,250 @@ static void test_schema(const std::string & test_desc, const std::string & schem } static void test_simple_grammar() { + test_schema( + "min 0", + R"""({ + "type": "integer", + "minimum": 0 + })""", + // Passing strings + { + "0", + "10", + "12", + "10000", + }, + // Failing strings + { + "-1", + "-10", + "-10000", + "-100000000000000000000000000000000", + "100000000000000000000000000000000", + "00", + "01", + "-0", + } + ); + test_schema( + "min 2", + // Schema + R"""({ + "type": "integer", + "minimum": 2 + })""", + // Passing strings + { + "2", + "3", + "4", + "10", + "20", + "1234567890000000", + }, + // Failing strings + { + "0", + "1", + "-1", + "-100", + "0", + "1", + "01", + "02", + "12345678900000000", + } + ); + test_schema( + "min 456", + R"""({ + "type": "integer", + "minimum": 456 + })""", + // Passing strings + { + "456", + "4560", + "457", + "460", + "500", + }, + // Failing strings + { + "455", + "356", + "50", + "050", + "-1", + "-456", + } + ); + test_schema( + "min -123", + R"""({ + "type": "integer", + "minimum": -123 + })""", + // Passing strings + { + "-123", + "-122", + "-11", + "-1", + "0", + "1", + "123", + "1234", + "2345", + }, + // Failing strings + { + "-1234", + "-124", + } + ); + + test_schema( + "max 9999", + // Schema + R"""({ + "type": "integer", + "maximum": 9999 + })""", + // Passing strings + { + "-99999", + "0", + "9999", + }, + // Failing strings + { + "10000", + "99991", + } + ); + test_schema( + "max -9999", + // Schema + R"""({ + "type": "integer", + "maximum": -9999 + })""", + // Passing strings + { + "-10000", + "-9999", + }, + // Failing strings + { + "-9998", + "0", + "9999", + } + ); + test_schema( + "min 5 max 30", + // Schema + R"""({ + "type": "integer", + "minimum": 5, + "maximum": 30 + })""", + // Passing 
strings + { + "5", + "10", + "30", + }, + // Failing strings + { + "05", + "4", + "-1", + "31", + "123", + "0123", + } + ); + test_schema( + "min -1 max 1", + R"""({ + "type": "integer", + "minimum": -1, + "maximum": 1 + })""", + // Passing strings + { + "-1", + "0", + "1", + }, + // Failing strings + { + "-11", + "-10", + "-2", + "2", + "10", + "11", + } + ); + test_schema( + "min -123 max 42", + R"""({ + "type": "integer", + "minimum": -123, + "maximum": 42 + })""", + // Passing strings + { + "-123", + "-122", + "-13", + "-11", + "-2", + "-1", + "0", + "1", + "5", + "10", + "39", + "40", + "42", + }, + // Failing strings + { + "-0123", + "-124", + "-1123", + "-200", + "43", + "123", + "0123", + } + ); + test_schema( + "exclusive min / max", + // Schema + R"""({ + "type": "integer", + "exclusiveMinimum": 0, + "exclusiveMaximum": 10000 + })""", + // Passing strings + { + "1", + "9999", + }, + // Failing strings + { + "0", + "01", + "10000", + "99999", + } + ); + // Test case for a simple grammar test_grammar( "simple grammar", @@ -773,7 +1017,6 @@ static void test_json_schema() { } ); - test_schema( "min+max items", // Schema diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 87bc66b69..2e591bd71 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -80,6 +80,232 @@ static void test_all(const std::string & lang, std::function Date: Tue, 25 Jun 2024 21:07:28 +0200 Subject: [PATCH 09/50] llama : return nullptr from llama_grammar_init (#8093) * llama : return nullptr from llama_grammar_init This commit updates llama_grammar_init to return nullptr instead of throwing an exception. The motivation for this is that this function is declared inside an extern "C" block and is intended/may be used from C code which will not be able to handle exceptions thrown, and results in undefined behavior. 
On Windows and using MSVC the following warning is currently generated: ```console C:\llama.cpp\llama.cpp(13998,1): warning C4297: 'llama_grammar_init': function assumed not to throw an exception but does C:\llama.cpp\llama.cpp(13998,1): message : __declspec(nothrow), throw(), noexcept(true), or noexcept was specified on the function ``` Signed-off-by: Daniel Bevenius * squash! llama : return nullptr from llama_grammar_init Add checks for nullptr when calling llama_grammar_init. Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius Co-authored-by: Clint Herron --- common/sampling.cpp | 12 ++++++++++-- examples/gbnf-validator/gbnf-validator.cpp | 4 +++- llama.cpp | 3 ++- llama.h | 6 ++++++ tests/test-grammar-integration.cpp | 6 +++--- tests/test-llama-grammar.cpp | 4 ++++ 6 files changed, 28 insertions(+), 7 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index f1f803516..9f332fe57 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ std::vector grammar_rules(result->parsed_grammar.c_rules()); - result->grammar = llama_grammar_init( + struct llama_grammar * grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } + result->grammar = grammar; } result->prev.resize(params.n_prev); @@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) { if (!ctx->parsed_grammar.rules.empty()) { std::vector grammar_rules(ctx->parsed_grammar.c_rules()); - ctx->grammar = llama_grammar_init( + struct llama_grammar * grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } + ctx->grammar = grammar; } 
std::fill(ctx->prev.begin(), ctx->prev.end(), 0); diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 0406dc339..dd53ba9b1 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -101,7 +101,9 @@ int main(int argc, char** argv) { auto grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } // Read the input file std::string input_str; { diff --git a/llama.cpp b/llama.cpp index 33e6cb722..dd2823e65 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14500,7 +14500,8 @@ struct llama_grammar * llama_grammar_init( continue; } if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) { - throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i)); + LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i); + return nullptr; } } diff --git a/llama.h b/llama.h index 53e06d9db..82d15747f 100644 --- a/llama.h +++ b/llama.h @@ -924,6 +924,12 @@ extern "C" { // Grammar // + /// Initialize a llama_grammar. + /// + /// @param rules The rule elements of the grammar to initialize. + /// @param n_rules The number of rules. + /// @param start_rule_index The index of the root rule (the starting point of the grammar). + /// @return The initialized llama_grammar or nullptr if initialization failed. 
LLAMA_API struct llama_grammar * llama_grammar_init( const llama_grammar_element ** rules, size_t n_rules, diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 5b3992236..5750d362a 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -36,10 +36,10 @@ static llama_grammar* build_grammar(const std::string & grammar_str) { static bool test_build_grammar_fails(const std::string & grammar_str) { fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str()); bool grammar_fails = false; - try { - build_grammar(grammar_str); + llama_grammar * grammar = build_grammar(grammar_str); + if (grammar != nullptr) { fprintf(stderr, " ❌ Expected build failure, but succeeded\n"); - } catch (const std::exception & err) { + } else { grammar_fails = true; fprintf(stdout, " ✅︎\n"); } diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index 27ca4d265..c8badb206 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -116,6 +116,10 @@ int main() std::vector grammar_rules(parsed_grammar.c_rules()); grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) + { + throw std::runtime_error("Failed to initialize llama_grammar"); + } std::vector> expected_stacks = { { From 6fcbf6823553efabe52ed83e3c2a3329aa3387d1 Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Tue, 25 Jun 2024 21:14:35 +0200 Subject: [PATCH 10/50] llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama : add T5 model architecture, tensors and model header parameters * llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap --------- Co-authored-by: Stanisław Szymczyk --- 
llama.cpp | 619 ++++++++++++++++++++++++++++++++++++++++++++++++---- llama.h | 2 + unicode.cpp | 2 +- unicode.h | 1 + 4 files changed, 586 insertions(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index dd2823e65..78a21008f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -226,6 +226,7 @@ enum llm_arch { LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK2, LLM_ARCH_BITNET, + LLM_ARCH_T5, LLM_ARCH_UNKNOWN, }; @@ -265,6 +266,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_BITNET, "bitnet" }, + { LLM_ARCH_T5, "t5" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -297,6 +299,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, + LLM_KV_DECODER_START_TOKEN_ID, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -309,6 +312,7 @@ enum llm_kv { LLM_KV_ATTENTION_CAUSAL, LLM_KV_ATTENTION_Q_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK, + LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -346,6 +350,8 @@ enum llm_kv { LLM_KV_TOKENIZER_ADD_BOS, LLM_KV_TOKENIZER_ADD_EOS, LLM_KV_TOKENIZER_ADD_PREFIX, + LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, + LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_PREFIX_ID, @@ -383,18 +389,20 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, + { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, - { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, - { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, - { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, - { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, - { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" }, - { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, - { LLM_KV_ATTENTION_LAYERNORM_EPS, 
"%s.attention.layer_norm_epsilon" }, - { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, - { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, - { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, + { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, + { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, + { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, + { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, + { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" }, + { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, + { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, + { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, + { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, + { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, + { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -415,29 +423,31 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { 
LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, - { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, - { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { 
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, + { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, + { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, }; struct LLM_KV { @@ -504,6 +514,34 @@ enum llm_tensor { LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_SUB_NORM, LLM_TENSOR_FFN_SUB_NORM, + LLM_TENSOR_DEC_ATTN_NORM, + LLM_TENSOR_DEC_ATTN_Q, + LLM_TENSOR_DEC_ATTN_K, + LLM_TENSOR_DEC_ATTN_V, + LLM_TENSOR_DEC_ATTN_OUT, + LLM_TENSOR_DEC_ATTN_REL_B, + LLM_TENSOR_DEC_CROSS_ATTN_NORM, + LLM_TENSOR_DEC_CROSS_ATTN_Q, + LLM_TENSOR_DEC_CROSS_ATTN_K, + LLM_TENSOR_DEC_CROSS_ATTN_V, + LLM_TENSOR_DEC_CROSS_ATTN_OUT, + LLM_TENSOR_DEC_CROSS_ATTN_REL_B, + LLM_TENSOR_DEC_FFN_NORM, + LLM_TENSOR_DEC_FFN_GATE, + LLM_TENSOR_DEC_FFN_DOWN, + LLM_TENSOR_DEC_FFN_UP, + LLM_TENSOR_DEC_OUTPUT_NORM, + LLM_TENSOR_ENC_ATTN_NORM, + LLM_TENSOR_ENC_ATTN_Q, + LLM_TENSOR_ENC_ATTN_K, + LLM_TENSOR_ENC_ATTN_V, + LLM_TENSOR_ENC_ATTN_OUT, + LLM_TENSOR_ENC_ATTN_REL_B, + LLM_TENSOR_ENC_FFN_NORM, + LLM_TENSOR_ENC_FFN_GATE, + LLM_TENSOR_ENC_FFN_DOWN, + LLM_TENSOR_ENC_FFN_UP, + LLM_TENSOR_ENC_OUTPUT_NORM, }; static const std::map> LLM_TENSOR_NAMES = { @@ -1135,6 +1173,41 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" }, }, }, + { + LLM_ARCH_T5, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" }, + { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" }, + { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" }, + { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" }, + { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" }, + { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" }, + { LLM_TENSOR_DEC_ATTN_REL_B, 
"dec.blk.%d.attn_rel_b" }, + { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" }, + { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" }, + { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" }, + { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" }, + { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" }, + { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" }, + { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" }, + { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" }, + { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" }, + { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2356,6 +2429,11 @@ struct llama_vocab { bool tokenizer_add_bos = false; bool tokenizer_add_eos = false; bool tokenizer_ignore_merges = false; + bool tokenizer_remove_extra_whitespaces = false; + bool tokenizer_escape_whitespaces = true; + bool tokenizer_treat_whitespace_as_suffix = false; + + std::vector precompiled_charsmap; int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { GGML_ASSERT(token_left.find(' ') == std::string::npos); @@ -4191,6 +4269,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ case LLAMA_VOCAB_TYPE_SPM: return "SPM"; case LLAMA_VOCAB_TYPE_BPE: return "BPE"; case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + case LLAMA_VOCAB_TYPE_UGM: return "UGM"; default: return 
"unknown"; } } @@ -4870,6 +4949,45 @@ static void llm_load_vocab( vocab.special_pad_id = -1; vocab.special_cls_id = -1; vocab.special_mask_id = -1; + } else if (tokenizer_model == "t5") { + vocab.type = LLAMA_VOCAB_TYPE_UGM; + + // default special tokens + vocab.special_bos_id = -1; + vocab.special_eos_id = 1; + vocab.special_unk_id = 2; + vocab.special_sep_id = -1; + vocab.special_pad_id = 0; + vocab.special_cls_id = -1; + vocab.special_mask_id = -1; + + const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); + if (add_space_prefix_keyidx != -1) { + vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + } // The default value of add_space_prefix is true. + + const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str()); + if (remove_extra_whitespaces_keyidx != -1) { + vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx); + } // The default value of remove_extra_whitespaces is false. 
+ + const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); + if (precompiled_charsmap_keyidx != -1) { + size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); + const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); + vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap); +#ifdef IS_BIG_ENDIAN + // correct endianness of data in precompiled_charsmap binary blob + uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0]; + *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); + assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap); + size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t); + uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)]; + for (size_t i = 0; i < xcda_array_size; ++i) { + xcda_array[i] = __builtin_bswap32(xcda_array[i]); + } +#endif + } } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -4952,6 +5070,10 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.tokenizer_add_bos = true; vocab.tokenizer_add_eos = false; + } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + vocab.tokenizer_add_bos = false; + vocab.tokenizer_add_eos = true; } else { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } @@ -13213,12 +13335,18 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED; } +static bool llama_is_unused_token(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED; +} + static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(llama_vocab_get_type(vocab) !=
LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); const auto & token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { - case LLAMA_VOCAB_TYPE_SPM: { + case LLAMA_VOCAB_TYPE_SPM: + case LLAMA_VOCAB_TYPE_UGM: { auto buf = token_data.text.substr(3, 2); return strtol(buf.c_str(), NULL, 16); } @@ -13238,7 +13366,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { - case LLAMA_VOCAB_TYPE_SPM: { + case LLAMA_VOCAB_TYPE_SPM: + case LLAMA_VOCAB_TYPE_UGM: { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; auto token = vocab.token_to_id.find(buf); if (token != vocab.token_to_id.end()) { @@ -13826,6 +13955,383 @@ struct llm_tokenizer_wpm { const llama_vocab & vocab; }; +struct naive_trie { + naive_trie() : has_value(false), value(0) { + } + void insert(const char * key, size_t len, int32_t value = 0) { + if (len == 0) { + this->has_value = true; + this->value = value; + return; + } + char c = key[0]; + auto res = children.find(c); + if (res != children.end()) { + res->second.insert(key + 1, len - 1, value); + } else { + auto res = children.insert(std::make_pair(c, naive_trie())); + res.first->second.insert(key + 1, len - 1, value); + } + } + std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) { + if (len == 0 || offset == len) { + return std::make_pair(key, offset); + } + char c = key[offset]; + auto res = children.find(c); + if (res != children.end()) { + return res->second.get_longest_prefix(key, len, offset + 1); + } else { + return std::make_pair(key, offset); + } + } + struct naive_trie * traverse(const char c) { + auto res = children.find(c); + if (res != children.end()) { + return &res->second; + } else { + return NULL; + } + } + std::map children; + bool has_value; + llama_token value; +}; + 
+struct llm_tokenizer_ugm { + llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) { + if (vocab.precompiled_charsmap.size() > 0) { + size_t charsmap_offset = 0; + + // First four bytes of precompiled_charsmap contains length of binary + // blob containing XOR-compressed compact double array (XCDA) entries + uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0]; + charsmap_offset += sizeof(xcda_blob_size); + if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) { + throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); + } + + // Next xcda_blob_size bytes contain entries of XOR-compressed compact + // double array (XCDA). Each entry is bit-packed into a 32-bit integer. + xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset]; + xcda_array_size = xcda_blob_size / sizeof(uint32_t); + charsmap_offset += xcda_blob_size; + + // Remaining bytes of precompiled charsmap contain null-terminated + // replacement strings for prefixes matched by the XCDA. 
+ prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset]; + prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset; + } + + for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) { + const auto &token_data = vocab.id_to_token[id]; + + if (llama_is_normal_token(vocab, id)) { + min_score = std::min(min_score, token_data.score); + max_score = std::max(max_score, token_data.score); + } + + if (llama_is_normal_token(vocab, id) || + llama_is_user_defined_token(vocab, id) || + llama_is_unused_token(vocab, id)) { + token_matcher.insert(token_data.text.data(), token_data.text.size(), id); + } + + if (llama_is_user_defined_token(vocab, id)) { + user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size()); + } + } + + unknown_token_score = min_score - unknown_token_score_penalty; + } + + /* This implementation is based on SentencePiece optimized Viterbi algorithm for + * unigram language models. The general idea is to: + * - move along the input sequence in steps of one UTF code point, + * - at each step find all possible tokenizations of the prefix by + * traversing the tokens trie, + * - for each tokenization store the best one so far (by higher score) + * - use the position in sequence after given token as an index to store + * results + * - if there was no valid tokenization of the current UTF code point + * then use unknown token with additional score penalty + * After processing the whole sequence we backtrack from the end to get + * the best tokenization. 
+ */ + void tokenize(const std::string & text, std::vector & output) { + // normalize the input first + std::string normalized; + normalize(text, &normalized); + size_t input_len = normalized.size(); + + // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores + std::vector tokenization_results(input_len + 1, {0, 0, -FLT_MAX}); + // at the beginning tokenization score is zero + tokenization_results[0] = { 0, 0, 0 }; + + for (size_t input_offset = 0; input_offset < input_len;) { + size_t prefix_offset = input_offset; + // calculate how many code units are in the currently processed UTF code point + size_t n_utf8_code_units = std::min(utf8_len(normalized[input_offset]), input_len - input_offset); + + // traverse the token matcher trie to find a matching token + bool single_codepoint_token_found = false; + const struct best_tokenization & current_best = tokenization_results[input_offset]; + struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]); + + while (prefix_offset <= input_len && node != NULL) { + // check if we found valid token in prefix + if (node->has_value) { + // check if it corresponds to the whole UTF code point + if (prefix_offset - input_offset == n_utf8_code_units) { + single_codepoint_token_found = true; + } + llama_token token_id = node->value; + const auto &token_data = vocab.id_to_token[token_id]; + + // we set the user-defined token scores to 0 to make them more likely to be selected + // (normal token scores are log probabilities, so they are negative) + // score type is double here to make tokenization results exactly + // the same as in the HF tokenizer using SentencePiece + const double token_score = llama_is_user_defined_token(vocab, token_id) ? 
0.0 : token_data.score; + const double challenger_score = current_best.score_sum + token_score; + struct best_tokenization & current_champ = tokenization_results[prefix_offset]; + if (challenger_score > current_champ.score_sum) { + struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score }; + current_champ = challenger; + } + } + node = node->traverse(normalized[prefix_offset++]); + } + + // if we didn't find a valid token corresponding to the whole UTF code point + // then use unknown token as the tokenization of this UTF code point + if (!single_codepoint_token_found) { + const double challenger_score = current_best.score_sum + unknown_token_score; + prefix_offset = input_offset + n_utf8_code_units; + struct best_tokenization & current_champ = tokenization_results[prefix_offset]; + if (challenger_score > current_champ.score_sum) { + struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score }; + current_champ = challenger; + } + } + + // move to the next UTF code point + input_offset += n_utf8_code_units; + } + + // now backtrack from the end to gather token ids of the best tokenization + // merge sequences of consecutive unknown tokens into single unknown tokens + bool is_prev_unknown = false; + for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) { + bool is_unknown = tokenization.token_id == vocab.special_unk_id; + if (!(is_prev_unknown && is_unknown)) { + output.push_back(tokenization.token_id); + } + if (tokenization.input_offset == 0) { + break; + } + is_prev_unknown = is_unknown; + } + + // reverse the output since we added tokens starting from the end of the input + std::reverse(output.begin(), output.end()); + } + +private: + const llama_vocab & vocab; + + // helper structure for returning normalization results + struct normalization_result { + const char * normalized; + size_t 
normalized_len; + size_t consumed_input; + }; + + void normalize(const std::string& input, std::string * normalized) { + normalized->clear(); + normalized->reserve(input.size() * 3); + + const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " "; + + bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; + bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; + bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces; + + bool is_space_prepended = false; + bool processing_non_ws = false; + + size_t input_len = input.size(); + + for (size_t input_offset = 0; input_offset < input_len; ) { + auto norm_res = normalize_prefix(input, input_offset); + for (size_t i = 0; i < norm_res.normalized_len; i++) { + char c = norm_res.normalized[i]; + if (c != ' ') { + if (!processing_non_ws) { + processing_non_ws = true; + if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) { + normalized->append(space); + is_space_prepended = true; + } + } + normalized->push_back(c); + } else { + if (processing_non_ws) { + processing_non_ws = false; + } + if (!shall_merge_spaces) { + normalized->append(space); + } + } + } + + input_offset += norm_res.consumed_input; + } + + if (shall_append_space) { + normalized->append(space); + } + } + + /* + * This structure is a view wrapper for XOR-compressed double array (XCDA) + * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries. 
+ * Each bit-packed entry contains: + * - BASE array value in bits 10-30 + * - LCHECK array value in bits 0-7 + * - LEAF array value in bit 8 + * Entries containing indexes of replacement sequences have set bit 31 + */ + struct xcda_array_view { + public: + xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) { + } + uint32_t get_base(size_t index) { + uint32_t packed_node = get_node(index); + return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6); + } + uint32_t get_lcheck(size_t index) { + uint32_t packed_node = get_node(index); + return packed_node & ((1U << 31) | 0xff); + } + bool get_leaf(size_t index) { + uint32_t packed_node = get_node(index); + return (packed_node >> 8) & 1; + } + uint32_t get_value(size_t index) { + uint32_t packed_node = get_node(index); + return packed_node & ((1U << 31) - 1); + } + private: + uint32_t get_node(size_t index) { + if (index > xcda_array_size) { + throw std::runtime_error("Index out of array bounds in XCDA array!"); + } + return xcda_array[index]; + } + const uint32_t * xcda_array; + size_t xcda_array_size; + }; + + struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) { + if (input_offset == input.size()) { + return { &input[input_offset], 0, 0 }; + } + + // if input prefix matches some user-defined token return this token as normalization result + auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); + if (user_defined_token_match.second > 0) { + return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second }; + } + + size_t longest_prefix_length = 0; + size_t longest_prefix_offset = 0; + + if (xcda_array_size > 0) { + struct xcda_array_view xcda_view(xcda_array, xcda_array_size); + + // Find the longest normalized sequence matching the input prefix by walking + // the XOR-compressed compact
double array (XCDA) starting from the root node + // We find the index of the next node by calculating BASE[s] ^ c where s is + // the index of the previous node and c is a numerical character value + uint32_t node_index = 0; + // get BASE of the root node + node_index = xcda_view.get_base(node_index); + for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) { + unsigned char c = input[prefix_offset]; + if (c == 0) { + break; + } + node_index ^= c; + // if value of LCHECK is not c it means that this is not a child of + // the previous node, so we stop matching + if (xcda_view.get_lcheck(node_index) != c) { + break; + } + bool is_leaf = xcda_view.get_leaf(node_index); + // get BASE of the current node + node_index ^= xcda_view.get_base(node_index); + // if LEAF of the current node is true, it means that its BASE points to the node + // containing index of replacement sequence for currently matched input prefix + if (is_leaf) + { + longest_prefix_length = prefix_offset - input_offset + 1; + // get index of replacement sequence for currently matched input prefix + longest_prefix_offset = xcda_view.get_value(node_index); + } + } + } + + if (longest_prefix_length > 0) { + // we have a match, so return the replacement sequence + if (longest_prefix_offset >= prefix_replacements_size) { + throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); + } + const char * prefix_replacement = &prefix_replacements[longest_prefix_offset]; + return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length }; + } else { + // check if the input prefix contains a valid sequence of UTF-8 code units + try { + // if yes, return this sequence unmodified + size_t prefix_offset = input_offset; + unicode_cpt_from_utf8(input, prefix_offset); + return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset }; + } catch(std::invalid_argument & ex) { + // if no, consume 1 byte and return U+FFFD - 
REPLACEMENT CHARACTER + return { "\xEF\xBF\xBD", 3, 1 }; + } + } + } + + // escaped space symbol - U+2581 (Lower One Eighth Block) + const std::string escaped_space = "\xE2\x96\x81"; + + const char * prefix_replacements = NULL; + size_t prefix_replacements_size = 0; + + const uint32_t * xcda_array = NULL; + size_t xcda_array_size = 0; + + struct naive_trie user_defined_token_matcher; + + // this structure stores the best tokenization so far at input_offset + struct best_tokenization { + llama_token token_id; + size_t input_offset; + float score_sum; + }; + + float min_score = FLT_MAX; + float max_score = -FLT_MAX; + + float unknown_token_score_penalty = 10.0; + float unknown_token_score; + + struct naive_trie token_matcher; +}; + + typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT @@ -14086,6 +14592,39 @@ static std::vector llama_tokenize_internal(const llama_vocab & output.push_back(vocab.special_sep_id); } } break; + case LLAMA_VOCAB_TYPE_UGM: + { + llm_tokenizer_ugm tokenizer(vocab); + + if (add_special && vocab.tokenizer_add_bos != 0) { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + } + + for (const auto & fragment : fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); +#ifdef PRETOKENIZERDEBUG + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + tokenizer.tokenize(raw_text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + output.push_back(fragment.token); + } + } + + if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) { + LLAMA_LOG_WARN( + "%s: Added a BOS token to the prompt as specified by the model but the prompt " + "also starts with a BOS token. 
So now the final prompt starts with 2 BOS tokens. " + "Are you sure this is what you want?\n", __FUNCTION__); + } + + if (add_special && vocab.tokenizer_add_eos == 1) { + GGML_ASSERT(vocab.special_eos_id != -1); + output.push_back(vocab.special_eos_id); + } + } break; case LLAMA_VOCAB_TYPE_NONE: GGML_ASSERT(false); } @@ -16964,6 +17503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_BLOOM: case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_T5: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -18659,6 +19199,10 @@ llama_token llama_token_eot(const struct llama_model * model) { return model->vocab.special_eot_id; } +llama_token llama_token_pad(const struct llama_model * model) { + return model->vocab.special_pad_id; +} + int32_t llama_tokenize( const struct llama_model * model, const char * text, @@ -18725,7 +19269,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token if (0 <= token && token < llama_n_vocab(model)) { switch (llama_vocab_get_type(model->vocab)) { case LLAMA_VOCAB_TYPE_WPM: - case LLAMA_VOCAB_TYPE_SPM: { + case LLAMA_VOCAB_TYPE_SPM: + case LLAMA_VOCAB_TYPE_UGM: { // NOTE: we accept all unsupported token types, // suppressing them like CONTROL tokens. 
if (llama_is_normal_token(model->vocab, token)) { diff --git a/llama.h b/llama.h index 82d15747f..88eecb0ed 100644 --- a/llama.h +++ b/llama.h @@ -67,6 +67,7 @@ extern "C" { LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece + LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram }; // pre-tokenization types @@ -857,6 +858,7 @@ extern "C" { LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding // Returns -1 if unknown, 1 for true or 0 for false. LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model); diff --git a/unicode.cpp b/unicode.cpp index c0b76bf20..8692924b9 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector & cps) { return result; } -static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { +uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { assert(offset < utf8.size()); if (!(utf8[offset + 0] & 0x80)) { auto result = utf8[offset + 0]; diff --git a/unicode.h b/unicode.h index 6c488970a..30b07ba7f 100644 --- a/unicode.h +++ b/unicode.h @@ -48,6 +48,7 @@ struct codepoint_flags { std::string unicode_cpt_to_utf8(uint32_t cp); +uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset); std::vector unicode_cpts_from_utf8(const std::string & utf8); std::vector unicode_cpts_normalize_nfd(const std::vector & cpts); From 163d50adaf8897d8b734d701ff332de6be63d484 Mon Sep 17 00:00:00 2001 From: jukofyork <69222624+jukofyork@users.noreply.github.com> 
Date: Tue, 25 Jun 2024 21:47:40 +0100 Subject: [PATCH 11/50] fixes #7999 (adds control vectors to all `build_XXX()` functions in `llama.cpp` [needs testing] (#8060) * fixes #7999 The `build_command_r` forgot to add the control vector. * Fixes qwen2 too * Fixed all models' control vectors * Removed double calls to `cb(cur, "l_out", il)` * Moved control vector logic to llama_control_vector:apply_to() --- llama.cpp | 112 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 39 deletions(-) diff --git a/llama.cpp b/llama.cpp index 78a21008f..989c73149 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2368,13 +2368,21 @@ struct llama_control_vector { int32_t layer_start = -1; int32_t layer_end = -1; - ggml_tensor * tensor_for(int il) const { + struct ggml_tensor * tensor_for(int il) const { if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { return nullptr; } return tensors[il]; } + struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { + ggml_tensor * layer_dir = tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx, cur, layer_dir); + } + return cur; + } + ~llama_control_vector() { for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); @@ -8023,10 +8031,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8141,6 +8146,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8245,6 +8251,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8360,9 +8367,8 @@ struct llm_build_context { 
} cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - cur = ggml_add(ctx0, cur, inpL); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8514,10 +8520,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8648,10 +8651,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8757,8 +8757,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } cur = llm_build_norm(ctx0, inpL, hparams, @@ -8846,6 +8850,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9141,8 +9146,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } cur = llm_build_norm(ctx0, inpL, hparams, @@ -9276,6 +9285,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9424,6 +9434,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, 
"l_out", il); // input for next layer @@ -9536,6 +9547,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9647,6 +9659,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9792,6 +9805,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9912,11 +9926,11 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_output); - cb(cur, "l_out", il); - cur = ggml_add(ctx0, cur, inpL); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); + // input for next layer inpL = cur; } @@ -10048,8 +10062,10 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); + // input for next layer inpL = cur; } @@ -10148,9 +10164,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cb(cur, "l_out", il); - cur = ggml_add(ctx0, cur, inpL); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10256,8 +10271,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } cur = llm_build_norm(ctx0, inpL, hparams, @@ -10363,8 +10382,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } cur = llm_build_norm(ctx0, inpL, hparams, @@ -10476,6 +10499,7 @@ struct llm_build_context { 
cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10593,6 +10617,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10734,6 +10759,7 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", -1); cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10846,6 +10872,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10962,7 +10989,9 @@ struct llm_build_context { NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -11111,6 +11140,7 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -11252,6 +11282,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -11387,10 +11418,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -11504,8 +11532,12 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cb(cur, "ffn_out", il); - inpL = ggml_add(ctx0, cur, attn_out); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, attn_out); + cur = lctx.cvec.apply_to(ctx0, 
cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } else { // attention and ffn are computed sequentially // x = x + attn(ln1(x)) @@ -11528,8 +11560,12 @@ struct llm_build_context { LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } } @@ -11656,10 +11692,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -11892,6 +11925,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer From 6777c544bdd8c5d9de3220d6e2557957bbbf2a4f Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Wed, 26 Jun 2024 01:45:58 +0100 Subject: [PATCH 12/50] `json`: fix additionalProperties, allow space after enum/const (#7840) * json: default additionalProperty to true * json: don't force additional props after normal properties! * json: allow space after enum/const * json: update pydantic example to set additionalProperties: false * json: prevent additional props to redefine a typed prop * port not_strings to python, add trailing space * fix not_strings & port to js+py * Update json-schema-to-grammar.cpp * fix _not_strings for substring overlaps * json: fix additionalProperties default, uncomment tests * json: add integ. 
test case for additionalProperties * json: nit: simplify condition * reformat grammar integ tests w/ R"""()""" strings where there's escapes * update # tokens in server test: consts can now have trailing space --- common/json-schema-to-grammar.cpp | 99 +++++- examples/json-schema-pydantic-example.py | 6 +- examples/json_schema_to_grammar.py | 76 ++++- .../server/public/json-schema-to-grammar.mjs | 89 ++++- examples/server/tests/features/server.feature | 2 +- tests/test-grammar-integration.cpp | 320 ++++++++---------- tests/test-json-schema-to-grammar.cpp | 150 ++++++-- 7 files changed, 497 insertions(+), 245 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 07d0e952d..b40821dad 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -614,6 +614,75 @@ private: return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); } + /* + Returns a rule that matches a JSON string that is none of the provided strings + + not_strings({"a"}) + -> ["] ( [a] char+ | [^"a] char* )? ["] space + not_strings({"and", "also"}) + -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? 
["] space + */ + std::string _not_strings(const std::vector & strings) { + + struct TrieNode { + std::map children; + bool is_end_of_string; + + TrieNode() : is_end_of_string(false) {} + + void insert(const std::string & string) { + auto node = this; + for (char c : string) { + node = &node->children[c]; + } + node->is_end_of_string = true; + } + }; + + TrieNode trie; + for (const auto & s : strings) { + trie.insert(s); + } + + std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); + std::ostringstream out; + out << "[\"] ( "; + std::function visit = [&](const TrieNode & node) { + std::ostringstream rejects; + auto first = true; + for (const auto & kv : node.children) { + rejects << kv.first; + if (first) { + first = false; + } else { + out << " | "; + } + out << "[" << kv.first << "]"; + if (!kv.second.children.empty()) { + out << " ("; + visit(kv.second); + out << ")"; + } else if (kv.second.is_end_of_string) { + out << " " << char_rule << "+"; + } + } + if (!node.children.empty()) { + if (!first) { + out << " | "; + } + out << "[^\"" << rejects.str() << "] " << char_rule << "*"; + } + }; + visit(trie); + + out << " )"; + if (!trie.is_end_of_string) { + out << "?"; + } + out << " [\"] space"; + return out.str(); + } + std::string _resolve_ref(const std::string & ref) { std::string ref_name = ref.substr(ref.find_last_of('/') + 1); if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) { @@ -634,6 +703,7 @@ private: std::vector required_props; std::vector optional_props; std::unordered_map prop_kv_rule_names; + std::vector prop_names; for (const auto & kv : properties) { const auto &prop_name = kv.first; const auto &prop_schema = kv.second; @@ -648,11 +718,18 @@ private: } else { optional_props.push_back(prop_name); } + prop_names.push_back(prop_name); } - if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { + if 
(!(additional_properties.is_boolean() && !additional_properties.get())) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; - std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); - std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); + std::string value_rule = + additional_properties.is_object() ? visit(additional_properties, sub_name + "-value") + : _add_primitive("value", PRIMITIVE_RULES.at("value")); + + auto key_rule = + prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string")) + : _add_rule(sub_name + "-k", _not_strings(prop_names)); + std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule); prop_kv_rule_names["*"] = kv_rule; optional_props.push_back("*"); } @@ -678,15 +755,11 @@ private: } std::string k = ks[0]; std::string kv_rule_name = prop_kv_rule_names[k]; - if (k == "*") { - res = _add_rule( - name + (name.empty() ? "" : "-") + "additional-kvs", - kv_rule_name + " ( \",\" space " + kv_rule_name + " )*" - ); - } else if (first_is_optional) { - res = "( \",\" space " + kv_rule_name + " )?"; + std::string comma_ref = "( \",\" space " + kv_rule_name + " )"; + if (first_is_optional) { + res = comma_ref + (k == "*" ? "*" : "?"); } else { - res = kv_rule_name; + res = kv_rule_name + (k == "*" ? 
" " + comma_ref + "*" : ""); } if (ks.size() > 1) { res += " " + _add_rule( @@ -824,13 +897,13 @@ public: } return _add_rule(rule_name, _generate_union_rule(name, schema_types)); } else if (schema.contains("const")) { - return _add_rule(rule_name, _generate_constant_rule(schema["const"])); + return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space"); } else if (schema.contains("enum")) { std::vector enum_values; for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); + return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space"); } else if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py index 2240188cd..2a24f8118 100644 --- a/examples/json-schema-pydantic-example.py +++ b/examples/json-schema-pydantic-example.py @@ -3,7 +3,7 @@ #! pip install pydantic #! 
python json-schema-pydantic-example.py -from pydantic import BaseModel, TypeAdapter +from pydantic import BaseModel, Extra, TypeAdapter from annotated_types import MinLen from typing import Annotated, List, Optional import json, requests @@ -50,12 +50,16 @@ else: if __name__ == '__main__': class QAPair(BaseModel): + class Config: + extra = 'forbid' # triggers additionalProperties: false in the JSON schema question: str concise_answer: str justification: str stars: Annotated[int, Field(ge=1, le=5)] class PyramidalSummary(BaseModel): + class Config: + extra = 'forbid' # triggers additionalProperties: false in the JSON schema title: str summary: str question_answers: Annotated[List[QAPair], MinLen(2)] diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 86500a8c3..3f3132f88 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -4,8 +4,7 @@ import itertools import json import re import sys -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union - +from typing import Any, List, Optional, Set, Tuple, Union def _build_repetition(item_rule, min_items, max_items, separator_rule=None): @@ -276,6 +275,51 @@ class SchemaConverter: return ''.join(('(', *recurse(0), ')')) + def _not_strings(self, strings): + class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_string = False + + def insert(self, string): + node = self + for c in string: + node = node.children.setdefault(c, TrieNode()) + node.is_end_of_string = True + + trie = TrieNode() + for s in strings: + trie.insert(s) + + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + out = ['["] ( '] + + def visit(node): + rejects = [] + first = True + for c in sorted(node.children.keys()): + child = node.children[c] + rejects.append(c) + if first: + first = False + else: + out.append(' | ') + out.append(f'[{c}]') + if child.children: + out.append(f' (') + visit(child) + out.append(')') + elif 
child.is_end_of_string: + out.append(f' {char_rule}+') + if node.children: + if not first: + out.append(' | ') + out.append(f'[^"{"".join(rejects)}] {char_rule}*') + visit(trie) + + out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + return ''.join(out) + def _add_rule(self, name, rule): esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or self._rules[esc_name] == rule: @@ -524,10 +568,10 @@ class SchemaConverter: return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) + return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') elif 'enum' in schema: - rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' return self._add_rule(rule_name, rule) elif schema_type in (None, 'object') and \ @@ -632,7 +676,7 @@ class SchemaConverter: self._add_primitive(dep, dep_rule) return n - def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]): prop_order = self._prop_order # sort by position in prop_order (if specified) then by original order sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))] @@ -647,12 +691,16 @@ class SchemaConverter: required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties == True or isinstance(additional_properties, dict): + if additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' - 
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') + value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ + self._add_primitive('value', PRIMITIVE_RULES['value']) + key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \ + else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props)) + prop_kv_rule_names["*"] = self._add_rule( f'{sub_name}-kv', - self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' + f'{key_rule} ":" space {value_rule}' ) optional_props.append("*") @@ -667,15 +715,11 @@ class SchemaConverter: def get_recursive_refs(ks, first_is_optional): [k, *rest] = ks kv_rule_name = prop_kv_rule_names[k] - if k == '*': - res = self._add_rule( - f'{name}{"-" if name else ""}additional-kvs', - f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*' - ) - elif first_is_optional: - res = f'( "," space {kv_rule_name} )?' 
+ comma_ref = f'( "," space {kv_rule_name} )' + if first_is_optional: + res = comma_ref + ('*' if k == '*' else '?') else: - res = kv_rule_name + res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '') if len(rest) > 0: res += ' ' + self._add_rule( f'{name}{"-" if name else ""}{k}-rest', diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index f340f94bd..02015bbd4 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -532,6 +532,64 @@ export class SchemaConverter { return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") } + _notStrings(strings) { + class TrieNode { + constructor() { + this.children = {}; + this.isEndOfString = false; + } + + insert(str) { + let node = this; + for (const c of str) { + node = node.children[c] = node.children[c] || new TrieNode(); + } + node.isEndOfString = true; + } + } + + const trie = new TrieNode(); + for (const s of strings) { + trie.insert(s); + } + + const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']); + const out = ['["] ( ']; + + const visit = (node) => { + const rejects = []; + let first = true; + for (const c of Object.keys(node.children).sort()) { + const child = node.children[c]; + rejects.push(c); + if (first) { + first = false; + } else { + out.push(' | '); + } + out.push(`[${c}]`); + if (Object.keys(child.children).length > 0) { + out.push(' ('); + visit(child); + out.push(')'); + } else if (child.isEndOfString) { + out.push(` ${charRuleName}+`); + } + } + if (Object.keys(node.children).length > 0) { + if (!first) { + out.push(' | '); + } + out.push(`[^"${rejects.join('')}] ${charRuleName}*`); + } + }; + + visit(trie); + + out.push(` )${trie.isEndOfString ? 
'' : '?'} ["] space`); + return out.join(''); + } + _resolveRef(ref) { let refName = ref.split('/').pop(); if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) { @@ -560,9 +618,9 @@ export class SchemaConverter { } else if (Array.isArray(schemaType)) { return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t })))); } else if ('const' in schema) { - return this._addRule(ruleName, this._generateConstantRule(schema.const)); + return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space'); } else if ('enum' in schema) { - const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | '); + const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space'; return this._addRule(ruleName, rule); } else if ((schemaType === undefined || schemaType === 'object') && ('properties' in schema || @@ -599,7 +657,7 @@ export class SchemaConverter { } } - return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false)); + return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null)); } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) { const items = schema.items ?? schema.prefixItems; if (Array.isArray(items)) { @@ -693,12 +751,19 @@ export class SchemaConverter { const requiredProps = sortedProps.filter(k => required.has(k)); const optionalProps = sortedProps.filter(k => !required.has(k)); - if (typeof additionalProperties === 'object' || additionalProperties === true) { + if (additionalProperties !== false) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; - const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`); + const valueRule = + additionalProperties != null && typeof additionalProperties === 'object' ? 
this.visit(additionalProperties, `${subName}-value`) + : this._addPrimitive('value', PRIMITIVE_RULES['value']); + + const key_rule = + sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string']) + : this._addRule(`${subName}-k`, this._notStrings(sortedProps)); + propKvRuleNames['*'] = this._addRule( `${subName}-kv`, - `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`); + `${key_rule} ":" space ${valueRule}`); optionalProps.push('*'); } @@ -715,15 +780,11 @@ export class SchemaConverter { const [k, ...rest] = ks; const kvRuleName = propKvRuleNames[k]; let res; - if (k === '*') { - res = this._addRule( - `${name ?? ''}${name ? '-' : ''}additional-kvs`, - `${kvRuleName} ( "," space ` + kvRuleName + ` )*` - ) - } else if (firstIsOptional) { - res = `( "," space ${kvRuleName} )?`; + const commaRef = `( "," space ${kvRuleName} )`; + if (firstIsOptional) { + res = commaRef + (k === '*' ? '*' : '?'); } else { - res = kvRuleName; + res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : ''); } if (rest.length > 0) { res += ' ' + this._addRule( diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index d21c09135..b55971454 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -82,7 +82,7 @@ Feature: llama.cpp server Examples: Prompts | response_format | n_predicted | re_content | - | {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" | + | {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" | | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] | | {"type": "json_object"} | 10 | \{ " Jacky. 
| diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 5750d362a..23ef8324c 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -15,8 +15,6 @@ using json = nlohmann::ordered_json; -//#define INCLUDE_FAILING_TESTS 1 - static llama_grammar* build_grammar(const std::string & grammar_str) { auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); @@ -754,7 +752,7 @@ static void test_json_schema() { )""", // Passing strings { - "{}", + R"""({})""", R"""({"foo": "bar"})""", }, // Failing strings @@ -762,7 +760,7 @@ static void test_json_schema() { "", "[]", "null", - "\"\"", + R"""("")""", "true", } ); @@ -770,16 +768,14 @@ static void test_json_schema() { test_schema( "exotic formats (list)", // Schema - R"""( - { + R"""({ "items": [ { "format": "date" }, { "format": "uuid" }, { "format": "time" }, { "format": "date-time" } ] - } - )""", + })""", // Passing strings { // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? 
@@ -798,125 +794,113 @@ static void test_json_schema() { test_schema( "string", // Schema - R"""( - { - "type": "string" - } - )""", + R"""({ + "type": "string" + })""", // Passing strings { - "\"foo\"", - "\"bar\"", - "\"\"", + R"""("foo")""", + R"""("bar")""", + R"""("")""", }, // Failing strings { - "{}", - "\"foo\": \"bar\"", + R"""({})""", + R"""("foo": "bar")""", } ); test_schema( "string w/ min length 1", // Schema - R"""( - { - "type": "string", - "minLength": 1 - } - )""", + R"""({ + "type": "string", + "minLength": 1 + })""", // Passing strings { - "\"foo\"", - "\"bar\"", + R"""("foo")""", + R"""("bar")""", }, // Failing strings { - "\"\"", - "{}", - "\"foo\": \"bar\"", + R"""("")""", + R"""({})""", + R"""("foo": "bar")""", } ); test_schema( "string w/ min length 3", // Schema - R"""( - { + R"""({ "type": "string", "minLength": 3 - } - )""", + })""", // Passing strings { - "\"foo\"", - "\"bar\"", - "\"foobar\"", + R"""("foo")""", + R"""("bar")""", + R"""("foobar")""", }, // Failing strings { - "\"\"", - "\"f\"", - "\"fo\"", + R"""("")""", + R"""("f")""", + R"""("fo")""", } ); test_schema( "string w/ max length", // Schema - R"""( - { - "type": "string", - "maxLength": 3 - } - )""", + R"""({ + "type": "string", + "maxLength": 3 + })""", // Passing strings { - "\"foo\"", - "\"bar\"", - "\"\"", - "\"f\"", - "\"fo\"", + R"""("foo")""", + R"""("bar")""", + R"""("")""", + R"""("f")""", + R"""("fo")""", }, // Failing strings { - "\"foobar\"", + R"""("foobar")""", } ); test_schema( "string w/ min & max length", // Schema - R"""( - { - "type": "string", - "minLength": 1, - "maxLength": 4 - } - )""", + R"""({ + "type": "string", + "minLength": 1, + "maxLength": 4 + })""", // Passing strings { - "\"foo\"", - "\"bar\"", - "\"f\"", - "\"barf\"", + R"""("foo")""", + R"""("bar")""", + R"""("f")""", + R"""("barf")""", }, // Failing strings { - "\"\"", - "\"barfo\"", - "\"foobar\"", + R"""("")""", + R"""("barfo")""", + R"""("foobar")""", } ); test_schema( "boolean", // 
Schema - R"""( - { - "type": "boolean" - } - )""", + R"""({ + "type": "boolean" + })""", // Passing strings { "true", @@ -924,122 +908,112 @@ static void test_json_schema() { }, // Failing strings { - "\"\"", - "\"true\"", - "True", - "FALSE", + R"""("")""", + R"""("true")""", + R"""(True)""", + R"""(FALSE)""", } ); test_schema( "integer", // Schema - R"""( - { - "type": "integer" - } - )""", + R"""({ + "type": "integer" + })""", // Passing strings { - "0", - "12345", - "1234567890123456" + R"""(0)""", + R"""(12345)""", + R"""(1234567890123456)""", }, // Failing strings { - "", - "01", - "007", - "12345678901234567" + R"""()""", + R"""(01)""", + R"""(007)""", + R"""(12345678901234567 )""", } ); test_schema( "string const", // Schema - R"""( - { - "const": "foo" - } - )""", + R"""({ + "const": "foo" + })""", // Passing strings { - "\"foo\"", + R"""("foo")""", }, // Failing strings { - "foo", - "\"bar\"", + R"""(foo)""", + R"""("bar")""", } ); test_schema( "non-string const", // Schema - R"""( - { - "const": true - } - )""", + R"""({ + "const": true + })""", // Passing strings { - "true", + R"""(true)""", }, // Failing strings { - "", - "foo", - "\"true\"", + R"""()""", + R"""(foo)""", + R"""("true")""", } ); test_schema( "non-string const", // Schema - R"""( - { - "enum": ["red", "amber", "green", null, 42, ["foo"]] - } - )""", + R"""({ + "enum": ["red", "amber", "green", null, 42, ["foo"]] + })""", // Passing strings { - "\"red\"", - "null", - "42", - "[\"foo\"]", + R"""("red")""", + R"""(null)""", + R"""(42)""", + R"""(["foo"])""", }, // Failing strings { - "", - "420", - "true", - "foo", + R"""()""", + R"""(420)""", + R"""(true)""", + R"""(foo)""", } ); test_schema( "min+max items", // Schema - R"""( - { - "items": { - "type": ["number", "integer"] - }, - "minItems": 3, - "maxItems": 5 - } - )""", + R"""({ + "items": { + "type": ["number", "integer"] + }, + "minItems": 3, + "maxItems": 5 + })""", // Passing strings { - "[1, 2, 3]", - "[1, 2, 3, 4]", - "[1, 2, 3, 
4, 5]", + R"""([1, 2, 3])""", + R"""([1, 2, 3, 4])""", + R"""([1, 2, 3, 4, 5])""", }, // Failing strings { - "[1, 2]", - "[1, 2, 3, 4, 5, 6]", - "1" + R"""([1, 2])""", + R"""([1, 2, 3, 4, 5, 6])""", + R"""(1)""", } ); @@ -1047,16 +1021,14 @@ static void test_json_schema() { test_schema( "object properties", // Schema - R"""( - { + R"""({ "type": "object", "properties": { "number": { "type": "number" }, "street_name": { "type": "string" }, "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } } - } - )""", + })""", // Passing strings { R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", @@ -1066,12 +1038,8 @@ static void test_json_schema() { // "By extension, even an empty object is valid" R"""({})""", // "By default, providing additional properties is valid" -#ifdef INCLUDE_FAILING_TESTS - // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default. R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", - // TODO: Spaces should be permitted around enum values, but currently they fail to pass. 
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", -#endif }, // Failing strings { @@ -1084,13 +1052,35 @@ static void test_json_schema() { } ); + test_schema( + "additional properties can't override other properties", + R"""({ + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"} + }, + "additionalProperties": true + })""", + // Passing strings + { + R"""({"a": 42})""", + R"""({"c": ""})""", + R"""({"a": 42, "c": ""})""", + R"""({"a_": ""})""", + }, + // Failing strings + { + R"""()""", + R"""({"a": ""})""", + R"""({"a": "", "b": ""})""", + } + ); // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) test_schema( "object properties, additionalProperties: true", // Schema - R"""( - { + R"""({ "type": "object", "properties": { "number": { "type": "number" }, @@ -1098,26 +1088,18 @@ static void test_json_schema() { "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } }, "additionalProperties": true - } - )""", + })""", // Passing strings { // "By extension, even an empty object is valid" R"""({})""", -#ifdef INCLUDE_FAILING_TESTS - // TODO: Following line should pass and doesn't R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""", // "By default, leaving out properties is valid" - // TODO: Following line should pass and doesn't R"""({ "street_name": "Pennsylvania" })""", - // TODO: Following line should pass and doesn't R"""({ "number": 1600, "street_name": "Pennsylvania" })""", // "By default, providing additional properties is valid" - // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default. R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", - // TODO: Spaces should be permitted around enum values, but currently they fail to pass. 
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", -#endif }, // Failing strings { @@ -1132,8 +1114,7 @@ static void test_json_schema() { test_schema( "required + optional props each in original order", // Schema - R"""( - { + R"""({ "type": "object", "properties": { "number": { "type": "number" }, @@ -1141,18 +1122,15 @@ static void test_json_schema() { "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } }, "additionalProperties": false - } - )""", + })""", // Passing strings { R"""({ "street_name": "Pennsylvania" })""", R"""({ "number": 1600, "street_type":"Avenue"})""", R"""({ "number": 1600, "street_name": "Pennsylvania" })""", R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", -#ifdef INCLUDE_FAILING_TESTS - // TODO: Spaces should be permitted around enum values, but currently they fail to pass. + // Spaces are permitted around enum values R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", -#endif }, // Failing strings { @@ -1166,18 +1144,16 @@ static void test_json_schema() { test_schema( "required + optional props each in original order", // Schema - R"""( - { - "properties": { - "b": {"type": "string"}, - "a": {"type": "string"}, - "d": {"type": "string"}, - "c": {"type": "string"} - }, - "required": ["a", "b"], - "additionalProperties": false - } - )""", + R"""({ + "properties": { + "b": {"type": "string"}, + "a": {"type": "string"}, + "d": {"type": "string"}, + "c": {"type": "string"} + }, + "required": ["a", "b"], + "additionalProperties": false + })""", // Passing strings { R"""({"b": "foo", "a": "bar"})""", @@ -1197,8 +1173,7 @@ static void test_json_schema() { test_schema( "required props", // Schema - R"""( - { + R"""({ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://example.com/product.schema.json", "title": "Product", @@ -1244,8 +1219,7 @@ static void test_json_schema() { } }, "required": [ "productId", 
"productName", "price" ] - } - )""", + })""", // Passing strings { R"""({"productId": 1, "productName": "A green door", "price": 12.50})""", diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 2e591bd71..1e69cb6ef 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -473,7 +473,7 @@ static void test_all(const std::string & lang, std::function Date: Wed, 26 Jun 2024 01:46:35 +0100 Subject: [PATCH 13/50] `json`: better support for "type" unions (e.g. nullable arrays w/ typed items) (#7863) * json: better suport for "type" arrays (e.g. `{"type": ["array", "null"], "items": {"type": "string"}}`) * json: add test for type: [array, null] fix * update tests --- common/json-schema-to-grammar.cpp | 4 ++- examples/json_schema_to_grammar.py | 2 +- .../server/public/json-schema-to-grammar.mjs | 2 +- tests/test-grammar-integration.cpp | 25 +++++++++++++++ tests/test-json-schema-to-grammar.cpp | 32 +++++++++++++++++++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index b40821dad..2f233e2e7 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -893,7 +893,9 @@ public: } else if (schema_type.is_array()) { std::vector schema_types; for (const auto & t : schema_type) { - schema_types.push_back({{"type", t}}); + json schema_copy(schema); + schema_copy["type"] = t; + schema_types.push_back(schema_copy); } return _add_rule(rule_name, _generate_union_rule(name, schema_types)); } else if (schema.contains("const")) { diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 3f3132f88..92f6e3d47 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -565,7 +565,7 @@ class SchemaConverter: return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf'])) elif 
isinstance(schema_type, list): - return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type])) + return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) elif 'const' in schema: return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 02015bbd4..06d76edde 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -616,7 +616,7 @@ export class SchemaConverter { } else if (schema.oneOf || schema.anyOf) { return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf)); } else if (Array.isArray(schemaType)) { - return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t })))); + return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t})))); } else if ('const' in schema) { return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space'); } else if ('enum' in schema) { diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 23ef8324c..0e21dc795 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -993,6 +993,31 @@ static void test_json_schema() { } ); + test_schema( + "", + // Schema + R"""( + { + "type": ["array", "null"], + "items": { "type": "string" } + } + )""", + // Passing strings + { + "null", + "[]", + "[\"123\"]", + "[\"foo\", \"bar\"]", + }, + // Failing strings + { + "", + "[123]", + "\"foo\"", + "[\"foo\", 42]", + } + ); + test_schema( "min+max items", // Schema diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 1e69cb6ef..3aaa11833 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ 
b/tests/test-json-schema-to-grammar.cpp @@ -502,6 +502,38 @@ static void test_all(const std::string & lang, std::function Date: Wed, 26 Jun 2024 14:27:46 +0800 Subject: [PATCH 14/50] llama : extend llm_build_ffn() to support _scale tensors (#8103) --- llama.cpp | 255 +++++++++++++++++++++++++++++------------------------- 1 file changed, 135 insertions(+), 120 deletions(-) diff --git a/llama.cpp b/llama.cpp index 989c73149..f78594a6f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7212,10 +7212,13 @@ static struct ggml_tensor * llm_build_ffn( struct ggml_tensor * cur, struct ggml_tensor * up, struct ggml_tensor * up_b, + struct ggml_tensor * up_s, struct ggml_tensor * gate, struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, struct ggml_tensor * down, struct ggml_tensor * down_b, + struct ggml_tensor * down_s, struct ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, @@ -7229,6 +7232,11 @@ static struct ggml_tensor * llm_build_ffn( cb(tmp, "ffn_up_b", il); } + if (up_s) { + tmp = ggml_mul(ctx, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + if (gate) { switch (type_gate) { case LLM_FFN_SEQ: @@ -7247,6 +7255,12 @@ static struct ggml_tensor * llm_build_ffn( cur = ggml_add(ctx, cur, gate_b); cb(cur, "ffn_gate_b", il); } + + if (gate_s) { + cur = ggml_mul(ctx, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + } else { cur = tmp; } @@ -7286,7 +7300,10 @@ static struct ggml_tensor * llm_build_ffn( cb(cur, "ffn_gate_par", il); } - cur = ggml_mul_mat(ctx, down, cur); + if (down) { + cur = ggml_mul_mat(ctx, down, cur); + } + if (down_b) { cb(cur, "ffn_down", il); } @@ -7295,6 +7312,11 @@ static struct ggml_tensor * llm_build_ffn( cur = ggml_add(ctx, cur, down_b); } + if (down_s) { + cur = ggml_mul(ctx, cur, down_s); + cb(cur, "ffn_down_s", il); + } + return cur; } @@ -8003,9 +8025,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - 
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -8137,9 +8159,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -8242,9 +8264,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -8358,9 +8380,9 @@ struct llm_build_context { // feed forward { cur = llm_build_ffn(ctx0, attn_norm, // !! 
use the attn norm, not the result - model.layers[il].ffn_up, NULL, - NULL, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -8749,9 +8771,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -8841,9 +8863,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -9026,23 +9048,23 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { cur = llm_build_ffn(ctx0, 
cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); } @@ -9138,9 +9160,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -9276,9 +9298,9 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -9425,9 +9447,9 @@ struct llm_build_context { cur = inpSA; } cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -9538,9 +9560,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -9651,9 +9673,9 @@ 
struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -9788,9 +9810,9 @@ struct llm_build_context { cb(cur_gate, "ffn_shexp_gate", il); ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up_shexp, NULL, - model.layers[il].ffn_gate_shexp, NULL, - model.layers[il].ffn_down_shexp, NULL, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur_ffn, "ffn_shexp", il); @@ -9917,9 +9939,9 @@ struct llm_build_context { // FF { ffn_output = llm_build_ffn(ctx0, attn_norm_output, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(ffn_output, "ffn_out", il); @@ -10155,9 +10177,9 @@ struct llm_build_context { // feed-forward network { cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -10263,9 +10285,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, 
NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -10374,9 +10396,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -10491,9 +10513,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -10609,9 +10631,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -10746,9 +10768,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -10863,9 +10885,9 @@ struct llm_build_context { // feed-forward network { cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + 
model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -10983,9 +11005,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -11271,9 +11293,9 @@ struct llm_build_context { // feed-forward network { cur = llm_build_ffn(ctx0, ffn_inp, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -11408,9 +11430,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -11522,9 +11544,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -11553,9 +11575,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -11662,9 +11684,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -11884,9 +11906,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -11912,9 +11934,9 @@ struct llm_build_context { // FFN shared expert { ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up_shexp, NULL, - model.layers[il].ffn_gate_shexp, NULL, - model.layers[il].ffn_down_shexp, NULL, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(ffn_shexp, "ffn_shexp", il); @@ -12017,7 +12039,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, - nullptr, nullptr, + NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cur = llm_build_norm(ctx0, cur, hparams, @@ -12044,35 +12066,28 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward forward - 
if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); - tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale); - cb(tmp, "ffn_up", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, + NULL, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_sub_out", il); - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale); - cb(cur, "ffn_gate", il); + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_sub_norm", il); - cur = ggml_silu(ctx0, cur); - cb(cur, "ffn_silu", il); + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); + cb(cur, "ffn_down", il); - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "ffn_gate_par", il); - - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_sub_norm", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); - cb(cur, "ffn_down", il); - } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "l_out", il); From c8771ab5f89387cdd7d9a8a69280dac46b45e02f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 26 Jun 2024 08:28:02 +0200 Subject: [PATCH 15/50] CUDA: fix misaligned shared memory read (#8123) --- ggml-cuda/mma.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda/mma.cuh 
b/ggml-cuda/mma.cuh index 0301a52f9..5d87dd8e6 100644 --- a/ggml-cuda/mma.cuh +++ b/ggml-cuda/mma.cuh @@ -23,7 +23,7 @@ struct mma_int_A_I16K4 { __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) { #if defined(INT8_MMA_AVAILABLE) - const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2); + const int * xs = xs0 + (threadIdx.x%I)*stride; asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];" : "+r"(x[0]), "+r"(x[1]) : "l"(xs)); From 88540445615e77a0177fcca43aaa8e9d8eea6864 Mon Sep 17 00:00:00 2001 From: Isaac McFadyen Date: Wed, 26 Jun 2024 02:29:28 -0400 Subject: [PATCH 16/50] Clarify default MMQ for CUDA and LLAMA_CUDA_FORCE_MMQ flag (#8115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add message about int8 support * Add suggestions from review Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a54ee3951..95d970d83 100644 --- a/README.md +++ b/README.md @@ -511,7 +511,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. 
Power of 2 recommended. | - | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). Speed for large batch sizes will be worse but VRAM consumption will be lower. | + | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | | LLAMA_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. 
| From f3f65429c44bb195a9195bfdc19a30a79709db7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 18:33:02 +0300 Subject: [PATCH 17/50] llama : reorganize source code + improve CMake (#8006) * scripts : update sync [no ci] * files : relocate [no ci] * ci : disable kompute build [no ci] * cmake : fixes [no ci] * server : fix mingw build ggml-ci * cmake : minor [no ci] * cmake : link math library [no ci] * cmake : build normal ggml library (not object library) [no ci] * cmake : fix kompute build ggml-ci * make,cmake : fix LLAMA_CUDA + replace GGML_CDEF_PRIVATE ggml-ci * move public backend headers to the public include directory (#8122) * move public backend headers to the public include directory * nix test * spm : fix metal header --------- Co-authored-by: Georgi Gerganov * scripts : fix sync paths [no ci] * scripts : sync ggml-blas.h [no ci] --------- Co-authored-by: slaren --- .devops/nix/package.nix | 24 +- .github/labeler.yml | 28 +- .github/workflows/bench.yml | 2 +- .github/workflows/build.yml | 74 +- .github/workflows/server.yml | 6 +- .gitignore | 1 + .gitmodules | 2 +- CMakeLists.txt | 1366 +---------------- CMakePresets.json | 6 +- Makefile | 1068 ++++++++----- Package.swift | 21 +- README-sycl.md | 24 +- README.md | 62 +- ci/run.sh | 10 +- {scripts => cmake}/build-info.cmake | 0 cmake/git-vars.cmake | 22 + .../llama-config.cmake.in | 32 +- common/CMakeLists.txt | 7 +- .../cmake/build-info-gen-cpp.cmake | 4 +- docs/BLIS.md | 6 +- examples/CMakeLists.txt | 4 +- examples/imatrix/README.md | 2 +- examples/llava/MobileVLM-README.md | 2 +- examples/rpc/README.md | 8 +- examples/server/CMakeLists.txt | 15 +- examples/sycl/build.sh | 4 +- examples/sycl/win-build-sycl.bat | 4 +- ggml/CMakeLists.txt | 238 +++ {cmake => ggml/cmake}/FindSIMD.cmake | 12 +- .../ggml_vk_generate_shaders.py | 0 ggml-alloc.h => ggml/include/ggml-alloc.h | 0 ggml-backend.h => ggml/include/ggml-backend.h | 0 ggml-blas.h => ggml/include/ggml-blas.h | 0 ggml-cuda.h => 
ggml/include/ggml-cuda.h | 0 ggml-kompute.h => ggml/include/ggml-kompute.h | 0 ggml-metal.h => ggml/include/ggml-metal.h | 0 ggml-rpc.h => ggml/include/ggml-rpc.h | 0 ggml-sycl.h => ggml/include/ggml-sycl.h | 4 +- ggml-vulkan.h => ggml/include/ggml-vulkan.h | 0 ggml.h => ggml/include/ggml.h | 0 ggml/src/CMakeLists.txt | 1171 ++++++++++++++ ggml-alloc.c => ggml/src/ggml-alloc.c | 0 .../src/ggml-backend-impl.h | 0 ggml-backend.c => ggml/src/ggml-backend.c | 0 ggml-blas.cpp => ggml/src/ggml-blas.cpp | 0 ggml-common.h => ggml/src/ggml-common.h | 0 ggml-cuda.cu => ggml/src/ggml-cuda.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/acc.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/acc.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/arange.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/arange.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/argsort.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/argsort.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/binbcast.cu | 0 .../src/ggml-cuda}/binbcast.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/clamp.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/clamp.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/common.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/concat.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/concat.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/convert.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/convert.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/cpy.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/cpy.cuh | 0 .../src/ggml-cuda}/dequantize.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/diagmask.cu | 0 .../src/ggml-cuda}/diagmask.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cuh | 0 .../src/ggml-cuda}/fattn-common.cuh | 4 +- .../src/ggml-cuda}/fattn-tile-f16.cu | 0 .../src/ggml-cuda}/fattn-tile-f16.cuh | 0 .../src/ggml-cuda}/fattn-tile-f32.cu | 0 .../src/ggml-cuda}/fattn-tile-f32.cuh | 0 .../src/ggml-cuda}/fattn-vec-f16.cuh | 0 .../src/ggml-cuda}/fattn-vec-f32.cuh | 0 .../src/ggml-cuda}/fattn-wmma-f16.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/fattn.cu | 0 
{ggml-cuda => ggml/src/ggml-cuda}/fattn.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/getrows.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/getrows.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/im2col.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/im2col.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/mma.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/mmq.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/mmq.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/norm.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/norm.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/pad.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/pad.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/quantize.cu | 0 .../src/ggml-cuda}/quantize.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/rope.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/rope.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/scale.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/scale.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/softmax.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/softmax.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cuh | 0 .../fattn-vec-f16-instance-hs128-f16-f16.cu | 0 .../fattn-vec-f16-instance-hs128-f16-q4_0.cu | 0 .../fattn-vec-f16-instance-hs128-f16-q4_1.cu | 0 .../fattn-vec-f16-instance-hs128-f16-q5_0.cu | 0 .../fattn-vec-f16-instance-hs128-f16-q5_1.cu | 0 .../fattn-vec-f16-instance-hs128-f16-q8_0.cu | 0 .../fattn-vec-f16-instance-hs128-q4_0-f16.cu | 0 .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu | 0 .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu | 0 .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu | 0 .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu | 0 .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu | 0 .../fattn-vec-f16-instance-hs128-q4_1-f16.cu | 0 .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu | 0 .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu | 0 
.../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu | 0 .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu | 0 .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu | 0 .../fattn-vec-f16-instance-hs128-q5_0-f16.cu | 0 .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu | 0 .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu | 0 .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu | 0 .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu | 0 .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu | 0 .../fattn-vec-f16-instance-hs128-q5_1-f16.cu | 0 .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu | 0 .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu | 0 .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu | 0 .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu | 0 .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu | 0 .../fattn-vec-f16-instance-hs128-q8_0-f16.cu | 0 .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu | 0 .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu | 0 .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu | 0 .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu | 0 .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu | 0 .../fattn-vec-f16-instance-hs256-f16-f16.cu | 0 .../fattn-vec-f16-instance-hs64-f16-f16.cu | 0 .../fattn-vec-f16-instance-hs64-f16-q4_0.cu | 0 .../fattn-vec-f16-instance-hs64-f16-q4_1.cu | 0 .../fattn-vec-f16-instance-hs64-f16-q5_0.cu | 0 .../fattn-vec-f16-instance-hs64-f16-q5_1.cu | 0 .../fattn-vec-f16-instance-hs64-f16-q8_0.cu | 0 .../fattn-vec-f32-instance-hs128-f16-f16.cu | 0 .../fattn-vec-f32-instance-hs128-f16-q4_0.cu | 0 .../fattn-vec-f32-instance-hs128-f16-q4_1.cu | 0 .../fattn-vec-f32-instance-hs128-f16-q5_0.cu | 0 .../fattn-vec-f32-instance-hs128-f16-q5_1.cu | 0 .../fattn-vec-f32-instance-hs128-f16-q8_0.cu | 0 .../fattn-vec-f32-instance-hs128-q4_0-f16.cu | 0 .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu | 0 .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu | 0 .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu | 0 .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu | 0 .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu | 0 
.../fattn-vec-f32-instance-hs128-q4_1-f16.cu | 0 .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu | 0 .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu | 0 .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu | 0 .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu | 0 .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu | 0 .../fattn-vec-f32-instance-hs128-q5_0-f16.cu | 0 .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu | 0 .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu | 0 .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu | 0 .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu | 0 .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu | 0 .../fattn-vec-f32-instance-hs128-q5_1-f16.cu | 0 .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu | 0 .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu | 0 .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu | 0 .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu | 0 .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu | 0 .../fattn-vec-f32-instance-hs128-q8_0-f16.cu | 0 .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu | 0 .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu | 0 .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu | 0 .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu | 0 .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu | 0 .../fattn-vec-f32-instance-hs256-f16-f16.cu | 0 .../fattn-vec-f32-instance-hs64-f16-f16.cu | 0 .../fattn-vec-f32-instance-hs64-f16-q4_0.cu | 0 .../fattn-vec-f32-instance-hs64-f16-q4_1.cu | 0 .../fattn-vec-f32-instance-hs64-f16-q5_0.cu | 0 .../fattn-vec-f32-instance-hs64-f16-q5_1.cu | 0 .../fattn-vec-f32-instance-hs64-f16-q8_0.cu | 0 .../fattn-wmma-f16-instance-kqfloat-cpb16.cu | 0 .../fattn-wmma-f16-instance-kqfloat-cpb32.cu | 0 .../fattn-wmma-f16-instance-kqhalf-cpb16.cu | 0 .../fattn-wmma-f16-instance-kqhalf-cpb32.cu | 0 .../fattn-wmma-f16-instance-kqhalf-cpb8.cu | 0 .../template-instances/generate_cu_files.py | 0 .../template-instances/mmq-instance-q2_k.cu | 0 .../template-instances/mmq-instance-q3_k.cu | 0 .../template-instances/mmq-instance-q4_0.cu | 0 
.../template-instances/mmq-instance-q4_1.cu | 0 .../template-instances/mmq-instance-q4_k.cu | 0 .../template-instances/mmq-instance-q5_0.cu | 0 .../template-instances/mmq-instance-q5_1.cu | 0 .../template-instances/mmq-instance-q5_k.cu | 0 .../template-instances/mmq-instance-q6_k.cu | 0 .../template-instances/mmq-instance-q8_0.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/unary.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/unary.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/upscale.cu | 0 {ggml-cuda => ggml/src/ggml-cuda}/upscale.cuh | 0 {ggml-cuda => ggml/src/ggml-cuda}/vecdotq.cuh | 0 ggml-impl.h => ggml/src/ggml-impl.h | 0 ggml-kompute.cpp => ggml/src/ggml-kompute.cpp | 0 ggml-metal.m => ggml/src/ggml-metal.m | 0 ggml-metal.metal => ggml/src/ggml-metal.metal | 0 ggml-quants.c => ggml/src/ggml-quants.c | 0 ggml-quants.h => ggml/src/ggml-quants.h | 0 ggml-rpc.cpp => ggml/src/ggml-rpc.cpp | 0 ggml-sycl.cpp => ggml/src/ggml-sycl.cpp | 1 + {ggml-sycl => ggml/src/ggml-sycl}/backend.hpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/common.cpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/common.hpp | 1 + {ggml-sycl => ggml/src/ggml-sycl}/convert.cpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/convert.hpp | 0 .../src/ggml-sycl}/dequantize.hpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/dmmv.cpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/dmmv.hpp | 0 .../src/ggml-sycl}/dpct/helper.hpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/mmq.cpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/mmq.hpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/mmvq.cpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/mmvq.hpp | 0 {ggml-sycl => ggml/src/ggml-sycl}/presets.hpp | 2 - {ggml-sycl => ggml/src/ggml-sycl}/vecdotq.hpp | 0 .../src/ggml-vulkan-shaders.hpp | 0 ggml-vulkan.cpp => ggml/src/ggml-vulkan.cpp | 0 ggml.c => ggml/src/ggml.c | 0 kompute => ggml/src/kompute | 0 .../src/kompute-shaders}/common.comp | 0 .../src/kompute-shaders}/op_add.comp | 0 
.../src/kompute-shaders}/op_addrow.comp | 0 .../src/kompute-shaders}/op_cpy_f16_f16.comp | 0 .../src/kompute-shaders}/op_cpy_f16_f32.comp | 0 .../src/kompute-shaders}/op_cpy_f32_f16.comp | 0 .../src/kompute-shaders}/op_cpy_f32_f32.comp | 0 .../src/kompute-shaders}/op_diagmask.comp | 0 .../src/kompute-shaders}/op_gelu.comp | 0 .../src/kompute-shaders}/op_getrows.comp | 0 .../src/kompute-shaders}/op_getrows_f16.comp | 0 .../src/kompute-shaders}/op_getrows_f32.comp | 0 .../src/kompute-shaders}/op_getrows_q4_0.comp | 0 .../src/kompute-shaders}/op_getrows_q4_1.comp | 0 .../src/kompute-shaders}/op_getrows_q6_k.comp | 0 .../src/kompute-shaders}/op_mul.comp | 0 .../src/kompute-shaders}/op_mul_mat_f16.comp | 0 .../kompute-shaders}/op_mul_mat_mat_f32.comp | 0 .../src/kompute-shaders}/op_mul_mat_q4_0.comp | 0 .../src/kompute-shaders}/op_mul_mat_q4_1.comp | 0 .../src/kompute-shaders}/op_mul_mat_q6_k.comp | 0 .../src/kompute-shaders}/op_mul_mat_q8_0.comp | 0 .../src/kompute-shaders}/op_mul_mv_q_n.comp | 0 .../kompute-shaders}/op_mul_mv_q_n_pre.comp | 0 .../src/kompute-shaders}/op_norm.comp | 0 .../src/kompute-shaders}/op_relu.comp | 0 .../src/kompute-shaders}/op_rmsnorm.comp | 0 .../src/kompute-shaders}/op_rope_f16.comp | 0 .../src/kompute-shaders}/op_rope_f32.comp | 0 .../src/kompute-shaders}/op_scale.comp | 0 .../src/kompute-shaders}/op_scale_8.comp | 0 .../src/kompute-shaders}/op_silu.comp | 0 .../src/kompute-shaders}/op_softmax.comp | 0 .../src/kompute-shaders}/rope_common.comp | 0 sgemm.cpp => ggml/src/sgemm.cpp | 0 sgemm.h => ggml/src/sgemm.h | 0 .../src/vulkan-shaders}/add.comp | 0 .../src/vulkan-shaders}/argsort.comp | 0 .../src/vulkan-shaders}/clamp.comp | 0 .../src/vulkan-shaders}/copy.comp | 0 .../src/vulkan-shaders}/dequant_f32.comp | 0 .../src/vulkan-shaders}/dequant_funcs.comp | 0 .../src/vulkan-shaders}/dequant_head.comp | 0 .../src/vulkan-shaders}/dequant_q2_k.comp | 0 .../src/vulkan-shaders}/dequant_q3_k.comp | 0 .../src/vulkan-shaders}/dequant_q4_0.comp | 0 
.../src/vulkan-shaders}/dequant_q4_1.comp | 0 .../src/vulkan-shaders}/dequant_q4_k.comp | 0 .../src/vulkan-shaders}/dequant_q5_0.comp | 0 .../src/vulkan-shaders}/dequant_q5_1.comp | 0 .../src/vulkan-shaders}/dequant_q5_k.comp | 0 .../src/vulkan-shaders}/dequant_q6_k.comp | 0 .../src/vulkan-shaders}/dequant_q8_0.comp | 0 .../src/vulkan-shaders}/diag_mask_inf.comp | 0 .../src/vulkan-shaders}/div.comp | 0 .../src/vulkan-shaders}/gelu.comp | 0 .../vulkan-shaders}/generic_binary_head.comp | 0 .../src/vulkan-shaders}/generic_head.comp | 0 .../vulkan-shaders}/generic_unary_head.comp | 0 .../src/vulkan-shaders}/get_rows.comp | 0 .../src/vulkan-shaders}/get_rows_quant.comp | 0 .../src/vulkan-shaders}/mul.comp | 0 .../mul_mat_split_k_reduce.comp | 0 .../src/vulkan-shaders}/mul_mat_vec.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_base.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_nc.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_p021.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_q2_k.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_q3_k.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_q4_k.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_q5_k.comp | 0 .../src/vulkan-shaders}/mul_mat_vec_q6_k.comp | 0 .../src/vulkan-shaders}/mul_mm.comp | 0 .../src/vulkan-shaders}/norm.comp | 0 .../src/vulkan-shaders}/relu.comp | 0 .../src/vulkan-shaders}/rms_norm.comp | 0 .../src/vulkan-shaders}/rope_head.comp | 0 .../src/vulkan-shaders}/rope_neox.comp | 0 .../src/vulkan-shaders}/rope_norm.comp | 0 .../src/vulkan-shaders}/scale.comp | 0 .../src/vulkan-shaders}/silu.comp | 0 .../src/vulkan-shaders}/soft_max.comp | 0 .../src/vulkan-shaders}/square.comp | 0 .../src/vulkan-shaders}/sum_rows.comp | 0 .../src/vulkan-shaders}/types.comp | 0 llama.h => include/llama.h | 0 scripts/build-info.sh | 10 +- scripts/compare-commits.sh | 2 +- scripts/debug-test.sh | 2 +- scripts/pod-llama.sh | 16 +- scripts/server-llm.sh | 2 +- scripts/sync-ggml-am.sh | 113 +- scripts/sync-ggml.sh | 68 +- spm-headers/ggml-alloc.h | 2 +- 
spm-headers/ggml-backend.h | 2 +- spm-headers/ggml-metal.h | 1 + spm-headers/ggml.h | 2 +- spm-headers/llama.h | 2 +- src/CMakeLists.txt | 32 + llama.cpp => src/llama.cpp | 0 unicode-data.cpp => src/unicode-data.cpp | 0 unicode-data.h => src/unicode-data.h | 0 unicode.cpp => src/unicode.cpp | 0 unicode.h => src/unicode.h | 0 tests/test-backend-ops.cpp | 1 - 345 files changed, 2555 insertions(+), 1937 deletions(-) rename {scripts => cmake}/build-info.cmake (100%) create mode 100644 cmake/git-vars.cmake rename scripts/LlamaConfig.cmake.in => cmake/llama-config.cmake.in (73%) rename scripts/gen-build-info-cpp.cmake => common/cmake/build-info-gen-cpp.cmake (86%) create mode 100644 ggml/CMakeLists.txt rename {cmake => ggml/cmake}/FindSIMD.cmake (94%) rename ggml_vk_generate_shaders.py => ggml/ggml_vk_generate_shaders.py (100%) rename ggml-alloc.h => ggml/include/ggml-alloc.h (100%) rename ggml-backend.h => ggml/include/ggml-backend.h (100%) rename ggml-blas.h => ggml/include/ggml-blas.h (100%) rename ggml-cuda.h => ggml/include/ggml-cuda.h (100%) rename ggml-kompute.h => ggml/include/ggml-kompute.h (100%) rename ggml-metal.h => ggml/include/ggml-metal.h (100%) rename ggml-rpc.h => ggml/include/ggml-rpc.h (100%) rename ggml-sycl.h => ggml/include/ggml-sycl.h (95%) rename ggml-vulkan.h => ggml/include/ggml-vulkan.h (100%) rename ggml.h => ggml/include/ggml.h (100%) create mode 100644 ggml/src/CMakeLists.txt rename ggml-alloc.c => ggml/src/ggml-alloc.c (100%) rename ggml-backend-impl.h => ggml/src/ggml-backend-impl.h (100%) rename ggml-backend.c => ggml/src/ggml-backend.c (100%) rename ggml-blas.cpp => ggml/src/ggml-blas.cpp (100%) rename ggml-common.h => ggml/src/ggml-common.h (100%) rename ggml-cuda.cu => ggml/src/ggml-cuda.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/acc.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/acc.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/arange.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/arange.cuh (100%) rename 
{ggml-cuda => ggml/src/ggml-cuda}/argsort.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/argsort.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/binbcast.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/binbcast.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/clamp.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/clamp.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/common.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/concat.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/concat.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/convert.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/convert.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/cpy.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/cpy.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/dequantize.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/diagmask.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/diagmask.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-common.cuh (99%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f16.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f32.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f32.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-vec-f16.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-vec-f32.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-wmma-f16.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/fattn.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/getrows.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/getrows.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/im2col.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/im2col.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/mma.cuh (100%) rename 
{ggml-cuda => ggml/src/ggml-cuda}/mmq.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/mmq.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/norm.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/norm.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/pad.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/pad.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/quantize.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/quantize.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/rope.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/rope.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/scale.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/scale.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/softmax.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/softmax.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu (100%) rename {ggml-cuda => 
ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu (100%) rename {ggml-cuda => 
ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu (100%) rename {ggml-cuda => 
ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu (100%) rename {ggml-cuda => 
ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu (100%) rename {ggml-cuda => 
ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/generate_cu_files.py (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q2_k.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q3_k.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q4_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q4_1.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q4_k.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q5_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q5_1.cu (100%) rename {ggml-cuda => 
ggml/src/ggml-cuda}/template-instances/mmq-instance-q5_k.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q6_k.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q8_0.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/unary.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/unary.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/upscale.cu (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/upscale.cuh (100%) rename {ggml-cuda => ggml/src/ggml-cuda}/vecdotq.cuh (100%) rename ggml-impl.h => ggml/src/ggml-impl.h (100%) rename ggml-kompute.cpp => ggml/src/ggml-kompute.cpp (100%) rename ggml-metal.m => ggml/src/ggml-metal.m (100%) rename ggml-metal.metal => ggml/src/ggml-metal.metal (100%) rename ggml-quants.c => ggml/src/ggml-quants.c (100%) rename ggml-quants.h => ggml/src/ggml-quants.h (100%) rename ggml-rpc.cpp => ggml/src/ggml-rpc.cpp (100%) rename ggml-sycl.cpp => ggml/src/ggml-sycl.cpp (99%) rename {ggml-sycl => ggml/src/ggml-sycl}/backend.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/common.cpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/common.hpp (99%) rename {ggml-sycl => ggml/src/ggml-sycl}/convert.cpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/convert.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/dequantize.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/dmmv.cpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/dmmv.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/dpct/helper.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/mmq.cpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/mmq.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/mmvq.cpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/mmvq.hpp (100%) rename {ggml-sycl => ggml/src/ggml-sycl}/presets.hpp (96%) rename {ggml-sycl => ggml/src/ggml-sycl}/vecdotq.hpp (100%) rename 
ggml-vulkan-shaders.hpp => ggml/src/ggml-vulkan-shaders.hpp (100%) rename ggml-vulkan.cpp => ggml/src/ggml-vulkan.cpp (100%) rename ggml.c => ggml/src/ggml.c (100%) rename kompute => ggml/src/kompute (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/common.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_add.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_addrow.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f16_f16.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f16_f32.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f32_f16.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f32_f32.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_diagmask.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_gelu.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_f16.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_f32.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_q4_0.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_q4_1.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_q6_k.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_f16.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_mat_f32.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q4_0.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q4_1.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q6_k.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q8_0.comp (100%) rename {kompute-shaders => 
ggml/src/kompute-shaders}/op_mul_mv_q_n.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mv_q_n_pre.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_norm.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_relu.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_rmsnorm.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_rope_f16.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_rope_f32.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_scale.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_scale_8.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_silu.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/op_softmax.comp (100%) rename {kompute-shaders => ggml/src/kompute-shaders}/rope_common.comp (100%) rename sgemm.cpp => ggml/src/sgemm.cpp (100%) rename sgemm.h => ggml/src/sgemm.h (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/add.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/argsort.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/clamp.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/copy.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_f32.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_funcs.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_head.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q2_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q3_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q4_0.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q4_1.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q4_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q5_0.comp (100%) rename {vulkan-shaders => 
ggml/src/vulkan-shaders}/dequant_q5_1.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q5_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q6_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q8_0.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/diag_mask_inf.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/div.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/gelu.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/generic_binary_head.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/generic_head.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/generic_unary_head.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/get_rows.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/get_rows_quant.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_split_k_reduce.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_base.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_nc.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_p021.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q2_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q3_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q4_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q5_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q6_k.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mm.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/norm.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/relu.comp (100%) rename {vulkan-shaders => 
ggml/src/vulkan-shaders}/rms_norm.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/rope_head.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/rope_neox.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/rope_norm.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/scale.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/silu.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/soft_max.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/square.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/sum_rows.comp (100%) rename {vulkan-shaders => ggml/src/vulkan-shaders}/types.comp (100%) rename llama.h => include/llama.h (100%) create mode 120000 spm-headers/ggml-metal.h create mode 100644 src/CMakeLists.txt rename llama.cpp => src/llama.cpp (100%) rename unicode-data.cpp => src/unicode-data.cpp (100%) rename unicode-data.h => src/unicode-data.h (100%) rename unicode.cpp => src/unicode.cpp (100%) rename unicode.h => src/unicode.h (100%) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 87bb3a20f..4ee0d62cb 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -160,9 +160,9 @@ effectiveStdenv.mkDerivation ( }; postPatch = '' - substituteInPlace ./ggml-metal.m \ + substituteInPlace ./ggml/src/ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./ggml-metal.m \ + substituteInPlace ./ggml/src/ggml-metal.m \ --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; @@ -205,17 +205,17 @@ effectiveStdenv.mkDerivation ( cmakeFlags = [ - (cmakeBool "LLAMA_NATIVE" false) (cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) - (cmakeBool "LLAMA_BLAS" useBlas) - (cmakeBool "LLAMA_CLBLAST" useOpenCL) - (cmakeBool "LLAMA_CUDA" useCuda) - (cmakeBool 
"LLAMA_HIPBLAS" useRocm) - (cmakeBool "LLAMA_METAL" useMetalKit) - (cmakeBool "LLAMA_VULKAN" useVulkan) - (cmakeBool "LLAMA_STATIC" enableStatic) + (cmakeBool "GGML_NATIVE" false) + (cmakeBool "GGML_BLAS" useBlas) + (cmakeBool "GGML_CLBLAST" useOpenCL) + (cmakeBool "GGML_CUDA" useCuda) + (cmakeBool "GGML_HIPBLAS" useRocm) + (cmakeBool "GGML_METAL" useMetalKit) + (cmakeBool "GGML_VULKAN" useVulkan) + (cmakeBool "GGML_STATIC" enableStatic) ] ++ optionals useCuda [ ( @@ -231,7 +231,7 @@ effectiveStdenv.mkDerivation ( ] ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") - (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) + (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) ]; # Environment variables needed for ROCm @@ -244,7 +244,7 @@ effectiveStdenv.mkDerivation ( # if they haven't been added yet. postInstall = '' mkdir -p $out/include - cp $src/llama.h $out/include/ + cp $src/include/llama.h $out/include/ ''; # Define the shells here, but don't add in the inputsFrom to avoid recursion. 
diff --git a/.github/labeler.yml b/.github/labeler.yml index 5c12bab73..9c0397d16 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -2,31 +2,31 @@ Kompute: - changed-files: - any-glob-to-any-file: - - ggml-kompute.h - - ggml-kompute.cpp + - ggml/include/ggml-kompute.h + - ggml/src/ggml-kompute.cpp - README-kompute.md Apple Metal: - changed-files: - any-glob-to-any-file: - - ggml-metal.h - - ggml-metal.cpp + - ggml/include/ggml-metal.h + - ggml/src/ggml-metal.cpp - README-metal.md SYCL: - changed-files: - any-glob-to-any-file: - - ggml-sycl.h - - ggml-sycl.cpp + - ggml/include/ggml-sycl.h + - ggml/src/ggml-sycl.cpp - README-sycl.md Nvidia GPU: - changed-files: - any-glob-to-any-file: - - ggml-cuda.h - - ggml-cuda/** + - ggml/include/ggml-cuda.h + - ggml/src/ggml-cuda/** Vulkan: - changed-files: - any-glob-to-any-file: - - ggml_vk_generate_shaders.py - - ggml-vulkan* + - ggml/ggml_vk_generate_shaders.py + - ggml/src/ggml-vulkan* documentation: - changed-files: - any-glob-to-any-file: @@ -73,10 +73,10 @@ server: ggml: - changed-files: - any-glob-to-any-file: - - ggml.c - - ggml.h - - ggml-*.c - - ggml-*.h + - ggml/include/ggml*.h + - ggml/src/ggml*.c + - ggml/src/ggml*.cpp + - ggml/src/ggml*.h - ggml-cuda/** nix: - changed-files: diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 88ab4844e..eb69b82c4 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -109,7 +109,7 @@ jobs: run: | set -eux cmake -B build \ - -DLLAMA_NATIVE=OFF \ + -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ -DLLAMA_CURL=ON \ -DLLAMA_CUBLAS=ON \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a8fcae043..0d91fc4e4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,7 +47,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. 
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -105,7 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -305,7 +305,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Test @@ -335,7 +335,7 @@ jobs: run: | mkdir build cd build - cmake -DLLAMA_RPC=ON .. + cmake -DGGML_RPC=ON .. cmake --build . --config Release -j $(nproc) - name: Test @@ -363,7 +363,7 @@ jobs: run: | mkdir build cd build - cmake -DLLAMA_VULKAN=ON .. + cmake -DGGML_VULKAN=ON .. cmake --build . --config Release -j $(nproc) ubuntu-22-cmake-hip: @@ -384,13 +384,13 @@ jobs: - name: Build with native CMake HIP support id: cmake_build run: | - cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON + cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON cmake --build build --config Release -j $(nproc) - name: Build with legacy HIP support id: cmake_build_legacy_hip run: | - cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON + cmake -B build2 -S . 
-DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON cmake --build build2 --config Release -j $(nproc) ubuntu-22-cmake-sycl: @@ -431,7 +431,7 @@ jobs: source /opt/intel/oneapi/setvars.sh mkdir build cd build - cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. cmake --build . --config Release -j $(nproc) ubuntu-22-cmake-sycl-fp16: @@ -472,10 +472,10 @@ jobs: source /opt/intel/oneapi/setvars.sh mkdir build cd build - cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON .. + cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. cmake --build . --config Release -j $(nproc) - # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 macOS-latest-make: @@ -497,15 +497,15 @@ jobs: env: LLAMA_FATAL_WARNINGS: 1 run: | - LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) - name: Test id: make_test run: | - LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) - LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) - # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 # would be great if we fix these @@ -529,7 +529,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -559,13 +559,14 @@ jobs: mkdir build cd build cmake -G Xcode .. \ - -DLLAMA_METAL_EMBED_LIBRARY=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO macOS-latest-cmake-tvos: runs-on: macos-latest @@ -588,13 +589,14 @@ jobs: mkdir build cd build cmake -G Xcode .. \ - -DLLAMA_METAL_EMBED_LIBRARY=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO macOS-latest-swift: runs-on: macos-latest @@ -662,7 +664,7 @@ jobs: - name: Build using make w/ OpenBLAS shell: msys2 {0} run: | - make LLAMA_OPENBLAS=1 -j $(nproc) + make GGML_OPENBLAS=1 -j $(nproc) - name: Build using CMake shell: msys2 {0} @@ -678,7 +680,7 @@ jobs: - name: Build using CMake w/ OpenBLAS shell: msys2 {0} run: | - cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS cmake --build build --config ${{ matrix.build }} -j $(nproc) windows-latest-cmake: @@ -693,25 +695,25 @@ jobs: matrix: include: - build: 'rpc-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'noavx-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' 
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -724,7 +726,7 @@ jobs: id: clone_kompute if: ${{ matrix.build == 'kompute-x64' }} run: | - git submodule update --init kompute + git submodule update --init ggml/src/kompute - name: Download OpenBLAS id: get_openblas @@ -854,7 +856,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name @@ -987,7 +989,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON cmake --build build --config Release ios-xcode-build: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 311abf02a..99feb28f2 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -92,12 +92,12 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ - -DLLAMA_NATIVE=OFF \ + -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DLLAMA_OPENMP=OFF ; + -DGGML_OPENMP=OFF ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - name: Build @@ -105,7 +105,7 @@ jobs: if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ - -DLLAMA_NATIVE=OFF \ + -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ diff --git a/.gitignore b/.gitignore index a0c16e880..177e6a8db 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ CMakeSettings.json compile_commands.json ggml-metal-embed.metal llama-batched-swift +/rpc-server out/ tmp/ diff --git a/.gitmodules b/.gitmodules index b7e8b8ff2..5861d59cb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "kompute"] - path = kompute + path = ggml/src/kompute url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 1acf4bb08..18297834e 
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target project("llama.cpp" C CXX) include(CheckIncludeFileCXX) +#set(CMAKE_WARN_DEPRECATED YES) +set(CMAKE_WARN_UNUSED_CLI YES) + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) @@ -9,11 +12,16 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +# Add path to modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLAMA_STANDALONE ON) + include(git-vars) + # configure project version # TODO else() @@ -32,1289 +40,72 @@ else() endif() endif() +option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) # -# Option list +# option list # -if (APPLE) - set(LLAMA_METAL_DEFAULT ON) - set(LLAMA_BLAS_DEFAULT ON) - set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") -else() - set(LLAMA_METAL_DEFAULT OFF) - set(LLAMA_BLAS_DEFAULT OFF) - set(LLAMA_BLAS_VENDOR_DEFAULT "Generic") -endif() - -set(LLAMA_LLAMAFILE_DEFAULT ON) - # general -option(BUILD_SHARED_LIBS "build shared libraries" OFF) -option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" ON) -option(LLAMA_LTO "llama: enable link time optimization" OFF) -option(LLAMA_CCACHE "llama: use ccache if available" ON) +option(LLAMA_CCACHE "llama: use ccache if available" ON) # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) -option(LLAMA_GPROF "llama: enable gprof" OFF) +option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) +option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 
3rd party libs" OFF) # build -option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) +option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) # sanitizers -option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) -option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) -option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) +option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) +option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) +option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) -# instruction set specific -if (LLAMA_NATIVE) - set(INS_ENB OFF) -else() - set(INS_ENB ON) -endif() - -option(LLAMA_SVE "llama: enable SVE" OFF) -option(LLAMA_AVX "llama: enable AVX" ${INS_ENB}) -option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB}) -option(LLAMA_AVX512 "llama: enable AVX512" OFF) -option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) -option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) -option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF) -option(LLAMA_FMA "llama: enable FMA" ${INS_ENB}) -# in MSVC F16C is implied with AVX2/AVX512 -if (NOT MSVC) - option(LLAMA_F16C "llama: enable F16C" ${INS_ENB}) -endif() - -if (WIN32) - set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version") -endif() +# extra artifacts +option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) -set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING - "llama: BLAS library vendor") -option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) -option(LLAMA_CUDA "llama: use CUDA" OFF) -option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" 
OFF) -option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) -option(LLAMA_CUDA_FORCE_MMQ "llama: always use mmq kernels instead of cuBLAS" OFF) -option(LLAMA_CUDA_FORCE_CUBLAS "llama: always use cuBLAS instead of mmq kernels" OFF) -set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") -set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") -option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) -set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") -set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING - "llama: max. batch size for using peer access") -option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) -option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF) -option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF) - -option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) -option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) -option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) -option(LLAMA_VULKAN "llama: use Vulkan" OFF) -option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF) -option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF) -option(LLAMA_VULKAN_MEMORY_DEBUG "llama: enable Vulkan memory debug output" OFF) -option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF) -option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF) -option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) -option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) -option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) -option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) -set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING - "llama: metal minimum macOS version") -set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") -option(LLAMA_KOMPUTE 
"llama: use Kompute" OFF) -option(LLAMA_RPC "llama: use RPC" OFF) -option(LLAMA_OPENMP "llama: use OpenMP" ON) -option(LLAMA_SYCL "llama: use SYCL" OFF) -option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) -set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") -option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF) -set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism") - -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" ON) -option(LLAMA_LASX "llama: enable lasx" ON) -option(LLAMA_LSX "llama: enable lsx" ON) +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) # Required for relocatable CMake package -include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) -# -# Compile flags -# +# override ggml options +set(GGML_CCACHE ${LLAMA_CCACHE}) +set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) +set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) +set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) +set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) +set(GGML_LLAMAFILE ON) -if (LLAMA_SYCL) - set(CMAKE_CXX_STANDARD 17) -else() - set(CMAKE_CXX_STANDARD 11) -endif() - -set(CMAKE_CXX_STANDARD_REQUIRED true) -set(CMAKE_C_STANDARD 11) -set(CMAKE_C_STANDARD_REQUIRED true) -set(THREADS_PREFER_PTHREAD_FLAG ON) - -find_package(Threads REQUIRED) -include(CheckCXXCompilerFlag) - -add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES}) - -# enable libstdc++ assertions for debug builds -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) -endif() - -if (NOT MSVC) - if (LLAMA_SANITIZE_THREAD) - add_compile_options(-fsanitize=thread) - link_libraries 
(-fsanitize=thread) +# transition helpers +function (llama_option_depr TYPE OLD NEW) + if (${OLD}) + message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") + set(${NEW} ON) endif() - - if (LLAMA_SANITIZE_ADDRESS) - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries (-fsanitize=address) - endif() - - if (LLAMA_SANITIZE_UNDEFINED) - add_compile_options(-fsanitize=undefined) - link_libraries (-fsanitize=undefined) - endif() -endif() - -if (APPLE AND LLAMA_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - add_compile_definitions(GGML_USE_ACCELERATE) - add_compile_definitions(ACCELERATE_NEW_LAPACK) - add_compile_definitions(ACCELERATE_LAPACK_ILP64) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) - else() - message(WARNING "Accelerate framework not found") - endif() -endif() - -if (LLAMA_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) - - message(STATUS "Metal framework found") - set(GGML_HEADERS_METAL ggml-metal.h) - set(GGML_SOURCES_METAL ggml-metal.m) - - add_compile_definitions(GGML_USE_METAL) - if (LLAMA_METAL_NDEBUG) - add_compile_definitions(GGML_METAL_NDEBUG) - endif() - - # copy ggml-common.h and ggml-metal.metal to bin directory - configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) - configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) - - if (LLAMA_METAL_EMBED_LIBRARY) - enable_language(ASM) - add_compile_definitions(GGML_METAL_EMBED_LIBRARY) - - set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") - set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") - - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") - - # merge ggml-common.h and ggml-metal.metal into a 
single file - set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") - set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") - - add_custom_command( - OUTPUT ${METALLIB_EMBED_ASM} - COMMAND echo "Embedding Metal library" - COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} - COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} - DEPENDS ggml-metal.metal ggml-common.h - COMMENT "Generate assembly for embedded Metal library" - ) - - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) - else() - if (LLAMA_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. 
it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - else() - set(XC_FLAGS -O3) - endif() - - # Append macOS metal versioning flags - if (LLAMA_METAL_MACOSX_VERSION_MIN) - message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation") - list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN}) - endif() - if (LLAMA_METAL_STD) - message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation") - list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD}) - endif() - - add_custom_command( - OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air - COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal - DEPENDS ggml-metal.metal ggml-common.h - COMMENT "Compiling Metal kernels" - ) - - add_custom_target( - ggml-metal ALL - DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - endif() # LLAMA_METAL_EMBED_LIBRARY - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} - ${FOUNDATION_LIBRARY} - ${METAL_FRAMEWORK} - ${METALKIT_FRAMEWORK} - ) -endif() - -if (LLAMA_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - message(STATUS "OpenMP found") - add_compile_definitions(GGML_USE_OPENMP) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - 
else() - message(WARNING "OpenMP not found") - endif() -endif() - -if (LLAMA_BLAS) - if (LLAMA_STATIC) - set(BLA_STATIC ON) - endif() - #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - # set(BLA_SIZEOF_INTEGER 8) - #endif() - - set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) - find_package(BLAS) - - if (BLAS_FOUND) - message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - - if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. - # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_package(PkgConfig REQUIRED) - if (${LLAMA_BLAS_VENDOR} MATCHES "Generic") - pkg_check_modules(DepBLAS REQUIRED blas) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS") - # As of openblas v0.3.22, the 64-bit is named openblas64.pc - pkg_check_modules(DepBLAS openblas64) - if (NOT DepBLAS_FOUND) - pkg_check_modules(DepBLAS REQUIRED openblas) - endif() - elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME") - pkg_check_modules(DepBLAS REQUIRED blis) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS") - pkg_check_modules(DepBLAS REQUIRED blas-atlas) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS") - pkg_check_modules(DepBLAS REQUIRED flexiblas_api) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") - # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") - # this doesn't provide pkg-config - # suggest to assign BLAS_INCLUDE_DIRS on your own - if ("${NVHPC_VERSION}" STREQUAL "") - message(WARNING "Better to set NVHPC_VERSION") - else() - set(DepBLAS_FOUND ON) - set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") - endif() - endif() - if (DepBLAS_FOUND) - set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" - " detected by pkgconfig, trying to find cblas.h from 
possible paths...") - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - /opt/homebrew/opt/openblas/include - /usr/local/opt/openblas/include - /usr/include/x86_64-linux-gnu/openblas/include - ) - endif() - endif() - - message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - - add_compile_options(${BLAS_LINKER_FLAGS}) - - add_compile_definitions(GGML_USE_BLAS) - - if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) - add_compile_definitions(GGML_BLAS_USE_MKL) - endif() - - set(GGML_HEADERS_BLAS ggml-blas.h) - set(GGML_SOURCES_BLAS ggml-blas.cpp) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS not found, please refer to " - "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" - " to set correct LLAMA_BLAS_VENDOR") - endif() -endif() - -if (LLAMA_LLAMAFILE) - add_compile_definitions(GGML_USE_LLAMAFILE) - - set(GGML_HEADERS_LLAMAFILE sgemm.h) - set(GGML_SOURCES_LLAMAFILE sgemm.cpp) -endif() - -if (LLAMA_CUBLAS) - message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") - set(LLAMA_CUDA ON) -endif() - -if (LLAMA_CUDA) - cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - - find_package(CUDAToolkit) - if (CUDAToolkit_FOUND) - message(STATUS "CUDA found") - - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # 52 == lowest CUDA 12 standard - # 60 == FP16 CUDA intrinsics - # 61 == integer CUDA intrinsics - # 70 == FP16 tensor cores - # 75 == int8 tensor cores - if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") - else() - set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75") - #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work - endif() - endif() - 
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - - enable_language(CUDA) - - set(GGML_HEADERS_CUDA ggml-cuda.h) - - file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") - file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - - add_compile_definitions(GGML_USE_CUDA) - add_compile_definitions(GGML_CUDA_USE_GRAPHS) - if (LLAMA_CUDA_FORCE_DMMV) - add_compile_definitions(GGML_CUDA_FORCE_DMMV) - endif() - if (LLAMA_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_CUDA_FORCE_MMQ) - endif() - if (LLAMA_CUDA_FORCE_CUBLAS) - add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) - endif() - if (LLAMA_CUDA_NO_VMM) - add_compile_definitions(GGML_CUDA_NO_VMM) - endif() - add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - if (DEFINED LLAMA_CUDA_DMMV_Y) - add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility - endif() - if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) - add_compile_definitions(GGML_CUDA_F16) - endif() - add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) - if (LLAMA_CUDA_NO_PEER_COPY) - add_compile_definitions(GGML_CUDA_NO_PEER_COPY) - endif() - if (LLAMA_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) - else() - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND 
GGML_SOURCES_CUDA ${SRCS}) - endif() - - if (LLAMA_STATIC) - if (WIN32) - # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) - else () - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) - endif() - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) - endif() - - if (LLAMA_CUDA_NO_VMM) - # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... - endif() - else() - message(WARNING "CUDA not found") - endif() -endif() - -if (LLAMA_RPC) - add_compile_definitions(GGML_USE_RPC) - - if (WIN32) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32) - endif() - - set(GGML_HEADERS_RPC ggml-rpc.h) - set(GGML_SOURCES_RPC ggml-rpc.cpp) -endif() - -if (LLAMA_VULKAN) - find_package(Vulkan) - if (Vulkan_FOUND) - message(STATUS "Vulkan found") - - set(GGML_HEADERS_VULKAN ggml-vulkan.h) - set(GGML_SOURCES_VULKAN ggml-vulkan.cpp) - - add_compile_definitions(GGML_USE_VULKAN) - - # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build - # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector - if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0) - endif() - - if (LLAMA_VULKAN_CHECK_RESULTS) - add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) - endif() - - if (LLAMA_VULKAN_DEBUG) - add_compile_definitions(GGML_VULKAN_DEBUG) - endif() - - if (LLAMA_VULKAN_MEMORY_DEBUG) - add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG) - endif() - - if (LLAMA_VULKAN_VALIDATE) - add_compile_definitions(GGML_VULKAN_VALIDATE) - endif() - - if (LLAMA_VULKAN_RUN_TESTS) - 
add_compile_definitions(GGML_VULKAN_RUN_TESTS) - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan) - else() - message(WARNING "Vulkan not found") - endif() -endif() - -if (LLAMA_HIPBLAS) - if (NOT EXISTS $ENV{ROCM_PATH}) - if (NOT EXISTS /opt/rocm) - set(ROCM_PATH /usr) - else() - set(ROCM_PATH /opt/rocm) - endif() - else() - set(ROCM_PATH $ENV{ROCM_PATH}) - endif() - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) - list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") - - # CMake on Windows doesn't support the HIP language yet - if(WIN32) - set(CXX_IS_HIPCC TRUE) - else() - string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") - endif() - - if(CXX_IS_HIPCC) - if(LINUX) - if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") - endif() - - message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." - " Prefer setting the HIP compiler directly. See README for details.") - endif() - else() - # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. 
- if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) - endif() - cmake_minimum_required(VERSION 3.21) - enable_language(HIP) - endif() - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - - message(STATUS "HIP and hipBLAS found") - - set(GGML_HEADERS_ROCM ggml-cuda.h) - - file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") - file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) - - if (LLAMA_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) - endif() - - if (LLAMA_CUDA_FORCE_DMMV) - add_compile_definitions(GGML_CUDA_FORCE_DMMV) - endif() - - if (LLAMA_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_CUDA_FORCE_MMQ) - endif() - - if (LLAMA_CUDA_NO_PEER_COPY) - add_compile_definitions(GGML_CUDA_NO_PEER_COPY) - endif() - - if (LLAMA_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) - else() - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - endif() - - add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - - if (CXX_IS_HIPCC) - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device) - else() - 
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) - endif() - - if (LLAMA_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) -endif() - -if (LLAMA_SYCL) - if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") - endif() - - if ( NOT DEFINED ENV{ONEAPI_ROOT}) - message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") - endif() - #todo: AOT - - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) - - message(STATUS "SYCL found") - - add_compile_definitions(GGML_USE_SYCL) - - if (LLAMA_SYCL_F16) - add_compile_definitions(GGML_SYCL_F16) - endif() - - if (LLAMA_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_SYCL_FORCE_MMQ) - endif() - - add_compile_options(-I./) #include DPCT - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") - if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - endif() - - set(GGML_HEADERS_SYCL ggml-sycl.h) - file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp") - list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") - - if (WIN32) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) - else() - add_compile_options(-I/${SYCL_INCLUDE_DIR}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") - if (LLAMA_SYCL_TARGET STREQUAL "INTEL") - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) - elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl) - endif() - endif() -endif() - -if (LLAMA_KOMPUTE) - add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) - 
find_package(Vulkan COMPONENTS glslc REQUIRED) - find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) - if (NOT glslc_executable) - message(FATAL_ERROR "glslc not found") - endif() - - function(compile_shader) - set(options) - set(oneValueArgs) - set(multiValueArgs SOURCES) - cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(source ${compile_shader_SOURCES}) - get_filename_component(filename ${source} NAME) - set(spv_file ${filename}.spv) - add_custom_command( - OUTPUT ${spv_file} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} - COMMENT "Compiling ${source} to ${spv_file}" - ) - - get_filename_component(RAW_FILE_NAME ${spv_file} NAME) - set(FILE_NAME "shader${RAW_FILE_NAME}") - string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) - string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) - string(REPLACE "." 
"_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") - set(OUTPUT_HEADER_FILE "${HEADER_FILE}") - message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - if(CMAKE_GENERATOR MATCHES "Visual Studio") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" - ) - else() - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" - ) - endif() - 
endforeach() - endfunction() - - if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") - message(STATUS "Kompute found") - set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") - add_subdirectory(kompute) - - # Compile our shaders - compile_shader(SOURCES - kompute-shaders/op_scale.comp - kompute-shaders/op_scale_8.comp - kompute-shaders/op_add.comp - kompute-shaders/op_addrow.comp - kompute-shaders/op_mul.comp - kompute-shaders/op_silu.comp - kompute-shaders/op_relu.comp - kompute-shaders/op_gelu.comp - kompute-shaders/op_softmax.comp - kompute-shaders/op_norm.comp - kompute-shaders/op_rmsnorm.comp - kompute-shaders/op_diagmask.comp - kompute-shaders/op_mul_mat_mat_f32.comp - kompute-shaders/op_mul_mat_f16.comp - kompute-shaders/op_mul_mat_q8_0.comp - kompute-shaders/op_mul_mat_q4_0.comp - kompute-shaders/op_mul_mat_q4_1.comp - kompute-shaders/op_mul_mat_q6_k.comp - kompute-shaders/op_getrows_f32.comp - kompute-shaders/op_getrows_f16.comp - kompute-shaders/op_getrows_q4_0.comp - kompute-shaders/op_getrows_q4_1.comp - kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_f16.comp - kompute-shaders/op_rope_f32.comp - kompute-shaders/op_cpy_f16_f16.comp - kompute-shaders/op_cpy_f16_f32.comp - kompute-shaders/op_cpy_f32_f16.comp - kompute-shaders/op_cpy_f32_f32.comp - ) - - # Create a custom target for our generated shaders - add_custom_target(generated_shaders DEPENDS - shaderop_scale.h - shaderop_scale_8.h - shaderop_add.h - shaderop_addrow.h - shaderop_mul.h - shaderop_silu.h - shaderop_relu.h - shaderop_gelu.h - shaderop_softmax.h - shaderop_norm.h - shaderop_rmsnorm.h - shaderop_diagmask.h - shaderop_mul_mat_mat_f32.h - shaderop_mul_mat_f16.h - shaderop_mul_mat_q8_0.h - shaderop_mul_mat_q4_0.h - shaderop_mul_mat_q4_1.h - shaderop_mul_mat_q6_k.h - shaderop_getrows_f32.h - shaderop_getrows_f16.h - shaderop_getrows_q4_0.h - shaderop_getrows_q4_1.h - shaderop_getrows_q6_k.h - shaderop_rope_f16.h - shaderop_rope_f32.h - 
shaderop_cpy_f16_f16.h - shaderop_cpy_f16_f32.h - shaderop_cpy_f32_f16.h - shaderop_cpy_f32_f32.h - ) - - # Create a custom command that depends on the generated_shaders - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - DEPENDS generated_shaders - COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" - ) - - # Add the stamp to the main sources to ensure dependency tracking - set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) - set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) - - add_compile_definitions(GGML_USE_KOMPUTE) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) - else() - message(WARNING "Kompute not found") - endif() -endif() - -if (LLAMA_CPU_HBM) - find_library(memkind memkind REQUIRED) - - add_compile_definitions(GGML_USE_CPU_HBM) - - target_link_libraries(ggml PUBLIC memkind) -endif() - -function(get_flags CCID CCVER) - set(C_FLAGS "") - set(CXX_FLAGS "") - - if (CCID MATCHES "Clang") - set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) - set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) - - if ( - (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR - (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) - ) - list(APPEND C_FLAGS -Wdouble-promotion) - endif() - elseif (CCID STREQUAL "GNU") - set(C_FLAGS -Wdouble-promotion) - set(CXX_FLAGS -Wno-array-bounds) - - if (CCVER VERSION_GREATER_EQUAL 7.1.0) - list(APPEND CXX_FLAGS -Wno-format-truncation) - endif() - if (CCVER VERSION_GREATER_EQUAL 8.1.0) - list(APPEND CXX_FLAGS -Wextra-semi) - endif() - endif() - - set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) - set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) endfunction() -if (LLAMA_FATAL_WARNINGS) - 
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND C_FLAGS -Werror) - list(APPEND CXX_FLAGS -Werror) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - add_compile_options(/WX) - endif() -endif() - -if (LLAMA_ALL_WARNINGS) - if (NOT MSVC) - list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - - list(APPEND C_FLAGS ${WARNING_FLAGS}) - list(APPEND CXX_FLAGS ${WARNING_FLAGS}) - - get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) - - add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" - "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") - else() - # todo : msvc - set(C_FLAGS "") - set(CXX_FLAGS "") - endif() -endif() - -set(CUDA_CXX_FLAGS "") - -if (LLAMA_CUDA) - set(CUDA_FLAGS -use_fast_math) - - if (LLAMA_FATAL_WARNINGS) - list(APPEND CUDA_FLAGS -Werror all-warnings) - endif() - - if (LLAMA_ALL_WARNINGS AND NOT MSVC) - set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) - if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) - endif() - - execute_process( - COMMAND ${NVCC_CMD} -Xcompiler --version - OUTPUT_VARIABLE CUDA_CCFULLVER - ERROR_QUIET - ) - - if (NOT CUDA_CCFULLVER MATCHES clang) - set(CUDA_CCID "GNU") - execute_process( - COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" - OUTPUT_VARIABLE CUDA_CCVER - ERROR_QUIET - ) - else() - if (CUDA_CCFULLVER MATCHES Apple) - set(CUDA_CCID "AppleClang") - else() - set(CUDA_CCID "Clang") - endif() - string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) - endif() - - message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") - - get_flags(${CUDA_CCID} ${CUDA_CCVER}) - list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to 
-Xcompiler later - endif() - - if (NOT MSVC) - list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) - endif() -endif() - -if (WIN32) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) - - if (BUILD_SHARED_LIBS) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - endif() -endif() - -if (LLAMA_LTO) - include(CheckIPOSupported) - check_ipo_supported(RESULT result OUTPUT output) - if (result) - set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) - else() - message(WARNING "IPO is not supported: ${output}") - endif() -endif() - -if (LLAMA_CCACHE) - find_program(LLAMA_CCACHE_FOUND ccache) - if (LLAMA_CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set(ENV{CCACHE_SLOPPINESS} time_macros) - message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.") - else() - message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF") - endif () -endif() - -# this version of Apple ld64 is buggy -execute_process( - COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v - ERROR_VARIABLE output - OUTPUT_QUIET -) - -if (output MATCHES "dyld-1015\.7") - add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) -endif() - -# Architecture specific -# TODO: probably these flags need to be tweaked on some architectures -# feel free to update the Makefile for your architecture and send a pull request or issue -message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -if (MSVC) - string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) - message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") -else () - set(CMAKE_GENERATOR_PLATFORM_LWR "") -endif () - -if (NOT MSVC) - if (LLAMA_STATIC) - add_link_options(-static) - if (MINGW) - add_link_options(-static-libgcc -static-libstdc++) - endif() - endif() - if (LLAMA_GPROF) - add_compile_options(-pg) - endif() -endif() - -set(ARCH_FLAGS "") - -if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR 
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - message(STATUS "ARM detected") - if (MSVC) - add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead - add_compile_definitions(__ARM_NEON) - add_compile_definitions(__ARM_FEATURE_FMA) - - set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) - string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - add_compile_definitions(__ARM_FEATURE_DOTPROD) - endif () - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) - endif () - - check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - endif () - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) - else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND 
ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) - endif() - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - if (LLAMA_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) - endif() - endif() -elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - # instruction set detection for MSVC only - if (LLAMA_NATIVE) - include(cmake/FindSIMD.cmake) - endif () - if (LLAMA_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - if (LLAMA_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) - endif() - if (LLAMA_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) - endif() - if (LLAMA_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) - endif() - elseif (LLAMA_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (LLAMA_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (LLAMA_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (LLAMA_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (LLAMA_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (LLAMA_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (LLAMA_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (LLAMA_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (LLAMA_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - 
if (LLAMA_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (LLAMA_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) - else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - message(STATUS "loongarch64 detected") - - list(APPEND ARCH_FLAGS -march=loongarch64) - if (LLAMA_LASX) - list(APPEND ARCH_FLAGS -mlasx) - endif() - if (LLAMA_LSX) - list(APPEND ARCH_FLAGS -mlsx) - endif() - -else() - message(STATUS "Unknown architecture") -endif() - -add_compile_options("$<$:${ARCH_FLAGS}>") -add_compile_options("$<$:${ARCH_FLAGS}>") - -if (LLAMA_CUDA) - list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) - list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument - if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") - list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) - endif() - add_compile_options("$<$:${CUDA_FLAGS}>") -endif() - -if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory - add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) -endif() +llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) +llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) +llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) +llama_option_depr(WARNING LLAMA_METAL GGML_METAL) +llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) +llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) +llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP) +llama_option_depr(WARNING LLAMA_RPC GGML_RPC) +llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) +llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) # -# POSIX conformance +# build the library # 
-# clock_gettime came in POSIX.1b (1993) -# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional -# posix_memalign came in POSIX.1-2001 / SUSv3 -# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) -add_compile_definitions(_XOPEN_SOURCE=600) - -# Somehow in OpenBSD whenever POSIX conformance is specified -# some string functions rely on locale_t availability, -# which was introduced in POSIX.1-2008, forcing us to go higher -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - remove_definitions(-D_XOPEN_SOURCE=600) - add_compile_definitions(_XOPEN_SOURCE=700) -endif() - -# Data types, macros and functions related to controlling CPU affinity and -# some memory allocation are available on Linux through GNU extensions in libc -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - add_compile_definitions(_GNU_SOURCE) -endif() - -# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, -# and on macOS its availability depends on enabling Darwin extensions -# similarly on DragonFly, enabling BSD extensions is necessary -if ( - CMAKE_SYSTEM_NAME MATCHES "Darwin" OR - CMAKE_SYSTEM_NAME MATCHES "iOS" OR - CMAKE_SYSTEM_NAME MATCHES "tvOS" OR - CMAKE_SYSTEM_NAME MATCHES "DragonFly" -) - add_compile_definitions(_DARWIN_C_SOURCE) -endif() - -# alloca is a non-standard interface that is not visible on BSDs when -# POSIX conformance is specified, but not all of them provide a clean way -# to enable it in such cases -if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_compile_definitions(__BSD_VISIBLE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") - add_compile_definitions(_NETBSD_SOURCE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - add_compile_definitions(_BSD_SOURCE) -endif() - -# -# libraries -# - -# ggml - -add_library(ggml OBJECT - ggml.c - ggml.h - ggml-alloc.c - ggml-alloc.h - ggml-backend.c - ggml-backend.h - ggml-quants.c - ggml-quants.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_RPC} 
${GGML_HEADERS_RPC} - ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} - ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} - ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} - ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} - ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} - ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} - ) - -target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) -target_compile_features (ggml PUBLIC c_std_11) # don't bump - -target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) - -add_library(ggml_static STATIC $) - -if (BUILD_SHARED_LIBS) - set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) - add_library(ggml_shared SHARED $) - target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) - install(TARGETS ggml_shared LIBRARY) -endif() - -# llama - -add_library(llama - llama.cpp - llama.h - unicode.h - unicode.cpp - unicode-data.cpp - ) - -target_include_directories(llama PUBLIC .) 
-target_compile_features (llama PUBLIC cxx_std_11) # don't bump - -target_link_libraries(llama PRIVATE - ggml - ${LLAMA_EXTRA_LIBS} - ) - -if (BUILD_SHARED_LIBS) - set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) - if (LLAMA_METAL) - set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") - endif() -endif() - +add_subdirectory(ggml) +add_subdirectory(src) # # install @@ -1323,44 +114,35 @@ endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} - CACHE PATH "Location of header files") -set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} - CACHE PATH "Location of library files") -set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} - CACHE PATH "Location of binary files") -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) +set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) +set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) + +set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") + get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS) +set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) +install(TARGETS llama LIBRARY PUBLIC_HEADER) + configure_package_config_file( - ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama PATH_VARS LLAMA_INCLUDE_INSTALL_DIR 
LLAMA_LIB_INSTALL_DIR LLAMA_BIN_INSTALL_DIR ) write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake + ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake VERSION ${LLAMA_INSTALL_VERSION} COMPATIBILITY SameMajorVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama) - -set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" - "${GGML_HEADERS_CUDA}" - "${GGML_HEADERS_METAL}" - "${GGML_HEADERS_EXTRA}") - -set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") -install(TARGETS ggml PUBLIC_HEADER) - -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h) -install(TARGETS llama LIBRARY PUBLIC_HEADER) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) install( FILES convert-hf-to-gguf.py @@ -1373,22 +155,6 @@ install( WORLD_READ WORLD_EXECUTE DESTINATION ${CMAKE_INSTALL_BINDIR}) -if (LLAMA_METAL) - install( - FILES ggml-metal.metal - PERMISSIONS - OWNER_READ - OWNER_WRITE - GROUP_READ - WORLD_READ - DESTINATION ${CMAKE_INSTALL_BINDIR}) - if (NOT LLAMA_METAL_EMBED_LIBRARY) - install( - FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DESTINATION ${CMAKE_INSTALL_BINDIR} - ) - endif() -endif() configure_file(cmake/llama.pc.in "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" diff --git a/CMakePresets.json b/CMakePresets.json index fba22af9a..d69bc0344 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -19,14 +19,14 @@ "cacheVariables": { "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", "CMAKE_CXX_COMPILER": "icx", - "LLAMA_SYCL": "ON", + "GGML_SYCL": "ON", "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
} }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, - { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, - { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "arm64-windows-msvc", "hidden": true, diff --git a/Makefile b/Makefile index f6e8eb73e..64a6e6ff0 100644 --- a/Makefile +++ b/Makefile @@ -61,8 +61,80 @@ TEST_TARGETS = \ tests/test-tokenizer-1-bpe \ tests/test-tokenizer-1-spm -# Code coverage output files -COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report +# Deprecation aliases +ifdef LLAMA_CUBLAS +$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) 
+endif + +ifdef LLAMA_CUDA +GGML_CUDA := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_KOMPUTE +GGML_KOMPUTE := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_METAL +GGML_METAL := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_OPENMP +GGML_OPENMP := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_RPC +GGML_RPC := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_SYCL +GGML_SYCL := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_SYCL_F16 +GGML_SYCL_F16 := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_OPENBLAS +GGML_OPENBLAS := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_OPENBLAS64 +GGML_OPENBLAS64 := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_BLIS +GGML_BLIS := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_LLAMAFILE +GGML_NO_LLAMAFILE := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_ACCELERATE +GGML_NO_ACCELERATE := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_OPENMP +GGML_NO_OPENMP := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_METAL +GGML_NO_METAL := 1 +DEPRECATE_WARNING := 1 +endif ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -76,6 +148,12 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif +MK_CFLAGS += -O3 +MK_CXXFLAGS += -O3 +ifndef LLAMA_DEBUG +MK_NVCCFLAGS += -O3 +endif # LLAMA_DEBUG + # In GNU make default CXX is g++ instead of c++. Let's fix that so that users # of non-gcc compilers don't have to provide g++ alias or wrapper. 
DEFCC := cc @@ -90,11 +168,11 @@ endif # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 ifeq ($(UNAME_S),Darwin) - ifndef LLAMA_NO_METAL - LLAMA_METAL := 1 + ifndef GGML_NO_METAL + GGML_METAL := 1 endif - LLAMA_NO_OPENMP := 1 + GGML_NO_OPENMP := 1 ifneq ($(UNAME_P),arm) SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) @@ -106,7 +184,11 @@ ifeq ($(UNAME_S),Darwin) endif endif -ifdef LLAMA_RPC +ifdef GGML_METAL + GGML_METAL_EMBED_LIBRARY := 1 +endif + +ifdef GGML_RPC BUILD_TARGETS += rpc-server endif @@ -147,18 +229,6 @@ test: $(TEST_TARGETS) all: $(BUILD_TARGETS) $(TEST_TARGETS) -coverage: ## Run code coverage - gcov -pb tests/*.cpp - -lcov-report: coverage ## Generate lcov report - mkdir -p lcov-report - lcov --capture --directory . --output-file lcov-report/coverage.info - genhtml lcov-report/coverage.info --output-directory lcov-report - -gcovr-report: coverage ## Generate gcovr report - mkdir -p gcovr-report - gcovr --root . --html --html-details --output gcovr-report/coverage.html - ifdef RISCV_CROSS_COMPILE CC := riscv64-unknown-linux-gnu-gcc CXX := riscv64-unknown-linux-gnu-g++ @@ -169,26 +239,11 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -I. -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 -# -Ofast tends to produce faster code, but may not be available for some compilers. 
-ifdef LLAMA_FAST -MK_CFLAGS += -Ofast -HOST_CXXFLAGS += -Ofast -ifndef LLAMA_DEBUG -MK_NVCCFLAGS += -O3 -endif # LLAMA_DEBUG -else -MK_CFLAGS += -O3 -MK_CXXFLAGS += -O3 -ifndef LLAMA_DEBUG -MK_NVCCFLAGS += -O3 -endif # LLAMA_DEBUG -endif # LLAMA_FAST - ifndef LLAMA_NO_CCACHE CCACHE := $(shell which ccache) ifdef CCACHE @@ -243,8 +298,8 @@ ifeq ($(UNAME_S),OpenBSD) MK_CPPFLAGS += -D_BSD_SOURCE endif -ifdef LLAMA_SCHED_MAX_COPIES - MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES) +ifdef GGML_SCHED_MAX_COPIES + MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES) endif ifdef LLAMA_DEBUG @@ -287,19 +342,31 @@ ifdef LLAMA_SERVER_SSL MK_LDFLAGS += -lssl -lcrypto endif -ifdef LLAMA_CODE_COVERAGE - MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase '' -endif - ifdef LLAMA_DISABLE_LOGS MK_CPPFLAGS += -DLOG_DISABLE_LOGS endif # LLAMA_DISABLE_LOGS # warnings -WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \ - -Werror=implicit-function-declaration -MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn +WARN_FLAGS = \ + -Wall \ + -Wextra \ + -Wpedantic \ + -Wcast-qual \ + -Wno-unused-function + +MK_CFLAGS += \ + $(WARN_FLAGS) \ + -Wshadow \ + -Wstrict-prototypes \ + -Wpointer-arith \ + -Wmissing-prototypes \ + -Werror=implicit-int \ + -Werror=implicit-function-declaration + +MK_CXXFLAGS += \ + $(WARN_FLAGS) \ + -Wmissing-declarations \ + -Wmissing-noreturn ifeq ($(LLAMA_FATAL_WARNINGS),1) MK_CFLAGS += -Werror @@ -434,7 +501,7 @@ else MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif -ifndef LLAMA_NO_ACCELERATE +ifndef GGML_NO_ACCELERATE # Mac OS - include Accelerate framework. 
# `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) @@ -442,141 +509,157 @@ ifndef LLAMA_NO_ACCELERATE MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate - OBJS += ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas.o endif -endif # LLAMA_NO_ACCELERATE +endif # GGML_NO_ACCELERATE -ifndef LLAMA_NO_OPENMP +ifndef GGML_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp -endif # LLAMA_NO_OPENMP +endif # GGML_NO_OPENMP -ifdef LLAMA_OPENBLAS +ifdef GGML_OPENBLAS MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJS += ggml-blas.o -endif # LLAMA_OPENBLAS + OBJ_GGML += ggml/src/ggml-blas.o +endif # GGML_OPENBLAS -ifdef LLAMA_OPENBLAS64 +ifdef GGML_OPENBLAS64 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJS += ggml-blas.o -endif # LLAMA_OPENBLAS64 + OBJ_GGML += ggml/src/ggml-blas.o +endif # GGML_OPENBLAS64 -ifdef LLAMA_BLIS +ifdef GGML_BLIS MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib - OBJS += ggml-blas.o -endif # LLAMA_BLIS + OBJ_GGML += ggml/src/ggml-blas.o +endif # GGML_BLIS -ifndef LLAMA_NO_LLAMAFILE +ifndef GGML_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJS += sgemm.o + OBJ_GGML += ggml/src/sgemm.o endif -ifdef LLAMA_RPC - MK_CPPFLAGS += -DGGML_USE_RPC - OBJS += ggml-rpc.o -endif # LLAMA_RPC +ifdef GGML_RPC + MK_CPPFLAGS += -DGGML_USE_RPC + OBJ_GGML += ggml/src/ggml-rpc.o +endif # GGML_RPC -ifdef LLAMA_CUBLAS -# LLAMA_CUBLAS is deprecated and will be removed in the future - LLAMA_CUDA := 1 -endif +OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard 
ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) +OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu)) -OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu)) -OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu)) -ifdef LLAMA_CUDA_FA_ALL_QUANTS - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu)) +ifdef GGML_CUDA_FA_ALL_QUANTS + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu)) else - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) -endif # LLAMA_CUDA_FA_ALL_QUANTS + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) +endif # GGML_CUDA_FA_ALL_QUANTS -ifdef LLAMA_CUDA +ifdef GGML_CUDA ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /opt/cuda else CUDA_PATH ?= /usr/local/cuda endif + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - OBJS += ggml-cuda.o - OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) - OBJS += $(OBJS_CUDA_TEMP_INST) MK_NVCCFLAGS += -use_fast_math + + OBJ_GGML += ggml/src/ggml-cuda.o + 
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) + ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # LLAMA_FATAL_WARNINGS + ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT + ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo endif # LLAMA_DEBUG -ifdef LLAMA_CUDA_DEBUG + +ifdef GGML_CUDA_DEBUG MK_NVCCFLAGS += --device-debug -endif # LLAMA_CUDA_DEBUG -ifdef LLAMA_CUDA_NVCC - NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) +endif # GGML_CUDA_DEBUG + +ifdef GGML_CUDA_NVCC + NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else NVCC = $(CCACHE) nvcc -endif #LLAMA_CUDA_NVCC +endif #GGML_CUDA_NVCC + ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) else ifndef CUDA_POWER_ARCH MK_NVCCFLAGS += -arch=native endif # CUDA_DOCKER_ARCH -ifdef LLAMA_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_FORCE_DMMV MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # LLAMA_CUDA_FORCE_DMMV -ifdef LLAMA_CUDA_FORCE_MMQ +endif # GGML_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_FORCE_MMQ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ -endif # LLAMA_CUDA_FORCE_MMQ -ifdef LLAMA_CUDA_FORCE_CUBLAS +endif # GGML_CUDA_FORCE_MMQ + +ifdef GGML_CUDA_FORCE_CUBLAS MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS -endif # LLAMA_CUDA_FORCE_CUBLAS -ifdef LLAMA_CUDA_DMMV_X - MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) +endif # GGML_CUDA_FORCE_CUBLAS + +ifdef GGML_CUDA_DMMV_X + MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) else MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 -endif # LLAMA_CUDA_DMMV_X -ifdef LLAMA_CUDA_MMV_Y - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) -else ifdef LLAMA_CUDA_DMMV_Y - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility +endif # GGML_CUDA_DMMV_X + +ifdef GGML_CUDA_MMV_Y + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) +else ifdef GGML_CUDA_DMMV_Y + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for 
backwards compatibility else MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 -endif # LLAMA_CUDA_MMV_Y -ifdef LLAMA_CUDA_F16 +endif # GGML_CUDA_MMV_Y + +ifdef GGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 -endif # LLAMA_CUDA_F16 -ifdef LLAMA_CUDA_DMMV_F16 +endif # GGML_CUDA_F16 + +ifdef GGML_CUDA_DMMV_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 -endif # LLAMA_CUDA_DMMV_F16 -ifdef LLAMA_CUDA_KQUANTS_ITER - MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +endif # GGML_CUDA_DMMV_F16 + +ifdef GGML_CUDA_KQUANTS_ITER + MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) else MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif -ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE - MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) + +ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) else MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE -ifdef LLAMA_CUDA_NO_PEER_COPY +endif # GGML_CUDA_PEER_MAX_BATCH_SIZE + +ifdef GGML_CUDA_NO_PEER_COPY MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # LLAMA_CUDA_NO_PEER_COPY -ifdef LLAMA_CUDA_CCBIN - MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) -endif # LLAMA_CUDA_CCBIN -ifdef LLAMA_CUDA_FA_ALL_QUANTS +endif # GGML_CUDA_NO_PEER_COPY + +ifdef GGML_CUDA_CCBIN + MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN) +endif # GGML_CUDA_CCBIN + +ifdef GGML_CUDA_FA_ALL_QUANTS MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS -endif # LLAMA_CUDA_FA_ALL_QUANTS +endif # GGML_CUDA_FA_ALL_QUANTS ifdef JETSON_EOL_MODULE_DETECT define NVCC_COMPILE @@ -588,135 +671,187 @@ define NVCC_COMPILE endef # NVCC_COMPILE endif # JETSON_EOL_MODULE_DETECT -ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh +ggml/src/ggml-cuda/%.o: \ + ggml/src/ggml-cuda/%.cu \ + ggml/include/ggml.h \ + ggml/src/ggml-common.h \ + ggml/src/ggml-cuda/common.cuh $(NVCC_COMPILE) -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h 
ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) +ggml/src/ggml-cuda.o: \ + ggml/src/ggml-cuda.cu \ + ggml/include/ggml-cuda.h \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h \ + ggml/src/ggml-backend-impl.h \ + ggml/src/ggml-common.h \ + $(wildcard ggml/src/ggml-cuda/*.cuh) $(NVCC_COMPILE) -endif # LLAMA_CUDA +endif # GGML_CUDA -ifdef LLAMA_VULKAN - MK_CPPFLAGS += -DGGML_USE_VULKAN - MK_LDFLAGS += -lvulkan - OBJS += ggml-vulkan.o +ifdef GGML_VULKAN + MK_CPPFLAGS += -DGGML_USE_VULKAN + MK_LDFLAGS += -lvulkan + OBJ_GGML += ggml/src/ggml-vulkan.o -ifdef LLAMA_VULKAN_CHECK_RESULTS +ifdef GGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS endif -ifdef LLAMA_VULKAN_DEBUG +ifdef GGML_VULKAN_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_DEBUG endif -ifdef LLAMA_VULKAN_MEMORY_DEBUG +ifdef GGML_VULKAN_MEMORY_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG endif -ifdef LLAMA_VULKAN_VALIDATE +ifdef GGML_VULKAN_VALIDATE MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE endif -ifdef LLAMA_VULKAN_RUN_TESTS +ifdef GGML_VULKAN_RUN_TESTS MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS endif -ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h +ggml/src/ggml-vulkan.o: \ + ggml/src/ggml-vulkan.cpp \ + ggml/include/ggml-vulkan.h $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # LLAMA_VULKAN +endif # GGML_VULKAN -ifdef LLAMA_HIPBLAS +ifdef GGML_HIPBLAS ifeq ($(wildcard /opt/rocm),) - ROCM_PATH ?= /usr + ROCM_PATH ?= /usr AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) else ROCM_PATH ?= /opt/rocm AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif - HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc - LLAMA_CUDA_DMMV_X ?= 32 - LLAMA_CUDA_MMV_Y ?= 1 - LLAMA_CUDA_KQUANTS_ITER ?= 2 + + GGML_CUDA_DMMV_X ?= 32 + GGML_CUDA_MMV_Y ?= 1 + GGML_CUDA_KQUANTS_ITER ?= 2 + MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -ifdef LLAMA_HIP_UMA + +ifdef GGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA -endif # LLAMA_HIP_UMA - MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib - MK_LDFLAGS += 
-L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64 - MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas - HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) - HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) - HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) - HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -ifdef LLAMA_CUDA_FORCE_DMMV - HIPFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # LLAMA_CUDA_FORCE_DMMV -ifdef LLAMA_CUDA_NO_PEER_COPY - HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # LLAMA_CUDA_NO_PEER_COPY - OBJS += ggml-cuda.o - OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) - OBJS += $(OBJS_CUDA_TEMP_INST) +endif # GGML_HIP_UMA -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) + MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib + MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64 + MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas + + HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc + + HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) + HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) + HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) + HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) + +ifdef GGML_CUDA_FORCE_DMMV + HIPFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # GGML_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_NO_PEER_COPY + HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # GGML_CUDA_NO_PEER_COPY + + OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) + +ggml/src/ggml-cuda.o: \ + ggml/src/ggml-cuda.cu \ + ggml/include/ggml-cuda.h \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h \ + ggml/src/ggml-backend-impl.h \ + ggml/src/ggml-common.h \ + $(wildcard ggml/src/ggml-cuda/*.cuh) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< -ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh +ggml/src/ggml-cuda/%.o: \ + ggml/src/ggml-cuda/%.cu \ 
+ ggml/include/ggml.h \ + ggml/src/ggml-common.h \ + ggml/src/ggml-cuda/common.cuh $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +endif # GGML_HIPBLAS -endif # LLAMA_HIPBLAS - -ifdef LLAMA_METAL +ifdef GGML_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJS += ggml-metal.o -ifdef LLAMA_METAL_NDEBUG + OBJ_GGML += ggml/src/ggml-metal.o +ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif -ifdef LLAMA_METAL_EMBED_LIBRARY +ifdef GGML_METAL_EMBED_LIBRARY MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY - OBJS += ggml-metal-embed.o + OBJ_GGML += ggml/src/ggml-metal-embed.o endif -endif # LLAMA_METAL +endif # GGML_METAL -ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h +ifdef GGML_METAL +ggml/src/ggml-metal.o: \ + ggml/src/ggml-metal.m \ + ggml/include/ggml-metal.h \ + ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ -ifdef LLAMA_METAL_EMBED_LIBRARY -ggml-metal-embed.o: ggml-metal.metal ggml-common.h +ifdef GGML_METAL_EMBED_LIBRARY +ggml/src/ggml-metal-embed.o: \ + ggml/src/ggml-metal.metal \ + ggml/src/ggml-common.h @echo "Embedding Metal library" - @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal + @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + 
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) @$(AS) $(TEMP_ASSEMBLY) -o $@ @rm -f ${TEMP_ASSEMBLY} endif -endif # LLAMA_METAL +endif # GGML_METAL -OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o -COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o +OBJ_GGML += \ + ggml/src/ggml.o \ + ggml/src/ggml-alloc.o \ + ggml/src/ggml-backend.o \ + ggml/src/ggml-quants.o -ifndef LLAMA_NO_LLAMAFILE -sgemm.o: sgemm.cpp sgemm.h ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif +OBJ_LLAMA = \ + src/llama.o \ + src/unicode.o \ + src/unicode-data.o -ifdef LLAMA_RPC -ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +OBJ_COMMON = \ + common/common.o \ + common/console.o \ + common/ngram-cache.o \ + common/sampling.o \ + common/train.o \ + common/grammar-parser.o \ + common/build-info.o \ + common/json-schema-to-grammar.o -rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) -rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -endif # LLAMA_RPC +LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT) +LIB_GGML_S = $(LIB_PRE)ggml.a + +LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT) +LIB_LLAMA_S = $(LIB_PRE)llama.a + +LIB_COMMON = $(LIB_PRE)common$(DSO_EXT) +LIB_COMMON_S = $(LIB_PRE)common.a + +LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON) +LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S) GF_CC := $(CC) include scripts/get-flags.mk @@ -730,7 +865,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) # identify CUDA host compiler -ifdef LLAMA_CUDA +ifdef GGML_CUDA GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler include scripts/get-flags.mk 
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic @@ -755,85 +890,203 @@ $(info I NVCCFLAGS: $(NVCCFLAGS)) $(info I LDFLAGS: $(LDFLAGS)) $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) -ifdef LLAMA_CUDA +ifdef GGML_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) + ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus ) endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH + endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) -endif # LLAMA_CUDA +endif # GGML_CUDA $(info ) -ifdef LLAMA_CUBLAS -$(info !!!!) -$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.) -$(info !!!!) +ifdef DEPRECATE_WARNING +$(info !!! DEPRECATION WARNING !!!) +$(info The following LLAMA_ options are deprecated and will be removed in the future. 
Use the GGML_ prefix instead) +$(info - LLAMA_CUDA) +$(info - LLAMA_METAL) +$(info - LLAMA_METAL_EMBED_LIBRARY) +$(info - LLAMA_OPENMP) +$(info - LLAMA_RPC) +$(info - LLAMA_SYCL) +$(info - LLAMA_SYCL_F16) +$(info - LLAMA_OPENBLAS) +$(info - LLAMA_OPENBLAS64) +$(info - LLAMA_BLIS) +$(info - LLAMA_NO_LLAMAFILE) +$(info - LLAMA_NO_ACCELERATE) +$(info - LLAMA_NO_OPENMP) +$(info - LLAMA_NO_METAL) $(info ) endif # -# Build library +# Build libraries # -ggml.o: ggml.c ggml.h ggml-cuda.h +# ggml + +ggml/src/ggml.o: \ + ggml/src/ggml.c \ + ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h +ggml/src/ggml-alloc.o: \ + ggml/src/ggml-alloc.c \ + ggml/include/ggml.h \ + ggml/include/ggml-alloc.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h +ggml/src/ggml-backend.o: \ + ggml/src/ggml-backend.c \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h +ggml/src/ggml-quants.o: \ + ggml/src/ggml-quants.c \ + ggml/include/ggml.h \ + ggml/src/ggml-quants.h \ + ggml/src/ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-blas.o: ggml-blas.cpp ggml-blas.h +ggml/src/ggml-blas.o: \ + ggml/src/ggml-blas.cpp \ + ggml/include/ggml-blas.h $(CXX) $(CXXFLAGS) -c $< -o $@ -unicode.o: unicode.cpp unicode.h +ifndef GGML_NO_LLAMAFILE +ggml/src/sgemm.o: \ + ggml/src/sgemm.cpp \ + ggml/src/sgemm.h \ + ggml/include/ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # GGML_NO_LLAMAFILE -unicode-data.o: unicode-data.cpp unicode-data.h +ifdef GGML_RPC +ggml/src/ggml-rpc.o: \ + ggml/src/ggml-rpc.cpp \ + ggml/include/ggml-rpc.h $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # GGML_RPC -llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common.o: common/common.cpp $(COMMON_H_DEPS) - $(CXX) $(CXXFLAGS) -c $< -o $@ - -sampling.o: common/sampling.cpp $(COMMON_H_DEPS) - $(CXX) 
$(CXXFLAGS) -c $< -o $@ - -console.o: common/console.cpp common/console.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -train.o: common/train.cpp common/train.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -libllama.so: llama.o ggml.o $(OBJS) +$(LIB_GGML): \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) - ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) +$(LIB_GGML_S): \ + $(OBJ_GGML) + ar rcs $(LIB_GGML_S) $^ + +# llama + +src/unicode.o: \ + src/unicode.cpp \ + src/unicode.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/unicode-data.o: \ + src/unicode-data.cpp \ + src/unicode-data.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama.o: \ + src/llama.cpp \ + src/unicode.h \ + include/llama.h \ + ggml/include/ggml-cuda.h \ + ggml/include/ggml-metal.h \ + ggml/include/ggml.h \ + ggml/include/ggml-alloc.h \ + ggml/include/ggml-backend.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +$(LIB_LLAMA): \ + $(OBJ_LLAMA) \ + $(LIB_GGML) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + +$(LIB_LLAMA_S): \ + $(OBJ_LLAMA) + ar rcs $(LIB_LLAMA_S) $^ + +# common + +common/common.o: \ + common/common.cpp \ + common/common.h \ + common/console.h \ + common/sampling.h \ + common/json.hpp \ + common/json-schema-to-grammar.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/sampling.o: \ + common/sampling.cpp \ + common/sampling.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/console.o: \ + common/console.cpp \ + common/console.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/grammar-parser.o: \ + common/grammar-parser.cpp \ + common/grammar-parser.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + 
+common/json-schema-to-grammar.o: \ + common/json-schema-to-grammar.cpp \ + common/json-schema-to-grammar.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/train.o: \ + common/train.cpp \ + common/train.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/ngram-cache.o: \ + common/ngram-cache.cpp \ + common/ngram-cache.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +$(LIB_COMMON): \ + $(OBJ_COMMON) \ + $(LIB_LLAMA) \ + $(LIB_GGML) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + +$(LIB_COMMON_S): \ + $(OBJ_COMMON) + ar rcs $(LIB_COMMON_S) $^ clean: - rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) - rm -vrf ggml-cuda/*.o - rm -vrf ggml-cuda/template-instances/*.o + rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS) + rm -rvf src/*.o + rm -rvf tests/*.o + rm -rvf examples/*.o + rm -rvf *.a + rm -rvf *.dll + rm -rvf *.so + rm -rvf *.dot + rm -rvf ggml/*.a + rm -rvf ggml/*.dll + rm -rvf ggml/*.so + rm -vrf ggml/src/*.o + rm -rvf common/build-info.cpp + rm -vrf ggml/src/ggml-metal-embed.metal + rm -vrf ggml/src/ggml-cuda/*.o + rm -vrf ggml/src/ggml-cuda/template-instances/*.o + rm -rvf $(BUILD_TARGETS) + rm -rvf $(TEST_TARGETS) find examples pocs -type f -name "*.o" -delete # @@ -847,62 +1100,202 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) -llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-cli: examples/main/main.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo @echo '==== Run ./llama-cli -h for help. 
====' @echo -llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-infill: examples/infill/infill.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-simple: examples/simple/simple.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-tokenize: examples/tokenize/tokenize.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched: examples/batched/batched.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched-bench: examples/batched-bench/batched-bench.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-quantize: examples/quantize/quantize.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) +llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call 
GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-perplexity: examples/perplexity/perplexity.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-imatrix: examples/imatrix/imatrix.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-embedding: examples/embedding/embedding.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-gritlm: examples/gritlm/gritlm.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-save-load-state: examples/save-load-state/save-load-state.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp 
examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-gguf: examples/gguf/gguf.cpp \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-gguf-split: examples/gguf-split/gguf-split.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-eval-callback: examples/eval-callback/eval-callback.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ + $(OBJ_GGML) $(OBJ_LLAMA) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-bench: examples/llama-bench/llama-bench.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-baby-llama: examples/baby-llama/baby-llama.cpp \ + $(OBJ_ALL) + $(CXX) 
$(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-finetune: examples/finetune/finetune.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-export-lora: examples/export-lora/export-lora.cpp \ + $(OBJ_GGML) common/log.h + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-retrieval: examples/retrieval/retrieval.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-speculative: examples/speculative/speculative.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-parallel: examples/parallel/parallel.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookahead: examples/lookahead/lookahead.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookup: examples/lookup/lookup.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookup-create: examples/lookup/lookup-create.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookup-merge: examples/lookup/lookup-merge.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) 
$(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-lookup-stats: examples/lookup/lookup-stats.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-passkey: examples/passkey/passkey.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +ifdef GGML_RPC +rpc-server: examples/rpc/rpc-server.cpp \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +endif # GGML_RPC + +llama-server: \ + examples/server/server.cpp \ + examples/server/utils.hpp \ + examples/server/httplib.h \ + examples/server/colorthemes.css.hpp \ + examples/server/style.css.hpp \ + examples/server/theme-beeninorder.css.hpp \ + examples/server/theme-ketivah.css.hpp \ + examples/server/theme-mangotango.css.hpp \ + examples/server/theme-playground.css.hpp \ + examples/server/theme-polarnight.css.hpp \ + examples/server/theme-snowstorm.css.hpp \ + examples/server/index.html.hpp \ + examples/server/index-new.html.hpp \ + examples/server/index.js.hpp \ + examples/server/completion.js.hpp \ + examples/server/system-prompts.js.hpp \ + examples/server/prompt-formats.js.hpp \ + examples/server/json-schema-to-grammar.mjs.hpp \ + common/json.hpp \ + common/stb_image.h \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) @@ -915,95 +1308,26 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) - $(CXX) 
$(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +libllava.a: examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + common/stb_image.h \ + common/base64.hpp \ + 
$(OBJ_ALL) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual -llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-llava-cli: examples/llava/llava-cli.cpp \ + examples/llava/clip.h \ + examples/llava/clip.cpp \ + examples/llava/llava.h \ + examples/llava/llava.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) -llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o 
$(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) 
$(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) @@ -1017,7 +1341,7 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh rm $@.tmp; \ fi -build-info.o: common/build-info.cpp +common/build-info.o: common/build-info.cpp $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ # @@ -1026,7 +1350,8 @@ build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) +llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \ + $(OBJ_GGML) common/build-info.o $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1035,85 +1360,108 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift -llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp \ + $(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-integration: 
tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-grammar-integration: tests/test-grammar-integration.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) +tests/test-double-float: tests/test-double-float.cpp $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) +tests/test-grad0: tests/test-grad0.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) +tests/test-opt: tests/test-opt.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) +tests/test-quantize-perf: tests/test-quantize-perf.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o 
$@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) +tests/test-sampling: tests/test-sampling.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) +tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c llama.h +tests/test-c.o: tests/test-c.c include/llama.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ -tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) +tests/test-backend-ops: tests/test-backend-ops.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp 
$(COMMON_DEPS) $(OBJS) +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) +tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-chat-template: tests/test-chat-template.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +# +# PoCs +# + +llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 183e64757..77fed86df 100644 --- a/Package.swift +++ b/Package.swift @@ -3,14 +3,13 @@ import PackageDescription var sources = [ - "ggml.c", - "sgemm.cpp", - "llama.cpp", - "unicode.cpp", - "unicode-data.cpp", - "ggml-alloc.c", - "ggml-backend.c", - "ggml-quants.c", + "src/llama.cpp", + "src/unicode.cpp", + "src/unicode-data.cpp", + "ggml/src/ggml.c", + "ggml/src/ggml-alloc.c", + "ggml/src/ggml-backend.c", + "ggml/src/ggml-quants.c", ] var resources: [Resource] = [] @@ -26,8 +25,8 @@ var cSettings: [CSetting] = [ ] #if canImport(Darwin) -sources.append("ggml-metal.m") -resources.append(.process("ggml-metal.metal")) 
+sources.append("ggml/src/ggml-metal.m") +resources.append(.process("ggml/src/ggml-metal.metal")) linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ @@ -63,8 +62,6 @@ let package = Package( "models", "tests", "CMakeLists.txt", - "ggml-cuda.cu", - "ggml-cuda.h", "Makefile" ], sources: sources, diff --git a/README-sycl.md b/README-sycl.md index b7e2bb12a..885983e92 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets. ### Build image ```sh # Using FP16 -docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . +docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . ``` *Notes*: -To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command. +To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command. You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative. 
@@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh # Build LLAMA with MKL BLAS acceleration for intel GPU # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR # Build LLAMA with Nvidia BLAS acceleration through SYCL # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -422,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release +cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release # Option 2: Or FP16 -cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx 
-DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON cmake --build build --config Release -j ``` @@ -440,7 +440,7 @@ Or, use CMake presets to build: cmake --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli -cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release +cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli cmake --preset x64-windows-sycl-debug @@ -544,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |--------------------|-----------------------------------|---------------------------------------------| -| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. | -| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | -| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. | +| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | +| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | | CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/README.md b/README.md index 95d970d83..6ca5ba43e 100644 --- a/README.md +++ b/README.md @@ -415,7 +415,7 @@ Flox follows the nixpkgs build of llama.cpp. ### Metal Build On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. -To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option. +To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option. 
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line argument. @@ -435,7 +435,7 @@ Building the program with BLAS support may lead to some performance improvements - Using `make`: - On Linux: ```bash - make LLAMA_OPENBLAS=1 + make GGML_OPENBLAS=1 ``` - On Windows: @@ -450,13 +450,13 @@ Building the program with BLAS support may lead to some performance improvements 8. From here you can run: ```bash - make LLAMA_OPENBLAS=1 + make GGML_OPENBLAS=1 ``` - Using `CMake` on Linux: ```bash - cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS cmake --build build --config Release ``` @@ -475,10 +475,10 @@ Building the program with BLAS support may lead to some performance improvements Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md). - Using manual oneAPI installation: - By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: + By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically be selected.
Otherwise please install oneAPI and follow the below steps: ```bash source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation - cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON cmake --build build --config Release ``` @@ -495,28 +495,28 @@ Building the program with BLAS support may lead to some performance improvements - Using `make`: ```bash - make LLAMA_CUDA=1 + make GGML_CUDA=1 ``` - Using `CMake`: ```bash - cmake -B build -DLLAMA_CUDA=ON + cmake -B build -DGGML_CUDA=ON cmake --build build --config Release ``` The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: - | Option | Legal values | Default | Description | - |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. 
Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | - | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | - | LLAMA_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | - | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | - | LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. 
| + | Option | Legal values | Default | Description | + |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | + | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | + | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | + | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | + | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. 
Can improve performance on relatively recent GPUs. | + | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | + | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | - #### hipBLAS @@ -526,15 +526,15 @@ Building the program with BLAS support may lead to some performance improvements - Using `make`: ```bash - make LLAMA_HIPBLAS=1 + make GGML_HIPBLAS=1 ``` - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` - On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`. + On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). 
Note that if you get the following error: @@ -548,19 +548,19 @@ Building the program with BLAS support may lead to some performance improvements ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIP_DEVICE_LIB_PATH= \ - cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build -- -j 16 ``` - Using `make` (example for target gfx1030, build with 16 CPU threads): ```bash - make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 + make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 ``` - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% - cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake --build build ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) @@ -571,11 +571,11 @@ Building the program with BLAS support may lead to some performance improvements If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. 
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): - | Option | Legal values | Default | Description | - |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | Option | Legal values | Default | Description | + |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. 
| + | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### Vulkan @@ -613,7 +613,7 @@ Building the program with BLAS support may lead to some performance improvements Then, build llama.cpp using the cmake command below: ```bash - cmake -B build -DLLAMA_VULKAN=1 + cmake -B build -DGGML_VULKAN=1 cmake --build build --config Release # Test the output binary (with "-ngl 33" to offload all layers to GPU) ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 diff --git a/ci/run.sh b/ci/run.sh index 291c44f47..e0cedb24f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -36,11 +36,11 @@ SRC=`pwd` CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" fi if [ ! -z ${GG_BUILD_CUDA} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1" fi if [ ! -z ${GG_BUILD_SYCL} ]; then @@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then exit 1 fi - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" fi ## helpers @@ -284,7 +284,7 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 ..
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf @@ -550,7 +550,7 @@ function gg_run_pythia_2_8b { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf diff --git a/scripts/build-info.cmake b/cmake/build-info.cmake similarity index 100% rename from scripts/build-info.cmake rename to cmake/build-info.cmake diff --git a/cmake/git-vars.cmake b/cmake/git-vars.cmake new file mode 100644 index 000000000..1a4c24ebf --- /dev/null +++ b/cmake/git-vars.cmake @@ -0,0 +1,22 @@ +find_package(Git) + +# the commit's SHA1 +execute_process(COMMAND + "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_SHA1 + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + +# the date of the commit +execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_DATE + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + +# the subject of the commit +execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%s + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_COMMIT_SUBJECT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/scripts/LlamaConfig.cmake.in b/cmake/llama-config.cmake.in similarity index 73% rename from scripts/LlamaConfig.cmake.in rename to cmake/llama-config.cmake.in index 9311055d9..2e7da2f8e 100644 --- a/scripts/LlamaConfig.cmake.in +++ b/cmake/llama-config.cmake.in @@ -1,41 +1,43 @@ -set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) +set(LLAMA_VERSION 
@LLAMA_INSTALL_VERSION@) set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) -set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) -set(LLAMA_BLAS @LLAMA_BLAS@) -set(LLAMA_CUDA @LLAMA_CUDA@) -set(LLAMA_METAL @LLAMA_METAL@) -set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@) -set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@) +set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) + +set(GGML_BLAS @GGML_BLAS@) +set(GGML_CUDA @GGML_CUDA@) +set(GGML_METAL @GGML_METAL@) +set(GGML_HIPBLAS @GGML_HIPBLAS@) +set(GGML_ACCELERATE @GGML_ACCELERATE@) @PACKAGE_INIT@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") -set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") -set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") +set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") +set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") # Ensure transient dependencies satisfied find_package(Threads REQUIRED) -if (APPLE AND LLAMA_ACCELERATE) + +if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) endif() -if (LLAMA_BLAS) +if (GGML_BLAS) find_package(BLAS REQUIRED) endif() -if (LLAMA_CUDA) +if (GGML_CUDA) find_package(CUDAToolkit REQUIRED) endif() -if (LLAMA_METAL) +if (GGML_METAL) find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(METAL_FRAMEWORK Metal REQUIRED) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) endif() -if (LLAMA_HIPBLAS) +if (GGML_HIPBLAS) find_package(hip REQUIRED) find_package(hipblas REQUIRED) find_package(rocblas REQUIRED) @@ -47,7 +49,9 @@ find_library(llama_LIBRARY llama set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@") set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@") + add_library(llama UNKNOWN IMPORTED) + set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 171530c91..761971d68 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ 
-1,5 +1,6 @@ # common +find_package(Threads REQUIRED) # Build info header # @@ -36,7 +37,7 @@ add_custom_command( COMMENT "Generating build details from Git" COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake" + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} VERBATIM @@ -83,5 +84,5 @@ if (LLAMA_CURL) endif () target_include_directories(${TARGET} PUBLIC .) -target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +target_compile_features (${TARGET} PUBLIC cxx_std_11) +target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/scripts/gen-build-info-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake similarity index 86% rename from scripts/gen-build-info-cpp.cmake rename to common/cmake/build-info-gen-cpp.cmake index d89338920..fbc92b52c 100644 --- a/scripts/gen-build-info-cpp.cmake +++ b/common/cmake/build-info-gen-cpp.cmake @@ -1,7 +1,7 @@ -include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") -set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") +set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") # Only write the build info if it changed if(EXISTS ${OUTPUT_FILE}) diff --git a/docs/BLIS.md b/docs/BLIS.md index c933766b7..35d06bd0f 100644 --- a/docs/BLIS.md +++ b/docs/BLIS.md @@ -30,8 +30,8 @@ We recommend using openmp since it's 
easier to modify the cores being used. Makefile: ```bash -make LLAMA_BLIS=1 -j -# make LLAMA_BLIS=1 benchmark-matmult +make GGML_BLIS=1 -j +# make GGML_BLIS=1 llama-benchmark-matmult ``` CMake: @@ -39,7 +39,7 @@ CMake: ```bash mkdir build cd build -cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME .. +cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME .. make -j ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0b51c44c0..7d9ab3457 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,13 +39,13 @@ else() add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (LLAMA_RPC) + if (GGML_RPC) add_subdirectory(rpc) endif() if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() - if (LLAMA_SYCL) + if (GGML_SYCL) add_subdirectory(sycl) endif() add_subdirectory(save-load-state) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 38b36ee5a..29602881a 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -LLAMA_CUDA=1 make -j +GGML_CUDA=1 make -j # generate importance matrix (imatrix.dat) ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 05a8207e6..f6c619c87 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens ## Orin compile and run ### compile ```sh -make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32 ``` ### run on Orin ### case 1 diff --git a/examples/rpc/README.md b/examples/rpc/README.md index 86544e3fe..e1da801f2 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on 
the same host, each with a d ## Usage -On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options. +On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options. For example, to build the CUDA backend with RPC support: ```bash mkdir build-rpc-cuda cd build-rpc-cuda -cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON +cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON cmake --build . --config Release ``` @@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`: +On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: ```bash mkdir build-rpc cd build-rpc -cmake .. -DLLAMA_RPC=ON +cmake .. -DGGML_RPC=ON cmake --build . --config Release ``` diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 8365f9510..dbe41f1fd 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,7 +1,14 @@ set(TARGET llama-server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) + include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + +if (MINGW) + # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006 + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + set(TARGET_SRCS server.cpp utils.hpp @@ -24,6 +31,7 @@ set(PUBLIC_ASSETS prompt-formats.js json-schema-to-grammar.mjs ) + foreach(asset ${PUBLIC_ASSETS}) set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}") set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") @@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS}) COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P 
"${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" ) endforeach() + add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> ) + target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) + if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) endif() + if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() + target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index db46d57ca..8fe0a6790 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -8,10 +8,10 @@ cd build source /opt/intel/oneapi/setvars.sh #for FP16 -#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference +#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference #for FP32 -cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx #build example/main #cmake --build . --config Release --target main diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index 027173b0a..cdae5a528 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR :: for FP16 :: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON :: for FP32 -cmake -G "Ninja" .. 
-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release if %errorlevel% neq 0 goto ERROR :: build example/main only :: make main diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt new file mode 100644 index 000000000..f3763f7eb --- /dev/null +++ b/ggml/CMakeLists.txt @@ -0,0 +1,238 @@ +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. +project("ggml" C CXX) +include(CheckIncludeFileCXX) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(GGML_STANDALONE ON) + + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + + # configure project version + # TODO +else() + set(GGML_STANDALONE OFF) +endif() + +if (EMSCRIPTEN) + set(BUILD_SHARED_LIBS_DEFAULT OFF) + + option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON) +else() + if (MINGW) + set(BUILD_SHARED_LIBS_DEFAULT OFF) + else() + set(BUILD_SHARED_LIBS_DEFAULT ON) + endif() +endif() + +option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) + +# +# option list +# + +# TODO: mark all options as advanced when not GGML_STANDALONE + +if (APPLE) + set(GGML_METAL_DEFAULT ON) + set(GGML_BLAS_DEFAULT ON) + set(GGML_BLAS_VENDOR_DEFAULT "Apple") +else() + set(GGML_METAL_DEFAULT OFF) + set(GGML_BLAS_DEFAULT OFF) + set(GGML_BLAS_VENDOR_DEFAULT "Generic") +endif() + +# general +option(GGML_STATIC "ggml: static link libraries" OFF) +option(GGML_NATIVE "ggml: enable -march=native flag" ON) +option(GGML_LTO "ggml: enable link time optimization" OFF) 
+option(GGML_CCACHE "ggml: use ccache if available" ON) + +# debug +option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON) +option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF) +option(GGML_GPROF "ggml: enable gprof" OFF) + +# build +option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF) + +# sanitizers +option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF) +option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF) +option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) + +# instruction set specific +if (GGML_NATIVE) + set(INS_ENB OFF) +else() + set(INS_ENB ON) +endif() + +option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) + +option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) +option(GGML_AVX512 "ggml: enable AVX512" OFF) +option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) +option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) +option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) +option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) +if (NOT MSVC) + option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 +endif() +option(GGML_LASX "ggml: enable lasx" ON) +option(GGML_LSX "ggml: enable lsx" ON) +option(GGML_SVE "ggml: enable SVE" OFF) + +if (WIN32) + set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") +endif() + +# ggml core +set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") + +# 3rd party libs / backends +option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) +option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) +set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING + "ggml: BLAS library vendor") +option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF) + +option(GGML_CUDA "ggml: use CUDA" OFF) +option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) +option(GGML_CUDA_FORCE_MMQ "ggml: use mmq 
kernels instead of cuBLAS" OFF) +set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") +set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") +option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) +set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING + "ggml: iters./thread per block for Q2_K/Q6_K") +set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING + "ggml: max. batch size for using peer access") +option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) +option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) +option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) + +option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) +option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) +option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) +option(GGML_VULKAN "ggml: use Vulkan" OFF) +option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) +option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) +option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) +option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) +option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) +option(GGML_KOMPUTE "ggml: use Kompute" OFF) +option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) +option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) +option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) +option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL}) +set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING + "ggml: metal minimum macOS version") +set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") +option(GGML_OPENMP "ggml: use OpenMP" ON) +option(GGML_RPC "ggml: use RPC" OFF) +option(GGML_SYCL "ggml: use SYCL" OFF) +option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) +set (GGML_SYCL_TARGET "INTEL" CACHE 
STRING + "ggml: sycl target device") + +# extra artifacts +option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) +option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) + +# +# dependencies +# + +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED true) + +if (GGML_SYCL) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 11) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED true) + +set(THREADS_PREFER_PTHREAD_FLAG ON) + +find_package(Threads REQUIRED) + +# +# build the library +# + +add_subdirectory(src) + +# +# tests and examples +# + +if (GGML_BUILD_TESTS) + enable_testing() + add_subdirectory(tests) +endif () + +if (GGML_BUILD_EXAMPLES) + add_subdirectory(examples) +endif () + +# +# install +# + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + +set(GGML_PUBLIC_HEADERS + include/ggml.h + include/ggml-alloc.h + include/ggml-backend.h + "${GGML_HEADERS_CUDA}" + "${GGML_HEADERS_METAL}" + "${GGML_HEADERS_EXTRA}") + +set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") +#if (GGML_METAL) +# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") +#endif() +install(TARGETS ggml PUBLIC_HEADER) + +if (BUILD_SHARED_LIBS) + install(TARGETS ggml LIBRARY) +endif() + +if (GGML_METAL) + install( + FILES src/ggml-metal.metal + PERMISSIONS + OWNER_READ + OWNER_WRITE + GROUP_READ + WORLD_READ + DESTINATION ${CMAKE_INSTALL_BINDIR}) + + if (NOT GGML_METAL_EMBED_LIBRARY) + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() +endif() + +if (GGML_STANDALONE) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + @ONLY) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + DESTINATION share/pkgconfig) +endif() diff --git a/cmake/FindSIMD.cmake b/ggml/cmake/FindSIMD.cmake similarity index 94% rename from cmake/FindSIMD.cmake rename to ggml/cmake/FindSIMD.cmake index 
33377ec44..5533668ec 100644 --- a/cmake/FindSIMD.cmake +++ b/ggml/cmake/FindSIMD.cmake @@ -79,22 +79,22 @@ endmacro() # flags are for MSVC only! check_sse("AVX" " ;/arch:AVX") if (NOT ${AVX_FOUND}) - set(LLAMA_AVX OFF) + set(GGML_AVX OFF) else() - set(LLAMA_AVX ON) + set(GGML_AVX ON) endif() check_sse("AVX2" " ;/arch:AVX2") check_sse("FMA" " ;/arch:AVX2") if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND})) - set(LLAMA_AVX2 OFF) + set(GGML_AVX2 OFF) else() - set(LLAMA_AVX2 ON) + set(GGML_AVX2 ON) endif() check_sse("AVX512" " ;/arch:AVX512") if (NOT ${AVX512_FOUND}) - set(LLAMA_AVX512 OFF) + set(GGML_AVX512 OFF) else() - set(LLAMA_AVX512 ON) + set(GGML_AVX512 ON) endif() diff --git a/ggml_vk_generate_shaders.py b/ggml/ggml_vk_generate_shaders.py similarity index 100% rename from ggml_vk_generate_shaders.py rename to ggml/ggml_vk_generate_shaders.py diff --git a/ggml-alloc.h b/ggml/include/ggml-alloc.h similarity index 100% rename from ggml-alloc.h rename to ggml/include/ggml-alloc.h diff --git a/ggml-backend.h b/ggml/include/ggml-backend.h similarity index 100% rename from ggml-backend.h rename to ggml/include/ggml-backend.h diff --git a/ggml-blas.h b/ggml/include/ggml-blas.h similarity index 100% rename from ggml-blas.h rename to ggml/include/ggml-blas.h diff --git a/ggml-cuda.h b/ggml/include/ggml-cuda.h similarity index 100% rename from ggml-cuda.h rename to ggml/include/ggml-cuda.h diff --git a/ggml-kompute.h b/ggml/include/ggml-kompute.h similarity index 100% rename from ggml-kompute.h rename to ggml/include/ggml-kompute.h diff --git a/ggml-metal.h b/ggml/include/ggml-metal.h similarity index 100% rename from ggml-metal.h rename to ggml/include/ggml-metal.h diff --git a/ggml-rpc.h b/ggml/include/ggml-rpc.h similarity index 100% rename from ggml-rpc.h rename to ggml/include/ggml-rpc.h diff --git a/ggml-sycl.h b/ggml/include/ggml-sycl.h similarity index 95% rename from ggml-sycl.h rename to ggml/include/ggml-sycl.h index 451938fc4..43ab1519c 100644 --- a/ggml-sycl.h 
+++ b/ggml/include/ggml-sycl.h @@ -8,7 +8,9 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-sycl/presets.hpp" + +#define GGML_SYCL_NAME "SYCL" +#define GGML_SYCL_MAX_DEVICES 48 #ifdef __cplusplus extern "C" { diff --git a/ggml-vulkan.h b/ggml/include/ggml-vulkan.h similarity index 100% rename from ggml-vulkan.h rename to ggml/include/ggml-vulkan.h diff --git a/ggml.h b/ggml/include/ggml.h similarity index 100% rename from ggml.h rename to ggml/include/ggml.h diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt new file mode 100644 index 000000000..ba341d374 --- /dev/null +++ b/ggml/src/CMakeLists.txt @@ -0,0 +1,1171 @@ +include(CheckCXXCompilerFlag) + +unset(GGML_CDEF_PUBLIC) + +add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) + +# enable libstdc++ assertions for debug builds +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>) +endif() + +if (NOT MSVC) + if (GGML_SANITIZE_THREAD) + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (GGML_SANITIZE_ADDRESS) + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries (-fsanitize=address) + endif() + + if (GGML_SANITIZE_UNDEFINED) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() +endif() + +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + add_compile_definitions(GGML_USE_ACCELERATE) + add_compile_definitions(ACCELERATE_NEW_LAPACK) + add_compile_definitions(ACCELERATE_LAPACK_ILP64) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + + 
message(STATUS 
"Metal framework found") + set(GGML_HEADERS_METAL ../include/ggml-metal.h) + set(GGML_SOURCES_METAL ggml-metal.m) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_METAL) + if (GGML_METAL_NDEBUG) + add_compile_definitions(GGML_METAL_NDEBUG) + endif() + + # copy ggml-common.h and ggml-metal.metal to bin directory + configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + + if (GGML_METAL_EMBED_LIBRARY) + enable_language(ASM) + + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") + set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + + # merge ggml-common.h and ggml-metal.metal into a single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") + + add_custom_command( + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ggml-metal.metal ggml-common.h + COMMENT "Generate assembly for embedded Metal library" + ) + + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + else() + if (GGML_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal 
-fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) + endif() + + # Append macOS metal versioning flags + if (GGML_METAL_MACOSX_VERSION_MIN) + message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation") + list (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN}) + endif() + + if (GGML_METAL_STD) + message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation") + list (APPEND XC_FLAGS -std=${GGML_METAL_STD}) + endif() + + add_custom_command( + OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h + COMMENT "Compiling Metal kernels" + ) + + add_custom_target( + ggml-metal ALL + DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + ) + endif() # GGML_METAL_EMBED_LIBRARY + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ) +endif() + +if (GGML_OPENMP) + 
find_package(OpenMP) + if (OpenMP_FOUND) + message(STATUS "OpenMP found") + + add_compile_definitions(GGML_USE_OPENMP) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() +endif() + +if (GGML_BLAS) + if (GGML_STATIC) + set(BLA_STATIC ON) + endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() + + set(BLA_VENDOR ${GGML_BLAS_VENDOR}) + find_package(BLAS) + + if (BLAS_FOUND) + message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") + + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${GGML_BLAS_VENDOR} MATCHES "Apple")) + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. + # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${GGML_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS REQUIRED blas) + elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") + # As of openblas v0.3.22, the 64-bit is named openblas64.pc + pkg_check_modules(DepBLAS openblas64) + if (NOT DepBLAS_FOUND) + pkg_check_modules(DepBLAS REQUIRED openblas) + endif() + elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") + pkg_check_modules(DepBLAS REQUIRED blis) + elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS REQUIRED blas-atlas) + elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS REQUIRED flexiblas_api) + elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") + # this doesn't provide pkg-config + # suggest to assign BLAS_INCLUDE_DIRS on your own + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Better to set NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") + endif() + endif() + if (DepBLAS_FOUND) + 
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" + " detected by pkgconfig, trying to find cblas.h from possible paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() + endif() + + message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + + add_compile_options(${BLAS_LINKER_FLAGS}) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_BLAS) + + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() + + set(GGML_HEADERS_BLAS ../include/ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.cpp) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES}) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS not found, please refer to " + "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" + " to set correct GGML_BLAS_VENDOR") + endif() +endif() + +if (GGML_LLAMAFILE) + message(STATUS "Using ggml SGEMM") + + add_compile_definitions(GGML_USE_LLAMAFILE) + + set(GGML_HEADERS_LLAMAFILE sgemm.h) + set(GGML_SOURCES_LLAMAFILE sgemm.cpp) +endif() + +if (GGML_CUDA) + cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES + + find_package(CUDAToolkit) + + if (CUDAToolkit_FOUND) + message(STATUS "CUDA found") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # 52 == lowest CUDA 12 standard + # 60 == FP16 CUDA intrinsics + # 61 == integer CUDA intrinsics + # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") + else() + set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75") + 
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + + enable_language(CUDA) + + file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh") + list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h") + + file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") + file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + endif() + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) + + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) + add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + + if (GGML_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) + endif() + + if (GGML_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() + + if (DEFINED GGML_CUDA_DMMV_Y) + 
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility + endif() + + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_F16) + endif() + + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + if (GGML_STATIC) + if (WIN32) + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + if (GGML_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + endif() + else() + message(WARNING "CUDA not found") + endif() +endif() + +if (GGML_HIPBLAS) + if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH /usr) + else() + set(ROCM_PATH /opt/rocm) + endif() + else() + set(ROCM_PATH $ENV{ROCM_PATH}) + endif() + + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") + + # CMake on Windows doesn't support the HIP language yet + if (WIN32) + set(CXX_IS_HIPCC TRUE) + else() + string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") + endif() + + if (CXX_IS_HIPCC) + if (LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. See README for details.") + endif() + else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. 
+ if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) + endif() + + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + + message(STATUS "HIP and hipBLAS found") + + file(GLOB GGML_HEADERS_ROCM "ggml-cuda/*.cuh") + list(APPEND GGML_HEADERS_ROCM "../include/ggml-cuda.h") + + file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") + file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + endif() + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) + + add_compile_definitions(GGML_USE_HIPBLAS) + add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) + + if (GGML_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) + endif() + + if (GGML_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + if (CXX_IS_HIPCC) + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES 
LANGUAGE CXX) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device) + else() + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) + endif() + + if (GGML_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) +endif() + +if (GGML_SYCL) + if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") + endif() + + if ( NOT DEFINED ENV{ONEAPI_ROOT}) + message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") + endif() + #todo: AOT + + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + + message(STATUS "SYCL found") + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL) + + if (GGML_SYCL_F16) + add_compile_definitions(GGML_SYCL_F16) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_SYCL_FORCE_MMQ) + endif() + + add_compile_options(-I./) #include DPCT + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + endif() + + file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp") + list(APPEND GGML_HEADERS_SYCL "../include/ggml-sycl.h") + + file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp") + list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") + + if (WIN32) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + else() + add_compile_options(-I/${SYCL_INCLUDE_DIR}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + + if (GGML_SYCL_TARGET STREQUAL "INTEL") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} 
-fsycl pthread m dl onemkl) + endif() + endif() +endif() + +if (GGML_RPC) + message(STATUS "RPC found") + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) + + if (WIN32) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32) + endif() + + set(GGML_HEADERS_RPC ../include/ggml-rpc.h) + set(GGML_SOURCES_RPC ggml-rpc.cpp) +endif() + +if (GGML_VULKAN) + find_package(Vulkan) + + if (Vulkan_FOUND) + message(STATUS "Vulkan found") + + set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h) + set(GGML_SOURCES_VULKAN ggml-vulkan.cpp) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN) + + # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build + # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector + if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0) + endif() + + if (GGML_VULKAN_CHECK_RESULTS) + add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) + endif() + + if (GGML_VULKAN_DEBUG) + add_compile_definitions(GGML_VULKAN_DEBUG) + endif() + + if (GGML_VULKAN_MEMORY_DEBUG) + add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG) + endif() + + if (GGML_VULKAN_VALIDATE) + add_compile_definitions(GGML_VULKAN_VALIDATE) + endif() + + if (GGML_VULKAN_RUN_TESTS) + add_compile_definitions(GGML_VULKAN_RUN_TESTS) + endif() + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan) + else() + message(WARNING "Vulkan not found") + endif() +endif() + +if (GGML_KOMPUTE) + add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) + + find_package(Vulkan COMPONENTS glslc REQUIRED) + find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) + + if (NOT glslc_executable) + message(FATAL_ERROR "glslc not found") + endif() + + function(compile_shader) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES) + cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(source 
${compile_shader_SOURCES}) + get_filename_component(filename ${source} NAME) + set(spv_file ${filename}.spv) + add_custom_command( + OUTPUT ${spv_file} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMENT "Compiling ${source} to ${spv_file}" + ) + + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + 
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() + endforeach() + endfunction() + + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") + add_subdirectory(kompute) + + # Compile our shaders + compile_shader(SOURCES + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + kompute-shaders/op_diagmask.comp + kompute-shaders/op_mul_mat_mat_f32.comp + kompute-shaders/op_mul_mat_f16.comp + kompute-shaders/op_mul_mat_q8_0.comp + kompute-shaders/op_mul_mat_q4_0.comp + kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q6_k.comp + kompute-shaders/op_getrows_f32.comp + kompute-shaders/op_getrows_f16.comp + kompute-shaders/op_getrows_q4_0.comp + kompute-shaders/op_getrows_q4_1.comp + kompute-shaders/op_getrows_q6_k.comp + kompute-shaders/op_rope_f16.comp + 
kompute-shaders/op_rope_f32.comp + kompute-shaders/op_cpy_f16_f16.comp + kompute-shaders/op_cpy_f16_f32.comp + kompute-shaders/op_cpy_f32_f16.comp + kompute-shaders/op_cpy_f32_f32.comp + ) + + # Create a custom target for our generated shaders + add_custom_target(generated_shaders DEPENDS + shaderop_scale.h + shaderop_scale_8.h + shaderop_add.h + shaderop_addrow.h + shaderop_mul.h + shaderop_silu.h + shaderop_relu.h + shaderop_gelu.h + shaderop_softmax.h + shaderop_norm.h + shaderop_rmsnorm.h + shaderop_diagmask.h + shaderop_mul_mat_mat_f32.h + shaderop_mul_mat_f16.h + shaderop_mul_mat_q8_0.h + shaderop_mul_mat_q4_0.h + shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q6_k.h + shaderop_getrows_f32.h + shaderop_getrows_f16.h + shaderop_getrows_q4_0.h + shaderop_getrows_q4_1.h + shaderop_getrows_q6_k.h + shaderop_rope_f16.h + shaderop_rope_f32.h + shaderop_cpy_f16_f16.h + shaderop_cpy_f16_f32.h + shaderop_cpy_f32_f16.h + shaderop_cpy_f32_f32.h + ) + + # Create a custom command that depends on the generated_shaders + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + DEPENDS generated_shaders + COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" + ) + + # Add the stamp to the main sources to ensure dependency tracking + set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ../include/ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + else() + message(WARNING "Kompute not found") + endif() +endif() + +if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + + message(STATUS "Using memkind for CPU HBM") + + add_compile_definitions(GGML_USE_CPU_HBM) + + target_link_libraries(ggml PUBLIC 
memkind) +endif() + +function(get_flags CCID CCVER) + set(C_FLAGS "") + set(CXX_FLAGS "") + + if (CCID MATCHES "Clang") + set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) + set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) + + if ( + (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR + (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) + ) + list(APPEND C_FLAGS -Wdouble-promotion) + endif() + elseif (CCID STREQUAL "GNU") + set(C_FLAGS -Wdouble-promotion) + set(CXX_FLAGS -Wno-array-bounds) + + if (CCVER VERSION_GREATER_EQUAL 7.1.0) + list(APPEND CXX_FLAGS -Wno-format-truncation) + endif() + if (CCVER VERSION_GREATER_EQUAL 8.1.0) + list(APPEND CXX_FLAGS -Wextra-semi) + endif() + endif() + + set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) + set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) +endfunction() + +if (GGML_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() +endif() + +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" + "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + # todo : msvc + set(C_FLAGS "") + set(CXX_FLAGS "") + endif() +endif() + +set(CUDA_CXX_FLAGS "") + +if (GGML_CUDA) + set(CUDA_FLAGS -use_fast_math) + + if (GGML_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) + 
endif() + + if (GGML_ALL_WARNINGS AND NOT MSVC) + set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) + if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + endif() + + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler --version + OUTPUT_VARIABLE CUDA_CCFULLVER + ERROR_QUIET + ) + + if (NOT CUDA_CCFULLVER MATCHES clang) + set(CUDA_CCID "GNU") + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" + OUTPUT_VARIABLE CUDA_CCVER + ERROR_QUIET + ) + else() + if (CUDA_CCFULLVER MATCHES Apple) + set(CUDA_CCID "AppleClang") + else() + set(CUDA_CCID "Clang") + endif() + string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) + endif() + + message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + + get_flags(${CUDA_CCID} ${CUDA_CCVER}) + list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later + endif() + + if (NOT MSVC) + list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) + endif() +endif() + +if (GGML_LTO) + include(CheckIPOSupported) + check_ipo_supported(RESULT result OUTPUT output) + if (result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(WARNING "IPO is not supported: ${output}") + endif() +endif() + +if (GGML_CCACHE) + find_program(GGML_CCACHE_FOUND ccache) + + if (GGML_CCACHE_FOUND) + # TODO: should not be set globally + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "ccache found, compilation results will be cached. 
Disable with GGML_CCACHE=OFF.") + else() + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF") + endif () +endif() + +# this version of Apple ld64 is buggy +execute_process( + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v + ERROR_VARIABLE output + OUTPUT_QUIET +) + +if (output MATCHES "dyld-1015\.7") + add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) +endif() + +# architecture specific +# TODO: probably these flags need to be tweaked on some architectures +# feel free to update the Makefile for your architecture and send a pull request or issue +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +if (MSVC) + string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) + message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") +else () + set(CMAKE_GENERATOR_PLATFORM_LWR "") +endif () + +if (NOT MSVC) + if (GGML_STATIC) + add_link_options(-static) + if (MINGW) + add_link_options(-static-libgcc -static-libstdc++) + endif() + endif() + if (GGML_GPROF) + add_compile_options(-pg) + endif() +endif() + +set(ARCH_FLAGS "") + +if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + + message(STATUS "ARM detected") + + if (MSVC) + add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead + add_compile_definitions(__ARM_NEON) + add_compile_definitions(__ARM_FEATURE_FMA) + + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + add_compile_definitions(__ARM_FEATURE_DOTPROD) + 
endif () + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + endif () + + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) + endif() + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) + endif() + endif() +elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + message(STATUS "x86 detected") + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + # TODO: improve, should not reference 
files from the parent folder + include(../cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (GGML_AVX512_VBMI) + add_compile_definitions($<$:__AVX512VBMI__>) + add_compile_definitions($<$:__AVX512VBMI__>) + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions($<$:__AVX512VNNI__>) + add_compile_definitions($<$:__AVX512VNNI__>) + endif() + if (GGML_AVX512_BF16) + add_compile_definitions($<$:__AVX512BF16__>) + add_compile_definitions($<$:__AVX512BF16__>) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + endif() + else() + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + endif() + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512bw) + endif() + if (GGML_AVX512_VBMI) + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND 
ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() +else() + message(STATUS "Unknown architecture") +endif() + +add_compile_options("$<$:${ARCH_FLAGS}>") +add_compile_options("$<$:${ARCH_FLAGS}>") + +if (GGML_CUDA) + list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + + add_compile_options("$<$:${CUDA_FLAGS}>") +endif() + +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + +# +# POSIX conformance +# + +# clock_gettime came in POSIX.1b (1993) +# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) +add_compile_definitions(_XOPEN_SOURCE=600) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + remove_definitions(-D_XOPEN_SOURCE=600) + add_compile_definitions(_XOPEN_SOURCE=700) +endif() + +# Data types, macros and functions related to controlling CPU affinity and +# some memory allocation are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if ( + CMAKE_SYSTEM_NAME MATCHES "Darwin" OR + CMAKE_SYSTEM_NAME MATCHES "iOS" OR + CMAKE_SYSTEM_NAME MATCHES "tvOS" OR + CMAKE_SYSTEM_NAME MATCHES "DragonFly" +) + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a 
non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +if (WIN32) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + + if (BUILD_SHARED_LIBS) + # TODO: should not use this + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() +endif() + +# +# libraries +# + +# ggml + +add_library(ggml + ../include/ggml.h + ../include/ggml-alloc.h + ../include/ggml-backend.h + ggml.c + ggml-alloc.c + ggml-backend.c + ggml-quants.c + ggml-quants.h + ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} + ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC} + ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} + ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} + ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} + ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ) + +if (EMSCRIPTEN) + set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128") +endif() + +target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) +target_include_directories(ggml PUBLIC ../include) +target_include_directories(ggml PRIVATE . 
${GGML_EXTRA_INCLUDES}) +target_compile_features (ggml PRIVATE c_std_11) # don't bump + +target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) + +find_library(MATH_LIBRARY m) +if (MATH_LIBRARY) + target_link_libraries(ggml PRIVATE ${MATH_LIBRARY}) +endif() + +if (BUILD_SHARED_LIBS) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() diff --git a/ggml-alloc.c b/ggml/src/ggml-alloc.c similarity index 100% rename from ggml-alloc.c rename to ggml/src/ggml-alloc.c diff --git a/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h similarity index 100% rename from ggml-backend-impl.h rename to ggml/src/ggml-backend-impl.h diff --git a/ggml-backend.c b/ggml/src/ggml-backend.c similarity index 100% rename from ggml-backend.c rename to ggml/src/ggml-backend.c diff --git a/ggml-blas.cpp b/ggml/src/ggml-blas.cpp similarity index 100% rename from ggml-blas.cpp rename to ggml/src/ggml-blas.cpp diff --git a/ggml-common.h b/ggml/src/ggml-common.h similarity index 100% rename from ggml-common.h rename to ggml/src/ggml-common.h diff --git a/ggml-cuda.cu b/ggml/src/ggml-cuda.cu similarity index 100% rename from ggml-cuda.cu rename to ggml/src/ggml-cuda.cu diff --git a/ggml-cuda/acc.cu b/ggml/src/ggml-cuda/acc.cu similarity index 100% rename from ggml-cuda/acc.cu rename to ggml/src/ggml-cuda/acc.cu diff --git a/ggml-cuda/acc.cuh b/ggml/src/ggml-cuda/acc.cuh similarity index 100% rename from ggml-cuda/acc.cuh rename to ggml/src/ggml-cuda/acc.cuh diff --git a/ggml-cuda/arange.cu b/ggml/src/ggml-cuda/arange.cu similarity index 100% rename from ggml-cuda/arange.cu rename to ggml/src/ggml-cuda/arange.cu diff --git a/ggml-cuda/arange.cuh b/ggml/src/ggml-cuda/arange.cuh similarity index 100% rename from ggml-cuda/arange.cuh rename to ggml/src/ggml-cuda/arange.cuh diff --git a/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu similarity index 100% rename from ggml-cuda/argsort.cu rename to ggml/src/ggml-cuda/argsort.cu diff --git 
a/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh similarity index 100% rename from ggml-cuda/argsort.cuh rename to ggml/src/ggml-cuda/argsort.cuh diff --git a/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu similarity index 100% rename from ggml-cuda/binbcast.cu rename to ggml/src/ggml-cuda/binbcast.cu diff --git a/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh similarity index 100% rename from ggml-cuda/binbcast.cuh rename to ggml/src/ggml-cuda/binbcast.cuh diff --git a/ggml-cuda/clamp.cu b/ggml/src/ggml-cuda/clamp.cu similarity index 100% rename from ggml-cuda/clamp.cu rename to ggml/src/ggml-cuda/clamp.cu diff --git a/ggml-cuda/clamp.cuh b/ggml/src/ggml-cuda/clamp.cuh similarity index 100% rename from ggml-cuda/clamp.cuh rename to ggml/src/ggml-cuda/clamp.cuh diff --git a/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh similarity index 100% rename from ggml-cuda/common.cuh rename to ggml/src/ggml-cuda/common.cuh diff --git a/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu similarity index 100% rename from ggml-cuda/concat.cu rename to ggml/src/ggml-cuda/concat.cu diff --git a/ggml-cuda/concat.cuh b/ggml/src/ggml-cuda/concat.cuh similarity index 100% rename from ggml-cuda/concat.cuh rename to ggml/src/ggml-cuda/concat.cuh diff --git a/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu similarity index 100% rename from ggml-cuda/convert.cu rename to ggml/src/ggml-cuda/convert.cu diff --git a/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh similarity index 100% rename from ggml-cuda/convert.cuh rename to ggml/src/ggml-cuda/convert.cuh diff --git a/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu similarity index 100% rename from ggml-cuda/cpy.cu rename to ggml/src/ggml-cuda/cpy.cu diff --git a/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh similarity index 100% rename from ggml-cuda/cpy.cuh rename to ggml/src/ggml-cuda/cpy.cuh diff --git a/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh similarity index 100% rename 
from ggml-cuda/dequantize.cuh rename to ggml/src/ggml-cuda/dequantize.cuh diff --git a/ggml-cuda/diagmask.cu b/ggml/src/ggml-cuda/diagmask.cu similarity index 100% rename from ggml-cuda/diagmask.cu rename to ggml/src/ggml-cuda/diagmask.cu diff --git a/ggml-cuda/diagmask.cuh b/ggml/src/ggml-cuda/diagmask.cuh similarity index 100% rename from ggml-cuda/diagmask.cuh rename to ggml/src/ggml-cuda/diagmask.cuh diff --git a/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu similarity index 100% rename from ggml-cuda/dmmv.cu rename to ggml/src/ggml-cuda/dmmv.cu diff --git a/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh similarity index 100% rename from ggml-cuda/dmmv.cuh rename to ggml/src/ggml-cuda/dmmv.cuh diff --git a/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh similarity index 99% rename from ggml-cuda/fattn-common.cuh rename to ggml/src/ggml-cuda/fattn-common.cuh index 37b3b9932..bd7993595 100644 --- a/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -603,7 +603,7 @@ static void on_no_fattn_vec_case(const int D) { if (D == 64) { fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); fprintf(stderr, "By default only f16 KV cache is supported.\n"); - fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); + fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); GGML_ASSERT(false); } else if (D == 128) { fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); @@ -611,7 +611,7 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n"); fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); - fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); + fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, 
and f16.\n"); GGML_ASSERT(false); } else { fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu similarity index 100% rename from ggml-cuda/fattn-tile-f16.cu rename to ggml/src/ggml-cuda/fattn-tile-f16.cu diff --git a/ggml-cuda/fattn-tile-f16.cuh b/ggml/src/ggml-cuda/fattn-tile-f16.cuh similarity index 100% rename from ggml-cuda/fattn-tile-f16.cuh rename to ggml/src/ggml-cuda/fattn-tile-f16.cuh diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu similarity index 100% rename from ggml-cuda/fattn-tile-f32.cu rename to ggml/src/ggml-cuda/fattn-tile-f32.cu diff --git a/ggml-cuda/fattn-tile-f32.cuh b/ggml/src/ggml-cuda/fattn-tile-f32.cuh similarity index 100% rename from ggml-cuda/fattn-tile-f32.cuh rename to ggml/src/ggml-cuda/fattn-tile-f32.cuh diff --git a/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh similarity index 100% rename from ggml-cuda/fattn-vec-f16.cuh rename to ggml/src/ggml-cuda/fattn-vec-f16.cuh diff --git a/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh similarity index 100% rename from ggml-cuda/fattn-vec-f32.cuh rename to ggml/src/ggml-cuda/fattn-vec-f32.cuh diff --git a/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh similarity index 100% rename from ggml-cuda/fattn-wmma-f16.cuh rename to ggml/src/ggml-cuda/fattn-wmma-f16.cuh diff --git a/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu similarity index 100% rename from ggml-cuda/fattn.cu rename to ggml/src/ggml-cuda/fattn.cu diff --git a/ggml-cuda/fattn.cuh b/ggml/src/ggml-cuda/fattn.cuh similarity index 100% rename from ggml-cuda/fattn.cuh rename to ggml/src/ggml-cuda/fattn.cuh diff --git a/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu similarity index 100% rename from ggml-cuda/getrows.cu rename to ggml/src/ggml-cuda/getrows.cu diff --git a/ggml-cuda/getrows.cuh b/ggml/src/ggml-cuda/getrows.cuh 
similarity index 100% rename from ggml-cuda/getrows.cuh rename to ggml/src/ggml-cuda/getrows.cuh diff --git a/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu similarity index 100% rename from ggml-cuda/im2col.cu rename to ggml/src/ggml-cuda/im2col.cu diff --git a/ggml-cuda/im2col.cuh b/ggml/src/ggml-cuda/im2col.cuh similarity index 100% rename from ggml-cuda/im2col.cuh rename to ggml/src/ggml-cuda/im2col.cuh diff --git a/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh similarity index 100% rename from ggml-cuda/mma.cuh rename to ggml/src/ggml-cuda/mma.cuh diff --git a/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu similarity index 100% rename from ggml-cuda/mmq.cu rename to ggml/src/ggml-cuda/mmq.cu diff --git a/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh similarity index 100% rename from ggml-cuda/mmq.cuh rename to ggml/src/ggml-cuda/mmq.cuh diff --git a/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu similarity index 100% rename from ggml-cuda/mmvq.cu rename to ggml/src/ggml-cuda/mmvq.cu diff --git a/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh similarity index 100% rename from ggml-cuda/mmvq.cuh rename to ggml/src/ggml-cuda/mmvq.cuh diff --git a/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu similarity index 100% rename from ggml-cuda/norm.cu rename to ggml/src/ggml-cuda/norm.cu diff --git a/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh similarity index 100% rename from ggml-cuda/norm.cuh rename to ggml/src/ggml-cuda/norm.cuh diff --git a/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu similarity index 100% rename from ggml-cuda/pad.cu rename to ggml/src/ggml-cuda/pad.cu diff --git a/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh similarity index 100% rename from ggml-cuda/pad.cuh rename to ggml/src/ggml-cuda/pad.cuh diff --git a/ggml-cuda/pool2d.cu b/ggml/src/ggml-cuda/pool2d.cu similarity index 100% rename from ggml-cuda/pool2d.cu rename to ggml/src/ggml-cuda/pool2d.cu diff --git a/ggml-cuda/pool2d.cuh b/ggml/src/ggml-cuda/pool2d.cuh similarity index 
100% rename from ggml-cuda/pool2d.cuh rename to ggml/src/ggml-cuda/pool2d.cuh diff --git a/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu similarity index 100% rename from ggml-cuda/quantize.cu rename to ggml/src/ggml-cuda/quantize.cu diff --git a/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh similarity index 100% rename from ggml-cuda/quantize.cuh rename to ggml/src/ggml-cuda/quantize.cuh diff --git a/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu similarity index 100% rename from ggml-cuda/rope.cu rename to ggml/src/ggml-cuda/rope.cu diff --git a/ggml-cuda/rope.cuh b/ggml/src/ggml-cuda/rope.cuh similarity index 100% rename from ggml-cuda/rope.cuh rename to ggml/src/ggml-cuda/rope.cuh diff --git a/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu similarity index 100% rename from ggml-cuda/scale.cu rename to ggml/src/ggml-cuda/scale.cu diff --git a/ggml-cuda/scale.cuh b/ggml/src/ggml-cuda/scale.cuh similarity index 100% rename from ggml-cuda/scale.cuh rename to ggml/src/ggml-cuda/scale.cuh diff --git a/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu similarity index 100% rename from ggml-cuda/softmax.cu rename to ggml/src/ggml-cuda/softmax.cu diff --git a/ggml-cuda/softmax.cuh b/ggml/src/ggml-cuda/softmax.cuh similarity index 100% rename from ggml-cuda/softmax.cuh rename to ggml/src/ggml-cuda/softmax.cuh diff --git a/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu similarity index 100% rename from ggml-cuda/sumrows.cu rename to ggml/src/ggml-cuda/sumrows.cu diff --git a/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh similarity index 100% rename from ggml-cuda/sumrows.cuh rename to ggml/src/ggml-cuda/sumrows.cuh diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu diff --git a/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py similarity index 100% rename from ggml-cuda/template-instances/generate_cu_files.py rename to ggml/src/ggml-cuda/template-instances/generate_cu_files.py diff --git a/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q2_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q3_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q4_1.cu 
b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q4_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q5_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q6_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu diff --git a/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu similarity index 100% rename from ggml-cuda/tsembd.cu 
rename to ggml/src/ggml-cuda/tsembd.cu diff --git a/ggml-cuda/tsembd.cuh b/ggml/src/ggml-cuda/tsembd.cuh similarity index 100% rename from ggml-cuda/tsembd.cuh rename to ggml/src/ggml-cuda/tsembd.cuh diff --git a/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu similarity index 100% rename from ggml-cuda/unary.cu rename to ggml/src/ggml-cuda/unary.cu diff --git a/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh similarity index 100% rename from ggml-cuda/unary.cuh rename to ggml/src/ggml-cuda/unary.cuh diff --git a/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu similarity index 100% rename from ggml-cuda/upscale.cu rename to ggml/src/ggml-cuda/upscale.cu diff --git a/ggml-cuda/upscale.cuh b/ggml/src/ggml-cuda/upscale.cuh similarity index 100% rename from ggml-cuda/upscale.cuh rename to ggml/src/ggml-cuda/upscale.cuh diff --git a/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh similarity index 100% rename from ggml-cuda/vecdotq.cuh rename to ggml/src/ggml-cuda/vecdotq.cuh diff --git a/ggml-impl.h b/ggml/src/ggml-impl.h similarity index 100% rename from ggml-impl.h rename to ggml/src/ggml-impl.h diff --git a/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp similarity index 100% rename from ggml-kompute.cpp rename to ggml/src/ggml-kompute.cpp diff --git a/ggml-metal.m b/ggml/src/ggml-metal.m similarity index 100% rename from ggml-metal.m rename to ggml/src/ggml-metal.m diff --git a/ggml-metal.metal b/ggml/src/ggml-metal.metal similarity index 100% rename from ggml-metal.metal rename to ggml/src/ggml-metal.metal diff --git a/ggml-quants.c b/ggml/src/ggml-quants.c similarity index 100% rename from ggml-quants.c rename to ggml/src/ggml-quants.c diff --git a/ggml-quants.h b/ggml/src/ggml-quants.h similarity index 100% rename from ggml-quants.h rename to ggml/src/ggml-quants.h diff --git a/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp similarity index 100% rename from ggml-rpc.cpp rename to ggml/src/ggml-rpc.cpp diff --git a/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp 
similarity index 99% rename from ggml-sycl.cpp rename to ggml/src/ggml-sycl.cpp index db045336f..4a668a2c3 100644 --- a/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -37,6 +37,7 @@ #include "ggml-backend-impl.h" #include "ggml-sycl/backend.hpp" +#include "ggml-sycl/presets.hpp" bool ggml_sycl_loaded(void); void ggml_sycl_free_data(struct ggml_tensor * tensor); diff --git a/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp similarity index 100% rename from ggml-sycl/backend.hpp rename to ggml/src/ggml-sycl/backend.hpp diff --git a/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp similarity index 100% rename from ggml-sycl/common.cpp rename to ggml/src/ggml-sycl/common.cpp diff --git a/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp similarity index 99% rename from ggml-sycl/common.hpp rename to ggml/src/ggml-sycl/common.hpp index 414c37eed..e01f91633 100644 --- a/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -17,6 +17,7 @@ #include #include "dpct/helper.hpp" +#include "ggml-sycl.h" #include "presets.hpp" #define GGML_COMMON_DECL_SYCL diff --git a/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp similarity index 100% rename from ggml-sycl/convert.cpp rename to ggml/src/ggml-sycl/convert.cpp diff --git a/ggml-sycl/convert.hpp b/ggml/src/ggml-sycl/convert.hpp similarity index 100% rename from ggml-sycl/convert.hpp rename to ggml/src/ggml-sycl/convert.hpp diff --git a/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp similarity index 100% rename from ggml-sycl/dequantize.hpp rename to ggml/src/ggml-sycl/dequantize.hpp diff --git a/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp similarity index 100% rename from ggml-sycl/dmmv.cpp rename to ggml/src/ggml-sycl/dmmv.cpp diff --git a/ggml-sycl/dmmv.hpp b/ggml/src/ggml-sycl/dmmv.hpp similarity index 100% rename from ggml-sycl/dmmv.hpp rename to ggml/src/ggml-sycl/dmmv.hpp diff --git a/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp similarity index 100% rename from 
ggml-sycl/dpct/helper.hpp rename to ggml/src/ggml-sycl/dpct/helper.hpp diff --git a/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp similarity index 100% rename from ggml-sycl/mmq.cpp rename to ggml/src/ggml-sycl/mmq.cpp diff --git a/ggml-sycl/mmq.hpp b/ggml/src/ggml-sycl/mmq.hpp similarity index 100% rename from ggml-sycl/mmq.hpp rename to ggml/src/ggml-sycl/mmq.hpp diff --git a/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp similarity index 100% rename from ggml-sycl/mmvq.cpp rename to ggml/src/ggml-sycl/mmvq.cpp diff --git a/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp similarity index 100% rename from ggml-sycl/mmvq.hpp rename to ggml/src/ggml-sycl/mmvq.hpp diff --git a/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp similarity index 96% rename from ggml-sycl/presets.hpp rename to ggml/src/ggml-sycl/presets.hpp index 5e6b61813..fe9d41770 100644 --- a/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -15,8 +15,6 @@ #define GGML_SYCL_MAX_STREAMS 8 #define GGML_SYCL_MAX_BUFFERS 256 -#define GGML_SYCL_MAX_DEVICES 48 -#define GGML_SYCL_NAME "SYCL" #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses diff --git a/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp similarity index 100% rename from ggml-sycl/vecdotq.hpp rename to ggml/src/ggml-sycl/vecdotq.hpp diff --git a/ggml-vulkan-shaders.hpp b/ggml/src/ggml-vulkan-shaders.hpp similarity index 100% rename from ggml-vulkan-shaders.hpp rename to ggml/src/ggml-vulkan-shaders.hpp diff --git a/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp similarity index 100% rename from ggml-vulkan.cpp rename to ggml/src/ggml-vulkan.cpp diff --git a/ggml.c b/ggml/src/ggml.c similarity index 100% rename from ggml.c rename to ggml/src/ggml.c diff --git a/kompute b/ggml/src/kompute similarity index 100% rename from kompute rename to ggml/src/kompute diff --git a/kompute-shaders/common.comp b/ggml/src/kompute-shaders/common.comp similarity index 100% rename from kompute-shaders/common.comp rename to ggml/src/kompute-shaders/common.comp diff --git a/kompute-shaders/op_add.comp b/ggml/src/kompute-shaders/op_add.comp similarity index 100% rename from kompute-shaders/op_add.comp rename to ggml/src/kompute-shaders/op_add.comp diff --git a/kompute-shaders/op_addrow.comp b/ggml/src/kompute-shaders/op_addrow.comp similarity index 100% rename from kompute-shaders/op_addrow.comp rename to ggml/src/kompute-shaders/op_addrow.comp diff --git a/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/kompute-shaders/op_cpy_f16_f16.comp similarity index 100% rename from kompute-shaders/op_cpy_f16_f16.comp rename to ggml/src/kompute-shaders/op_cpy_f16_f16.comp diff --git a/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/kompute-shaders/op_cpy_f16_f32.comp similarity index 100% rename from kompute-shaders/op_cpy_f16_f32.comp rename to ggml/src/kompute-shaders/op_cpy_f16_f32.comp diff --git a/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/kompute-shaders/op_cpy_f32_f16.comp similarity index 100% rename from kompute-shaders/op_cpy_f32_f16.comp rename to 
ggml/src/kompute-shaders/op_cpy_f32_f16.comp diff --git a/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/kompute-shaders/op_cpy_f32_f32.comp similarity index 100% rename from kompute-shaders/op_cpy_f32_f32.comp rename to ggml/src/kompute-shaders/op_cpy_f32_f32.comp diff --git a/kompute-shaders/op_diagmask.comp b/ggml/src/kompute-shaders/op_diagmask.comp similarity index 100% rename from kompute-shaders/op_diagmask.comp rename to ggml/src/kompute-shaders/op_diagmask.comp diff --git a/kompute-shaders/op_gelu.comp b/ggml/src/kompute-shaders/op_gelu.comp similarity index 100% rename from kompute-shaders/op_gelu.comp rename to ggml/src/kompute-shaders/op_gelu.comp diff --git a/kompute-shaders/op_getrows.comp b/ggml/src/kompute-shaders/op_getrows.comp similarity index 100% rename from kompute-shaders/op_getrows.comp rename to ggml/src/kompute-shaders/op_getrows.comp diff --git a/kompute-shaders/op_getrows_f16.comp b/ggml/src/kompute-shaders/op_getrows_f16.comp similarity index 100% rename from kompute-shaders/op_getrows_f16.comp rename to ggml/src/kompute-shaders/op_getrows_f16.comp diff --git a/kompute-shaders/op_getrows_f32.comp b/ggml/src/kompute-shaders/op_getrows_f32.comp similarity index 100% rename from kompute-shaders/op_getrows_f32.comp rename to ggml/src/kompute-shaders/op_getrows_f32.comp diff --git a/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/kompute-shaders/op_getrows_q4_0.comp similarity index 100% rename from kompute-shaders/op_getrows_q4_0.comp rename to ggml/src/kompute-shaders/op_getrows_q4_0.comp diff --git a/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/kompute-shaders/op_getrows_q4_1.comp similarity index 100% rename from kompute-shaders/op_getrows_q4_1.comp rename to ggml/src/kompute-shaders/op_getrows_q4_1.comp diff --git a/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/kompute-shaders/op_getrows_q6_k.comp similarity index 100% rename from kompute-shaders/op_getrows_q6_k.comp rename to ggml/src/kompute-shaders/op_getrows_q6_k.comp diff 
--git a/kompute-shaders/op_mul.comp b/ggml/src/kompute-shaders/op_mul.comp similarity index 100% rename from kompute-shaders/op_mul.comp rename to ggml/src/kompute-shaders/op_mul.comp diff --git a/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/kompute-shaders/op_mul_mat_f16.comp similarity index 100% rename from kompute-shaders/op_mul_mat_f16.comp rename to ggml/src/kompute-shaders/op_mul_mat_f16.comp diff --git a/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp similarity index 100% rename from kompute-shaders/op_mul_mat_mat_f32.comp rename to ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp diff --git a/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_0.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q4_0.comp rename to ggml/src/kompute-shaders/op_mul_mat_q4_0.comp diff --git a/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_1.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q4_1.comp rename to ggml/src/kompute-shaders/op_mul_mat_q4_1.comp diff --git a/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/kompute-shaders/op_mul_mat_q6_k.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q6_k.comp rename to ggml/src/kompute-shaders/op_mul_mat_q6_k.comp diff --git a/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/kompute-shaders/op_mul_mat_q8_0.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q8_0.comp rename to ggml/src/kompute-shaders/op_mul_mat_q8_0.comp diff --git a/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/kompute-shaders/op_mul_mv_q_n.comp similarity index 100% rename from kompute-shaders/op_mul_mv_q_n.comp rename to ggml/src/kompute-shaders/op_mul_mv_q_n.comp diff --git a/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp similarity index 100% rename from kompute-shaders/op_mul_mv_q_n_pre.comp rename to ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp diff 
--git a/kompute-shaders/op_norm.comp b/ggml/src/kompute-shaders/op_norm.comp similarity index 100% rename from kompute-shaders/op_norm.comp rename to ggml/src/kompute-shaders/op_norm.comp diff --git a/kompute-shaders/op_relu.comp b/ggml/src/kompute-shaders/op_relu.comp similarity index 100% rename from kompute-shaders/op_relu.comp rename to ggml/src/kompute-shaders/op_relu.comp diff --git a/kompute-shaders/op_rmsnorm.comp b/ggml/src/kompute-shaders/op_rmsnorm.comp similarity index 100% rename from kompute-shaders/op_rmsnorm.comp rename to ggml/src/kompute-shaders/op_rmsnorm.comp diff --git a/kompute-shaders/op_rope_f16.comp b/ggml/src/kompute-shaders/op_rope_f16.comp similarity index 100% rename from kompute-shaders/op_rope_f16.comp rename to ggml/src/kompute-shaders/op_rope_f16.comp diff --git a/kompute-shaders/op_rope_f32.comp b/ggml/src/kompute-shaders/op_rope_f32.comp similarity index 100% rename from kompute-shaders/op_rope_f32.comp rename to ggml/src/kompute-shaders/op_rope_f32.comp diff --git a/kompute-shaders/op_scale.comp b/ggml/src/kompute-shaders/op_scale.comp similarity index 100% rename from kompute-shaders/op_scale.comp rename to ggml/src/kompute-shaders/op_scale.comp diff --git a/kompute-shaders/op_scale_8.comp b/ggml/src/kompute-shaders/op_scale_8.comp similarity index 100% rename from kompute-shaders/op_scale_8.comp rename to ggml/src/kompute-shaders/op_scale_8.comp diff --git a/kompute-shaders/op_silu.comp b/ggml/src/kompute-shaders/op_silu.comp similarity index 100% rename from kompute-shaders/op_silu.comp rename to ggml/src/kompute-shaders/op_silu.comp diff --git a/kompute-shaders/op_softmax.comp b/ggml/src/kompute-shaders/op_softmax.comp similarity index 100% rename from kompute-shaders/op_softmax.comp rename to ggml/src/kompute-shaders/op_softmax.comp diff --git a/kompute-shaders/rope_common.comp b/ggml/src/kompute-shaders/rope_common.comp similarity index 100% rename from kompute-shaders/rope_common.comp rename to 
ggml/src/kompute-shaders/rope_common.comp diff --git a/sgemm.cpp b/ggml/src/sgemm.cpp similarity index 100% rename from sgemm.cpp rename to ggml/src/sgemm.cpp diff --git a/sgemm.h b/ggml/src/sgemm.h similarity index 100% rename from sgemm.h rename to ggml/src/sgemm.h diff --git a/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp similarity index 100% rename from vulkan-shaders/add.comp rename to ggml/src/vulkan-shaders/add.comp diff --git a/vulkan-shaders/argsort.comp b/ggml/src/vulkan-shaders/argsort.comp similarity index 100% rename from vulkan-shaders/argsort.comp rename to ggml/src/vulkan-shaders/argsort.comp diff --git a/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp similarity index 100% rename from vulkan-shaders/clamp.comp rename to ggml/src/vulkan-shaders/clamp.comp diff --git a/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp similarity index 100% rename from vulkan-shaders/copy.comp rename to ggml/src/vulkan-shaders/copy.comp diff --git a/vulkan-shaders/dequant_f32.comp b/ggml/src/vulkan-shaders/dequant_f32.comp similarity index 100% rename from vulkan-shaders/dequant_f32.comp rename to ggml/src/vulkan-shaders/dequant_f32.comp diff --git a/vulkan-shaders/dequant_funcs.comp b/ggml/src/vulkan-shaders/dequant_funcs.comp similarity index 100% rename from vulkan-shaders/dequant_funcs.comp rename to ggml/src/vulkan-shaders/dequant_funcs.comp diff --git a/vulkan-shaders/dequant_head.comp b/ggml/src/vulkan-shaders/dequant_head.comp similarity index 100% rename from vulkan-shaders/dequant_head.comp rename to ggml/src/vulkan-shaders/dequant_head.comp diff --git a/vulkan-shaders/dequant_q2_k.comp b/ggml/src/vulkan-shaders/dequant_q2_k.comp similarity index 100% rename from vulkan-shaders/dequant_q2_k.comp rename to ggml/src/vulkan-shaders/dequant_q2_k.comp diff --git a/vulkan-shaders/dequant_q3_k.comp b/ggml/src/vulkan-shaders/dequant_q3_k.comp similarity index 100% rename from vulkan-shaders/dequant_q3_k.comp rename to 
ggml/src/vulkan-shaders/dequant_q3_k.comp diff --git a/vulkan-shaders/dequant_q4_0.comp b/ggml/src/vulkan-shaders/dequant_q4_0.comp similarity index 100% rename from vulkan-shaders/dequant_q4_0.comp rename to ggml/src/vulkan-shaders/dequant_q4_0.comp diff --git a/vulkan-shaders/dequant_q4_1.comp b/ggml/src/vulkan-shaders/dequant_q4_1.comp similarity index 100% rename from vulkan-shaders/dequant_q4_1.comp rename to ggml/src/vulkan-shaders/dequant_q4_1.comp diff --git a/vulkan-shaders/dequant_q4_k.comp b/ggml/src/vulkan-shaders/dequant_q4_k.comp similarity index 100% rename from vulkan-shaders/dequant_q4_k.comp rename to ggml/src/vulkan-shaders/dequant_q4_k.comp diff --git a/vulkan-shaders/dequant_q5_0.comp b/ggml/src/vulkan-shaders/dequant_q5_0.comp similarity index 100% rename from vulkan-shaders/dequant_q5_0.comp rename to ggml/src/vulkan-shaders/dequant_q5_0.comp diff --git a/vulkan-shaders/dequant_q5_1.comp b/ggml/src/vulkan-shaders/dequant_q5_1.comp similarity index 100% rename from vulkan-shaders/dequant_q5_1.comp rename to ggml/src/vulkan-shaders/dequant_q5_1.comp diff --git a/vulkan-shaders/dequant_q5_k.comp b/ggml/src/vulkan-shaders/dequant_q5_k.comp similarity index 100% rename from vulkan-shaders/dequant_q5_k.comp rename to ggml/src/vulkan-shaders/dequant_q5_k.comp diff --git a/vulkan-shaders/dequant_q6_k.comp b/ggml/src/vulkan-shaders/dequant_q6_k.comp similarity index 100% rename from vulkan-shaders/dequant_q6_k.comp rename to ggml/src/vulkan-shaders/dequant_q6_k.comp diff --git a/vulkan-shaders/dequant_q8_0.comp b/ggml/src/vulkan-shaders/dequant_q8_0.comp similarity index 100% rename from vulkan-shaders/dequant_q8_0.comp rename to ggml/src/vulkan-shaders/dequant_q8_0.comp diff --git a/vulkan-shaders/diag_mask_inf.comp b/ggml/src/vulkan-shaders/diag_mask_inf.comp similarity index 100% rename from vulkan-shaders/diag_mask_inf.comp rename to ggml/src/vulkan-shaders/diag_mask_inf.comp diff --git a/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp 
similarity index 100% rename from vulkan-shaders/div.comp rename to ggml/src/vulkan-shaders/div.comp diff --git a/vulkan-shaders/gelu.comp b/ggml/src/vulkan-shaders/gelu.comp similarity index 100% rename from vulkan-shaders/gelu.comp rename to ggml/src/vulkan-shaders/gelu.comp diff --git a/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp similarity index 100% rename from vulkan-shaders/generic_binary_head.comp rename to ggml/src/vulkan-shaders/generic_binary_head.comp diff --git a/vulkan-shaders/generic_head.comp b/ggml/src/vulkan-shaders/generic_head.comp similarity index 100% rename from vulkan-shaders/generic_head.comp rename to ggml/src/vulkan-shaders/generic_head.comp diff --git a/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp similarity index 100% rename from vulkan-shaders/generic_unary_head.comp rename to ggml/src/vulkan-shaders/generic_unary_head.comp diff --git a/vulkan-shaders/get_rows.comp b/ggml/src/vulkan-shaders/get_rows.comp similarity index 100% rename from vulkan-shaders/get_rows.comp rename to ggml/src/vulkan-shaders/get_rows.comp diff --git a/vulkan-shaders/get_rows_quant.comp b/ggml/src/vulkan-shaders/get_rows_quant.comp similarity index 100% rename from vulkan-shaders/get_rows_quant.comp rename to ggml/src/vulkan-shaders/get_rows_quant.comp diff --git a/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp similarity index 100% rename from vulkan-shaders/mul.comp rename to ggml/src/vulkan-shaders/mul.comp diff --git a/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp similarity index 100% rename from vulkan-shaders/mul_mat_split_k_reduce.comp rename to ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp diff --git a/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec.comp rename to ggml/src/vulkan-shaders/mul_mat_vec.comp diff 
--git a/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_base.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_base.comp diff --git a/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_nc.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_nc.comp diff --git a/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_p021.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_p021.comp diff --git a/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q2_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q3_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q4_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q5_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q6_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp diff --git a/vulkan-shaders/mul_mm.comp b/ggml/src/vulkan-shaders/mul_mm.comp similarity index 100% rename from vulkan-shaders/mul_mm.comp rename to ggml/src/vulkan-shaders/mul_mm.comp diff --git 
a/vulkan-shaders/norm.comp b/ggml/src/vulkan-shaders/norm.comp similarity index 100% rename from vulkan-shaders/norm.comp rename to ggml/src/vulkan-shaders/norm.comp diff --git a/vulkan-shaders/relu.comp b/ggml/src/vulkan-shaders/relu.comp similarity index 100% rename from vulkan-shaders/relu.comp rename to ggml/src/vulkan-shaders/relu.comp diff --git a/vulkan-shaders/rms_norm.comp b/ggml/src/vulkan-shaders/rms_norm.comp similarity index 100% rename from vulkan-shaders/rms_norm.comp rename to ggml/src/vulkan-shaders/rms_norm.comp diff --git a/vulkan-shaders/rope_head.comp b/ggml/src/vulkan-shaders/rope_head.comp similarity index 100% rename from vulkan-shaders/rope_head.comp rename to ggml/src/vulkan-shaders/rope_head.comp diff --git a/vulkan-shaders/rope_neox.comp b/ggml/src/vulkan-shaders/rope_neox.comp similarity index 100% rename from vulkan-shaders/rope_neox.comp rename to ggml/src/vulkan-shaders/rope_neox.comp diff --git a/vulkan-shaders/rope_norm.comp b/ggml/src/vulkan-shaders/rope_norm.comp similarity index 100% rename from vulkan-shaders/rope_norm.comp rename to ggml/src/vulkan-shaders/rope_norm.comp diff --git a/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp similarity index 100% rename from vulkan-shaders/scale.comp rename to ggml/src/vulkan-shaders/scale.comp diff --git a/vulkan-shaders/silu.comp b/ggml/src/vulkan-shaders/silu.comp similarity index 100% rename from vulkan-shaders/silu.comp rename to ggml/src/vulkan-shaders/silu.comp diff --git a/vulkan-shaders/soft_max.comp b/ggml/src/vulkan-shaders/soft_max.comp similarity index 100% rename from vulkan-shaders/soft_max.comp rename to ggml/src/vulkan-shaders/soft_max.comp diff --git a/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp similarity index 100% rename from vulkan-shaders/square.comp rename to ggml/src/vulkan-shaders/square.comp diff --git a/vulkan-shaders/sum_rows.comp b/ggml/src/vulkan-shaders/sum_rows.comp similarity index 100% rename from 
vulkan-shaders/sum_rows.comp rename to ggml/src/vulkan-shaders/sum_rows.comp diff --git a/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp similarity index 100% rename from vulkan-shaders/types.comp rename to ggml/src/vulkan-shaders/types.comp diff --git a/llama.h b/include/llama.h similarity index 100% rename from llama.h rename to include/llama.h diff --git a/scripts/build-info.sh b/scripts/build-info.sh index 32682afbd..fa9e7bacd 100755 --- a/scripts/build-info.sh +++ b/scripts/build-info.sh @@ -8,20 +8,20 @@ build_compiler="unknown" build_target="unknown" if out=$(git rev-list --count HEAD); then - # git is broken on WSL so we need to strip extra newlines - build_number=$(printf '%s' "$out" | tr -d '\n') + # git is broken on WSL so we need to strip extra newlines + build_number=$(printf '%s' "$out" | tr -d '\n') fi if out=$(git rev-parse --short HEAD); then - build_commit=$(printf '%s' "$out" | tr -d '\n') + build_commit=$(printf '%s' "$out" | tr -d '\n') fi if out=$($CC --version | head -1); then - build_compiler=$out + build_compiler=$out fi if out=$($CC -dumpmachine); then - build_target=$out + build_target=$out fi echo "int LLAMA_BUILD_NUMBER = ${build_number};" diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index a45cd3962..70679f4e5 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -12,7 +12,7 @@ bench_args="${@:3}" rm -f llama-bench.sqlite > /dev/null -# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...) +# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) 
git checkout $1 > /dev/null make clean > /dev/null diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 7b2b601a9..91946c514 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" ########################################################### # Note: test-eval-callback requires -DLLAMA_CURL -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" pushd "$build_dir" make -j || abort "Failed to compile" popd > /dev/null || exit 1 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 6ba499a2a..586d6ea18 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -LLAMA_CUDA=1 make -j +GGML_CUDA=1 make -j ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b @@ -60,7 +60,7 @@ cd /workspace/llama.cpp mkdir build-cublas cd build-cublas -cmake -DLLAMA_CUDA=1 ../ +cmake -DGGML_CUDA=1 ../ make -j if [ "$1" -eq "0" ]; then @@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then # batched cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./llama-parallel -m 
./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 199232440..802592a3e 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -380,7 +380,7 @@ fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" - LLAMA_CUDA=1 make -j llama-server $log + GGML_CUDA=1 make -j llama-server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" make -j llama-server $log diff --git a/scripts/sync-ggml-am.sh 
b/scripts/sync-ggml-am.sh index 9e34dc8b9..9e654180b 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -53,7 +53,9 @@ while read c; do fi git format-patch -k $c~1..$c --stdout -- \ - include/ggml/ggml*.h \ + CMakeLists.txt \ + src/CMakeLists.txt \ + cmake/FindSIMD.cmake \ src/ggml*.h \ src/ggml*.c \ src/ggml*.cpp \ @@ -61,6 +63,7 @@ while read c; do src/ggml*.metal \ src/ggml*.cu \ src/ggml-cuda/* \ + include/ggml*.h \ tests/test-opt.cpp \ tests/test-grad0.cpp \ tests/test-quantize-fns.cpp \ @@ -93,30 +96,36 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # replace filenames: # - # src/ggml.c -> ggml.c - # src/ggml-alloc.c -> ggml-alloc.c - # src/ggml-backend-impl.h -> ggml-backend-impl.h - # src/ggml-backend.c -> ggml-backend.c - # src/ggml-common.h -> ggml-common.h - # src/ggml-cuda/* -> ggml-cuda/ - # src/ggml-cuda.cu -> ggml-cuda.cu - # src/ggml-cuda.h -> ggml-cuda.h - # src/ggml-impl.h -> ggml-impl.h - # src/ggml-kompute.cpp -> ggml-kompute.cpp - # src/ggml-kompute.h -> ggml-kompute.h - # src/ggml-metal.h -> ggml-metal.h - # src/ggml-metal.m -> ggml-metal.m - # src/ggml-quants.c -> ggml-quants.c - # src/ggml-quants.h -> ggml-quants.h - # src/ggml-rpc.cpp -> ggml-rpc.cpp - # src/ggml-rpc.h -> ggml-rpc.h - # src/ggml-sycl.cpp -> ggml-sycl.cpp - # src/ggml-sycl.h -> ggml-sycl.h - # src/ggml-vulkan.cpp -> ggml-vulkan.cpp - # src/ggml-vulkan.h -> ggml-vulkan.h - # include/ggml/ggml.h -> ggml.h - # include/ggml/ggml-alloc.h -> ggml-alloc.h - # include/ggml/ggml-backend.h -> ggml-backend.h + # CMakelists.txt -> ggml/CMakeLists.txt + # src/CMakeLists.txt -> ggml/src/CMakeLists.txt + # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake + # + # src/ggml.c -> ggml/src/ggml.c + # src/ggml-alloc.c -> ggml/src/ggml-alloc.c + # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h + # src/ggml-backend.c -> ggml/src/ggml-backend.c + # src/ggml-common.h -> ggml/src/ggml-common.h + # src/ggml-cuda/* -> ggml/src/ggml-cuda/ + # src/ggml-cuda.cu -> 
ggml/src/ggml-cuda.cu + # src/ggml-impl.h -> ggml/src/ggml-impl.h + # src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp + # src/ggml-metal.m -> ggml/src/ggml-metal.m + # src/ggml-quants.c -> ggml/src/ggml-quants.c + # src/ggml-quants.h -> ggml/src/ggml-quants.h + # src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp + # src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp + # src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp + # + # include/ggml.h -> ggml/include/ggml.h + # include/ggml-alloc.h -> ggml/include/ggml-alloc.h + # include/ggml-backend.h -> ggml/include/ggml-backend.h + # include/ggml-blas.h -> ggml/include/ggml-blas.h + # include/ggml-cuda.h -> ggml/include/ggml-cuda.h + # include/ggml-kompute.h -> ggml/include/ggml-kompute.h + # include/ggml-metal.h -> ggml/include/ggml-metal.h + # include/ggml-rpc.h -> ggml/include/ggml-rpc.h + # include/ggml-sycl.h -> ggml/include/ggml-sycl.h + # include/ggml-vulkan.h -> ggml/include/ggml-vulkan.h # # tests/test-opt.cpp -> tests/test-opt.cpp # tests/test-grad0.cpp -> tests/test-grad0.cpp @@ -124,34 +133,38 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp # tests/test-backend-ops.cpp -> tests/test-backend-ops.cpp # - # LICENSE -> LICENSE - # scripts/gen-authors.sh -> scripts/gen-authors.sh + # LICENSE -> LICENSE + # scripts/gen-authors.sh -> scripts/gen-authors.sh cat ggml-src.patch | sed \ - -e 's/src\/ggml\.c/ggml.c/g' \ - -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \ - -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \ - -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \ - -e 's/src\/ggml-common\.h/ggml-common.h/g' \ + -e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \ + -e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \ + -e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \ + -e 's/src\/ggml\.c/ggml/src/ggml.c/g' \ + -e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \ + -e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \ + -e 
's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \ + -e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \ -e 's/src\/ggml-cuda\//ggml-cuda\//g' \ - -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ - -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ - -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ - -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \ - -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \ - -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \ - -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ - -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \ - -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \ - -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \ - -e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \ - -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \ - -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \ - -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \ - -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \ - -e 's/include\/ggml\/ggml\.h/ggml.h/g' \ - -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \ - -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \ + -e 's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \ + -e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \ + -e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \ + -e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \ + -e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \ + -e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \ + -e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \ + -e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \ + -e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \ + -e 's/include\/ggml\.h/ggml/include/ggml.h/g' \ + -e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \ + -e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \ + -e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \ + -e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \ + -e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \ + -e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \ + -e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \ + -e 
's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \ + -e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \ -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \ -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \ -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 4843f8a4a..2f32c1ce8 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -1,34 +1,42 @@ #!/bin/bash -cp -rpv ../ggml/src/ggml.c ./ggml.c -cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c -cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h -cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c -cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h -cp -rpv ../ggml/src/ggml-cuda/* ./ggml-cuda/ -cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu -cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h -cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h -cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp -cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h -cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h -cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m -cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal -cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c -cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h -cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp -cp -rpv ../ggml/src/ggml-rpc.h ./ggml-rpc.h -cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp -cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h -cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp -cp -rpv ../ggml/src/ggml-vulkan.h ./ggml-vulkan.h -cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h -cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h -cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h +cp -rpv ../ggml/CMakeLists.txt ./ggml/CMakeLists.txt +cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt +cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake -cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp -cp -rpv 
../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp -cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp +cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c +cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c +cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h +cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c +cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h +cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ +cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu +cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h +cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp +cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m +cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal +cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c +cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h +cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp +cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp +cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp -cp -rpv ../LICENSE ./LICENSE -cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh +cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h +cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h +cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h +cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h +cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h +cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h +cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h +cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h +cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h +cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h + +cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp +cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp +cp -rpv ../ggml/tests/test-quantize-fns.cpp 
./tests/test-quantize-fns.cpp +cp -rpv ../ggml/tests/test-quantize-perf.cpp ./tests/test-quantize-perf.cpp +cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp + +cp -rpv ../LICENSE ./LICENSE +cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h index a49d385a1..0361ffc38 120000 --- a/spm-headers/ggml-alloc.h +++ b/spm-headers/ggml-alloc.h @@ -1 +1 @@ -../ggml-alloc.h \ No newline at end of file +../ggml/include/ggml-alloc.h \ No newline at end of file diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h index 17c2cf14f..7295f0f0d 120000 --- a/spm-headers/ggml-backend.h +++ b/spm-headers/ggml-backend.h @@ -1 +1 @@ -../ggml-backend.h \ No newline at end of file +../ggml/include/ggml-backend.h \ No newline at end of file diff --git a/spm-headers/ggml-metal.h b/spm-headers/ggml-metal.h new file mode 120000 index 000000000..aefad5fa0 --- /dev/null +++ b/spm-headers/ggml-metal.h @@ -0,0 +1 @@ +../ggml/include/ggml-metal.h \ No newline at end of file diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h index 39215298f..0bdfeacbd 120000 --- a/spm-headers/ggml.h +++ b/spm-headers/ggml.h @@ -1 +1 @@ -../ggml.h \ No newline at end of file +../ggml/include/ggml.h \ No newline at end of file diff --git a/spm-headers/llama.h b/spm-headers/llama.h index 9acceb980..b31388f0d 120000 --- a/spm-headers/llama.h +++ b/spm-headers/llama.h @@ -1 +1 @@ -../llama.h \ No newline at end of file +../include/llama.h \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..ccb607e56 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,32 @@ +# TODO: should not use this +if (WIN32) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + + if (BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() +endif() + +# +# libraries +# + +# llama + +add_library(llama + ../include/llama.h + llama.cpp + unicode.h + 
unicode.cpp + unicode-data.cpp + ) + +target_include_directories(llama PUBLIC . ../include) +target_compile_features (llama PUBLIC cxx_std_11) # don't bump + +target_link_libraries(llama PUBLIC ggml) + +if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) +endif() diff --git a/llama.cpp b/src/llama.cpp similarity index 100% rename from llama.cpp rename to src/llama.cpp diff --git a/unicode-data.cpp b/src/unicode-data.cpp similarity index 100% rename from unicode-data.cpp rename to src/unicode-data.cpp diff --git a/unicode-data.h b/src/unicode-data.h similarity index 100% rename from unicode-data.h rename to src/unicode-data.h diff --git a/unicode.cpp b/src/unicode.cpp similarity index 100% rename from unicode.cpp rename to src/unicode.cpp diff --git a/unicode.h b/src/unicode.h similarity index 100% rename from unicode.h rename to src/unicode.h diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 1ed74e543..f74c0db47 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include #include From a95631ee97bb24861af6bdeec380270459631e8e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:26:13 +0300 Subject: [PATCH 18/50] readme : update API notes --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6ca5ba43e..99b16f6e2 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` 
https://github.com/ggerganov/llama.cpp/pull/6341 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 From 0e814dfc42b4b57ad19598d239557b6a977ca16c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:32:07 +0300 Subject: [PATCH 19/50] devops : remove clblast + LLAMA_CUDA -> GGML_CUDA (#8139) ggml-ci --- .devops/full-cuda.Dockerfile | 2 +- .devops/full-rocm.Dockerfile | 2 +- .devops/llama-cli-cuda.Dockerfile | 2 +- .devops/llama-cli-intel.Dockerfile | 10 +-- .devops/llama-cli-rocm.Dockerfile | 2 +- .devops/llama-cli-vulkan.Dockerfile | 2 +- .devops/llama-cpp-clblast.srpm.spec | 84 -------------------------- .devops/llama-cpp-cuda.srpm.spec | 2 +- .devops/llama-server-cuda.Dockerfile | 2 +- .devops/llama-server-intel.Dockerfile | 10 +-- .devops/llama-server-rocm.Dockerfile | 2 +- .devops/llama-server-vulkan.Dockerfile | 2 +- 12 files changed, 19 insertions(+), 103 deletions(-) delete mode 100644 .devops/llama-cpp-clblast.srpm.spec diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index f6073f662..2a7da586a 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -27,7 +27,7 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA -ENV LLAMA_CUDA=1 +ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 0314d469b..5cbd2e7a1 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -36,7 +36,7 @@ COPY . . # Set nvcc architecture ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm -ENV LLAMA_HIPBLAS=1 +ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile index d5ce538f6..bff946cbc 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -21,7 +21,7 @@ COPY . . 
# Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA -ENV LLAMA_CUDA=1 +ENV GGML_CUDA=1 RUN make -j$(nproc) llama-cli diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index 6789e17af..bd816f9f5 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build -ARG LLAMA_SYCL_F16=OFF +ARG GGML_SYCL_F16=OFF RUN apt-get update && \ apt-get install -y git @@ -10,11 +10,11 @@ WORKDIR /app COPY . . -RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ - echo "LLAMA_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-cli FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index 7e8a6f0fa..caa507b08 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -36,7 +36,7 @@ COPY . . # Set nvcc architecture ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm -ENV LLAMA_HIPBLAS=1 +ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile index 7a0abe71f..6155d5881 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . 
-RUN cmake -B build -DLLAMA_VULKAN=1 && \ +RUN cmake -B build -DGGML_VULKAN=1 && \ cmake --build build --config Release --target llama-cli # Clean up diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec deleted file mode 100644 index 013952191..000000000 --- a/.devops/llama-cpp-clblast.srpm.spec +++ /dev/null @@ -1,84 +0,0 @@ -# SRPM for building from source and packaging an RPM for RPM-based distros. -# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages -# Built and maintained by John Boero - boeroboy@gmail.com -# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal - -# Notes for llama.cpp: -# 1. Tags are currently based on hash - which will not sort asciibetically. -# We need to declare standard versioning if people want to sort latest releases. -# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies. -# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed. -# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo -# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. -# It is up to the user to install the correct vendor-specific support. - -Name: llama.cpp-clblast -Version: %( date "+%%Y%%m%%d" ) -Release: 1%{?dist} -Summary: OpenCL Inference of LLaMA model in C/C++ -License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz -BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel -Requires: clblast -URL: https://github.com/ggerganov/llama.cpp - -%define debug_package %{nil} -%define source_date_epoch_from_changelog 0 - -%description -CPU inference for Meta's Lllama2 models using default options. 
- -%prep -%setup -n llama.cpp-master - -%build -make -j LLAMA_CLBLAST=1 - -%install -mkdir -p %{buildroot}%{_bindir}/ -cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli -cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server -cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple - -mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamaclblast.service -[Unit] -Description=Llama.cpp server, CPU only (no GPU support in this build). -After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target - -[Service] -Type=simple -EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS -ExecReload=/bin/kill -s HUP $MAINPID -Restart=never - -[Install] -WantedBy=default.target -EOF - -mkdir -p %{buildroot}/etc/sysconfig -%{__cat} < %{buildroot}/etc/sysconfig/llama -LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" -EOF - -%clean -rm -rf %{buildroot} -rm -rf %{_builddir}/* - -%files -%{_bindir}/llama-clblast-cli -%{_bindir}/llama-clblast-server -%{_bindir}/llama-clblast-simple -/usr/lib/systemd/system/llamaclblast.service -%config /etc/sysconfig/llama - - -%pre - -%post - -%preun -%postun - -%changelog diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index cbdf43626..7425d3a9d 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options. %setup -n llama.cpp-master %build -make -j LLAMA_CUDA=1 +make -j GGML_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index 7bef07a05..d7eaa0925 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -21,7 +21,7 @@ COPY . . 
# Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA -ENV LLAMA_CUDA=1 +ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index 3bf1670ec..8f8fef8c0 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build -ARG LLAMA_SYCL_F16=OFF +ARG GGML_SYCL_F16=OFF RUN apt-get update && \ apt-get install -y git libcurl4-openssl-dev @@ -10,11 +10,11 @@ WORKDIR /app COPY . . -RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ - echo "LLAMA_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index 4b1cdc320..af96c3325 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -36,7 +36,7 @@ COPY . . 
# Set nvcc architecture ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm -ENV LLAMA_HIPBLAS=1 +ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 2bc2e45d3..49062f84b 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . -RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \ +RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ cmake --build build --config Release --target llama-server # Clean up From 4713bf3093d58a3e12368ab2ab5fc3630f27803e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:36:44 +0300 Subject: [PATCH 20/50] authors : regen --- AUTHORS | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index b029f13da..1bd36158a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,8 +1,9 @@ -# date: Tue Apr 9 09:17:14 EEST 2024 +# date: Wed Jun 26 19:36:34 EEST 2024 # this file is auto-generated by scripts/gen-authors.sh 0cc4m 0xspringtime <110655352+0xspringtime@users.noreply.github.com> +20kdc 2f38b454 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 44670 <44670@users.noreply.github.com> @@ -11,14 +12,18 @@ AT Aarni Koskela Aaron Miller Aaryaman Vasishta +Abheek Gulati Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com> Adithya Balaji AdithyanI Adrian Adrian Hesketh +Ahmet Zeer AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Aisuko +Akarshan Biswas +Albert Jin Alberto <57916483+albbus-stack@users.noreply.github.com> Alex Alex Azarov @@ -35,19 +40,24 @@ Ali Nehzat Ali Tariq Alon AlpinDale <52078762+AlpinDale@users.noreply.github.com> +Amir AmirAli 
Mirian <37371367+amiralimi@users.noreply.github.com> Ananta Bastola Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> András Salamon Andrei Andrew Canis +Andrew Downing Andrew Duffy Andrew Godfrey +Andy Tai Arik Poznanski Artem +Artem Zinnatullin Artyom Lebedev Asbjørn Olling Ásgeir Bjarni Ingvarsson +Ashish <1856117+ashishdatta@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashraful Islam Atsushi Tatsuma @@ -57,35 +67,46 @@ BADR Bach Le Bailey Chittle <39804642+bachittle@users.noreply.github.com> BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> +Bartowski Behnam M <58621210+ibehnam@users.noreply.github.com> +Ben Ashbaugh Ben Garney Ben Siraphob Ben Williams +Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Bernat Vadell +Bingan <70050083+binganao@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov Branden Butler Brian Bruce MacDonald +Bryan Honof CJ Pais CRD716 +Calvin Laurenson Cameron Cameron Kaiser +Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> Casey Primozic Casey Primozic CausalLM <148736309+CausalLM@users.noreply.github.com> Cebtenzzre Chad Brewbaker +Chao Jiang Cheng Shao +Chris Elrod Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Kögler +Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> Clint Herron +CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> Cuong Trinh Manh DAN™ Damian Stewart @@ -95,8 +116,12 @@ Daniel Bevenius Daniel Drake Daniel Hiltgen Daniel Illescas Romero +Daniele <57776841+daniandtheweb@users.noreply.github.com> DannyDaemonic Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> +Dave +Dave Airlie +Dave Airlie Dave Della Costa David Friehs David Kennedy @@ -104,10 +129,13 @@ David Pflug David 
Renshaw David Sommers <12738+databyte@users.noreply.github.com> David Yang +Dawid Potocki Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dean Deins +Deven Mistry <31466137+deven367@users.noreply.github.com> Didzis Gosko +Djip007 Don Mahurin DooWoong Lee (David) Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> @@ -116,8 +144,11 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Ebey Abraham Ed Lee Ed Lepedus +Eddie-Wang Edward Taylor +Elaine Elbios <141279586+Elbios@users.noreply.github.com> +Elton Kola Engininja2 <139037756+Engininja2@users.noreply.github.com> Equim Eric Sommerlade @@ -143,37 +174,47 @@ Firat Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com> +Frank Mai FrankHB +Fred Douglas <43351173+fredlas@users.noreply.github.com> Frederik Vogel Gabe Goodhart GainLee Galunid Gary Linscott Gary Mulder +Gavin Zhao Genkagaku.GPT Georgi Gerganov Gilad S +Giuseppe Scrivano GiviMAD Govlzkoy Guillaume "Vermeille" Sanchez Guillaume Wenzek Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> +Haggai Nuchi Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> +Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com> +HanishKVC Haohui Mai Haoxiang Fei Harald Fernengel Hatsune Miku <129688334+at8u@users.noreply.github.com> +HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com> Henk Poley Henri Vasserman Henrik Forstén Herman Semenov Hesen Peng Hoang Nguyen +Hong Bo PENG Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Howard Su Hua Jiang Huawei Lin +Hugo Roussel Ian Bull Ian Bull Ian Scrivener @@ -190,8 +231,10 @@ Ivan Stepanov JH23X <165871467+JH23X@users.noreply.github.com> Jack Mousseau JackJollimore <130917767+JackJollimore@users.noreply.github.com> +Jaemin Son Jag Chadha Jakub N 
+James A Capozzoli <157492257+jac-jim@users.noreply.github.com> James Reynolds Jan Boon Jan Boon @@ -205,12 +248,17 @@ Jean-Michaël Celerier Jed Fox Jeffrey Quesnelle Jesse Jojo Johnson +Jeximo Jhen-Jie Hong Jiahao Li Jian Liao JidongZhang-THU <1119708529@qq.com> Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com> Jiří Podivín <66251151+jpodivin@users.noreply.github.com> +Jiří Sejkora +Joan Fontanals +Joan Fontanals +Johan Johannes Gäßler Johannes Rudolph John <78893154+cmp-nct@users.noreply.github.com> @@ -221,15 +269,19 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com> Jorge A <161275481+jorgealias@users.noreply.github.com> Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com> Joseph Stahl <1269177+josephst@users.noreply.github.com> +Josh Ramer Joyce Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Judd Julius Arkenberg Jun Jie <71215065+junnjiee16@users.noreply.github.com> +Junyang Lin Juraj Bednar Justin Parker Justin Suess +Justina Cho Justine Tunney +Justine Tunney Juuso Alasuutari KASR Kamil Tomšík @@ -242,6 +294,7 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com> Keiichi Tabata Kenvix ⭐ Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Kevin Gibbons Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Kwok Kevin Lo @@ -257,6 +310,7 @@ Laura Lee <44310445+lx200916@users.noreply.github.com> Lee Drake Leng Yue +Leon Knauer LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> Leonardo Neumann Li Tan @@ -265,20 +319,26 @@ LoganDark LostRuins <39025047+LostRuins@users.noreply.github.com> Luciano Luo Tian +Lyle Dean M. 
Yusuf Sarıgöz Maarten ter Huurne Mack Straight Maël Kerbiriou MaggotHATE +Manuel <44313466+makuche@users.noreply.github.com> Marc Köhlbrugge Marco Matthies <71844+marcom@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marian Cepok Mark Fairbairn Marko Tasic +Markus Tavenrath +Martin Delille Martin Krasser Martin Schwaighofer Marvin Gießing +Masaya, Kato <62578291+msy-kato@users.noreply.github.com> +MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Mateusz Charytoniuk Matheus C. França Matheus Gabriel Alves Silva @@ -287,8 +347,11 @@ Mathijs de Bruin Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Pulver Matteo Boschini <12133566+mbosc@users.noreply.github.com> +Mattheus Chediak Matthew Tejo Matvey Soloviev +Max Krasnyansky +Max Krasnyansky Maxime <672982+maximegmd@users.noreply.github.com> Maximilian Winter Meng Zhang @@ -300,32 +363,41 @@ Michael Kesper Michael Klimenko Michael Podvitskiy Michael Potter +Michael de Gans Michaël de Vries Mihai Mike +Mikko Juola Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Mirko185 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Mohammadreza Hendiani +Mohammadreza Hendiani Murilo Santana Musab Gultekin Nam D. 
Tran <42194884+namtranase@users.noreply.github.com> +Nathan Epstein NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> Nebula +Neo Zhang <14088817+arthw@users.noreply.github.com> +Neo Zhang Neo Zhang Jianyu Neuman Vong Nexesenex <124105151+Nexesenex@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com> Nicolai Weitkemper +Nicolás Pérez Nigel Bosch Niklas Korz +Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth Oleksandr Nikitin Oleksii Maryshchenko Olivier Chafik Ondřej Čertík Ouadie EL FAROUKI +Patrice Ferlet Paul Tsochantaris Pavol Rusnak Pedro Cuenca @@ -343,9 +415,14 @@ RJ Adriaansen Radoslav Gerganov Radosław Gryta Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> +Raj Hammeer Singh Hada +Ralph Soika Rand Xie Randall Fitzgerald Reinforce-II +Ren Xuancheng +Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> +RhinoDevel Riceball LEE Richard Kiss Richard Roberson @@ -373,6 +450,7 @@ Rowan Hart Rune <43761327+Rune-AI@users.noreply.github.com> Ryan Landay Ryder Wishart +Ryuei Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> SakuraUmi Salvador E. 
Tropea @@ -386,6 +464,7 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com> Sergey Alirzaev Sergio López +Sertaç Özercan <852750+sozercan@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> ShadovvBeast Shakhar Dasgupta @@ -394,6 +473,7 @@ Shijie <821898965@qq.com> Shintarou Okada Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu +Shuichi Tsutsumi Sigbjørn Skjæret Simon Willison Siwen Yu @@ -405,11 +485,14 @@ Someone Someone Serge Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Spencer Sutton +Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com> Srinivas Billa Stefan Sydow +Steffen Röcker Stephan Walter Stephen Nichols Steve Grubb +Steven Prichard Steven Roussey Steward Garcia <57494570+FSSRepo@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> @@ -434,16 +517,19 @@ Tom C Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tomas Tomáš Pazdiora +Tristan Druyen Tristan Ross Tungsten842 <886724vf@anonaddy.me> Tungsten842 Tushar UEXTM.com <84163508+uextm@users.noreply.github.com> +Ulrich Drepper Uzo Nweke Vaibhav Srivastav Val Kharitonov Valentin Konovalov Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> +Victor Nogueira Victor Z. 
Peng Vlad Vladimir @@ -455,7 +541,9 @@ Weird Constructor Welby Seely Wentai Zhang WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> +William Tambellini Willy Tarreau +Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com> Wu Jian Ping Wu Jian Ping Xiake Sun @@ -466,6 +554,8 @@ Xiaoyi Chen Xingchen Song(宋星辰) Xuan Son Nguyen Yann Follet <131855179+YannFollet@users.noreply.github.com> +Yaroslav +Yazan Agha-Schrader Yiming Cui Yishuo Wang Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> @@ -477,6 +567,7 @@ Zane Shannon Zay <95888118+isaiahbjork@users.noreply.github.com> Zenix Zhang Peiyuan +Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> ZhouYuChen Ziad Ben Hadj-Alouane Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> @@ -484,14 +575,18 @@ Zsapi a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> adel boussaken afrideva <95653597+afrideva@users.noreply.github.com> +agray3 akawrykow <142945436+akawrykow@users.noreply.github.com> alexpinel <93524949+alexpinel@users.noreply.github.com> alonfaraj +alwqx +amd-lalithnc andrijdavid anon998 <131767832+anon998@users.noreply.github.com> anzz1 apaz apcameron <37645737+apcameron@users.noreply.github.com> +arch-btw <57669023+arch-btw@users.noreply.github.com> arcrank arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com> @@ -514,13 +609,17 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> coezbek comex compilade <113953597+compilade@users.noreply.github.com> +compilade +cpumaxx <163466046+cpumaxx@users.noreply.github.com> crasm crasm daboe01 david raistrick +ddh0 ddpasa <112642920+ddpasa@users.noreply.github.com> deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> divinity76 +dm4 dotpy314 <33351922+dotpy314@users.noreply.github.com> drbh ds5t5 <145942675+ds5t5@users.noreply.github.com> @@ -529,6 +628,7 @@ eastriver ebraminio eiery <19350831+eiery@users.noreply.github.com> 
eric8607242 +fairydreaming <166155368+fairydreaming@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> github-actions[bot] gliptic @@ -539,6 +639,7 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> hankcs hoangmit hongbo.mo <352280764@qq.com> +hopkins385 <98618192+hopkins385@users.noreply.github.com> howlger howlger hutli <6594598+hutli@users.noreply.github.com> @@ -549,14 +650,22 @@ hydai iSma iacore <74560659+iacore@users.noreply.github.com> igarnier +intelmatt <61025942+intelmatt@users.noreply.github.com> iohub jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> +jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> +jiez <373447296@qq.com> jneem +joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> johnson442 <56517414+johnson442@users.noreply.github.com> +jojorne jon-chuang <9093549+jon-chuang@users.noreply.github.com> jp-x-g +jukofyork <69222624+jukofyork@users.noreply.github.com> +junchao-loongson <68935141+junchao-loongson@users.noreply.github.com> jwj7140 <32943891+jwj7140@users.noreply.github.com> +k.h.lai kaizau kalomaze <66376113+kalomaze@users.noreply.github.com> kang @@ -575,11 +684,15 @@ ldwang le.chang leejet limitedAtonement +liuwei-git <14815172+liuwei-git@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com> +loonerin <132926317+loonerin@users.noreply.github.com> +luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> makomk manikbhandari +maor-ps <154728172+maor-ps@users.noreply.github.com> mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> minarchist @@ -593,15 +706,19 @@ ngc92 <7938269+ngc92@users.noreply.github.com> nhamanasu <45545786+nhamanasu@users.noreply.github.com> niansa/tuxifan niansa/tuxifan +nickp27 ningshanwutuobang nold nopperl <54780682+nopperl@users.noreply.github.com> nusu-github <29514220+nusu-github@users.noreply.github.com> olexiyb +omahs 
<73983677+omahs@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> +pengxin99 perserk +pmysl postmasters pudepiedj qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> @@ -614,16 +731,19 @@ rhuddleston rimoliga <53384203+rimoliga@users.noreply.github.com> runfuture sandyiscool +sasha0552 semidark sharpHL <132747147+sharpHL@users.noreply.github.com> shibe2 singularity <12184989+singularity-s0@users.noreply.github.com> sjinzh +sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com> slaren snadampal <87143774+snadampal@users.noreply.github.com> staviq stduhpf +strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> swittk takov751 <40316768+takov751@users.noreply.github.com> tarcey @@ -636,12 +756,16 @@ uint256_t uint256_t unbounded valiray <133289098+valiray@users.noreply.github.com> +vik +viric vodkaslime <646329483@qq.com> vvhg1 <94630311+vvhg1@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com> +woachk <24752637+woachk@users.noreply.github.com> wonjun Jang +woodx <124784234+woodx9@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com> xaedes xaedes @@ -649,7 +773,10 @@ xloem <0xloem@gmail.com> yangli2 yuiseki zakkor +zhangkaihuo zhouwg <6889919+zhouwg@users.noreply.github.com> +zhouwg zrm +Ștefan-Gabriel Muscalu 源文雨 <41315874+fumiama@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> From f2d48fffde76d959fdb0da37316bdc09e5518eb1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:39:19 +0300 Subject: [PATCH 21/50] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 
b6c57ec5e..2da33e913 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -5653a195935ea3ac54652644c9daf154dbc1571b +5378ea0d3c2f25bcd330ecb226ad2db454be86d0 From c7ab7b612cbdce04499575e713076a026af4b9c5 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jun 2024 20:20:22 +0200 Subject: [PATCH 22/50] make : fix missing -O3 (#8143) --- Makefile | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 64a6e6ff0..bbfe0f12b 100644 --- a/Makefile +++ b/Makefile @@ -148,12 +148,6 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -MK_CFLAGS += -O3 -MK_CXXFLAGS += -O3 -ifndef LLAMA_DEBUG -MK_NVCCFLAGS += -O3 -endif # LLAMA_DEBUG - # In GNU make default CXX is g++ instead of c++. Let's fix that so that users # of non-gcc compilers don't have to provide g++ alias or wrapper. DEFCC := cc @@ -312,7 +306,10 @@ ifdef LLAMA_DEBUG MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS endif else - MK_CPPFLAGS += -DNDEBUG + MK_CPPFLAGS += -DNDEBUG + MK_CFLAGS += -O3 + MK_CXXFLAGS += -O3 + MK_NVCCFLAGS += -O3 endif ifdef LLAMA_SANITIZE_THREAD From 31ec3993f6e050322a249c07af79dbde66ea6ddc Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jun 2024 21:34:14 +0200 Subject: [PATCH 23/50] ggml : add GGML_CUDA_USE_GRAPHS option, restore GGML_CUDA_FORCE_CUBLAS (cmake) (#8140) --- CMakeLists.txt | 1 + ggml/CMakeLists.txt | 2 ++ ggml/src/CMakeLists.txt | 5 ++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18297834e..7a7197282 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,6 +80,7 @@ set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) set(GGML_LLAMAFILE ON) +set(GGML_CUDA_USE_GRAPHS ON) # transition helpers function (llama_option_depr TYPE OLD NEW) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index f3763f7eb..0d0d52d57 100644 --- a/ggml/CMakeLists.txt +++ 
b/ggml/CMakeLists.txt @@ -109,6 +109,7 @@ option(GGML_LLAMAFILE "ggml: use ggml SGEMM" option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) +option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) @@ -119,6 +120,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) +option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ba341d374..d0f4097d8 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -295,12 +295,15 @@ if (GGML_CUDA) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) - add_compile_definitions(GGML_CUDA_USE_GRAPHS) add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + if (GGML_CUDA_USE_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + endif() + if (GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() From ae5d0f4b899ff2842bfca561370c945ad8d4368b Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jun 2024 21:59:28 +0200 Subject: [PATCH 24/50] ci : publish new docker images 
only when the files change (#8142) --- .github/workflows/build.yml | 4 ++-- .github/workflows/docker.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0d91fc4e4..208515287 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,10 +10,10 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] + paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m'] + paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 01f1a4522..bf94b2024 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -14,6 +14,7 @@ on: push: branches: - master + paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} From c70d117c37cc7876e775d1e2722208a50c52edb3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 23:25:22 +0300 Subject: [PATCH 25/50] scripts : fix filename sync --- scripts/sync-ggml-am.sh | 71 ++++++++++++++++++++--------------------- 1 
file changed, 35 insertions(+), 36 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 9e654180b..b05a33747 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -136,42 +136,41 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # LICENSE -> LICENSE # scripts/gen-authors.sh -> scripts/gen-authors.sh - cat ggml-src.patch | sed \ - -e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \ - -e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \ - -e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \ - -e 's/src\/ggml\.c/ggml/src/ggml.c/g' \ - -e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \ - -e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \ - -e 's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \ - -e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \ - -e 's/src\/ggml-cuda\//ggml-cuda\//g' \ - -e 's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \ - -e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \ - -e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \ - -e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \ - -e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \ - -e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \ - -e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \ - -e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \ - -e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \ - -e 's/include\/ggml\.h/ggml/include/ggml.h/g' \ - -e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \ - -e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \ - -e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \ - -e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \ - -e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \ - -e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \ - -e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \ - -e 's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \ - -e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \ - -e 
's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \ - -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \ - -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \ - -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \ - -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \ - -e 's/LICENSE/LICENSE/g' \ - -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ + cat ggml-src.patch | sed -E \ + -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \ + -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ + -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ + -e 
's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \ + -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ > ggml-src.patch.tmp mv ggml-src.patch.tmp ggml-src.patch From 9b31a40c6ddabe552875b811d7127aa039ca9703 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 27 Jun 2024 01:50:09 +0200 Subject: [PATCH 26/50] clip : suppress unused variable warnings (#8105) * clip : suppress unused variable warnings This commit suppresses unused variable warnings for the variables e in the catch blocks. The motivation for this change is to suppress the warnings that are generated on Windows when using the MSVC compiler. The warnings are not displayed when using GCC because GCC will mark all catch parameters as used. Signed-off-by: Daniel Bevenius * squash! clip : suppress unused variable warnings Remove e (/*e*/) instead instead of using GGML_UNUSED. 
--------- Signed-off-by: Daniel Bevenius --- examples/llava/clip.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 95fbe3d02..d6882eec3 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (n < 32) hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { hparams.image_grid_pinpoints[0]=0; } try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { strcpy(hparams.mm_patch_merge_type, "flat"); } try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { hparams.image_crop_resolution = hparams.image_size; } @@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { try { vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); new_clip->has_class_embedding = true; - } catch (const std::exception& e) { + } catch (const std::exception& /*e*/) { new_clip->has_class_embedding = false; } @@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); new_clip->has_pre_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_pre_norm = false; } @@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", 
"weight")); vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); new_clip->has_post_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_post_norm = false; } try { vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); new_clip->has_patch_bias = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_patch_bias = false; } try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { LOG_TEE("%s: failed to load vision model tensors\n", __func__); } @@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // Yi-type llava vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // missing in Yi-type llava vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); - } catch (std::runtime_error & e) { } + } catch 
(std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); From ac146628e47451c531a3c7e62e6a973a2bb467a0 Mon Sep 17 00:00:00 2001 From: Raj Hammeer Singh Hada Date: Thu, 27 Jun 2024 07:27:57 +0530 Subject: [PATCH 27/50] Fix llama-android.cpp for error - "common/common.h not found" (#8145) - Path seems to be wrong for the common.h header file in llama-android.cpp file. Fixing the path so the Android Build doesn't fail with the error "There is no file common/common.h" --- examples/llama.android/llama/src/main/cpp/llama-android.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 874158ef0..92a6b16b1 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -5,7 +5,7 @@ #include #include #include "llama.h" -#include "common/common.h" +#include "common.h" // Write C++ code here. 
// From 911e35bb8bb2fd1c7d3f40f27e96ff432eae7e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 27 Jun 2024 09:46:41 +0200 Subject: [PATCH 28/50] llama : fix CodeLlama FIM token checks (#8144) * account for space prefix character * use find instead --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f78594a6f..080057332 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5152,10 +5152,10 @@ static void llm_load_vocab( if (gen_name.find("code") != std::string::npos) { if (model.arch == LLM_ARCH_LLAMA && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text == "<PRE>"
-              && vocab.id_to_token[32008].text == "<SUF>"
-              && vocab.id_to_token[32009].text == "<MID>"
-              && vocab.id_to_token[32010].text == "<EOT>") {
+              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;

From f675b20a3b7f878bf3be766b9a737e2c8321ff0d Mon Sep 17 00:00:00 2001
From: kustaaya <58045274+kustaaya@users.noreply.github.com>
Date: Thu, 27 Jun 2024 11:58:54 +0300
Subject: [PATCH 29/50] Added support for Viking pre-tokenizer (#8135)

Co-authored-by: kustaaya 
---
 convert-hf-to-gguf-update.py | 1 +
 convert-hf-to-gguf.py        | 3 +++
 include/llama.h              | 1 +
 src/llama.cpp                | 9 +++++++++
 4 files changed, 14 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 67598b561..2758214fa 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -85,6 +85,7 @@ models = [
     {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
 ]
 
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c26fad930..5bf69ef9f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -487,6 +487,9 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
 
         if res is None:
             logger.warning("\n")
diff --git a/include/llama.h b/include/llama.h
index 88eecb0ed..cafeafb85 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -88,6 +88,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+        LLAMA_VOCAB_PRE_TYPE_VIKING         = 16,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/src/llama.cpp b/src/llama.cpp
index 080057332..b97b5e279 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5067,6 +5067,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "poro-chat") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {

From 85a267daaa1c6f8fd69160445bcb88717031d10c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= 
Date: Thu, 27 Jun 2024 16:26:05 +0200
Subject: [PATCH 30/50] CUDA: fix MMQ stream-k for --split-mode row (#8167)

---
 ggml/src/ggml-cuda/mmq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 31fcbf139..1396e7a75 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const dim3 block_nums_mmq(nsm, 1, 1);
 
-    ggml_cuda_pool & pool = ctx.pool();
+    ggml_cuda_pool & pool = ctx.pool(id);
     ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
 
     if (args.ne01 % mmq_y == 0) {

From 6030c61281c8a7eb94eceb7396a608fac8b71555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= 
Date: Thu, 27 Jun 2024 16:27:41 +0200
Subject: [PATCH 31/50] Add Qwen2MoE 57B-A14B model identifier (#8158)

* Add Qwen2MoE 57B-A14B

* Add Qwen2MoE 57B-A14B
---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index b97b5e279..3dc0f8535 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2038,6 +2038,7 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_57B_A14B,
 };
 
 static const size_t kiB = 1024;
@@ -4267,6 +4268,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B:      return "57B.A14B";
         default:                  return "?B";
     }
 }
@@ -4588,6 +4590,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
+                    case 28: model.type = e_model::MODEL_57B_A14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;

From 387952651a8fc493f8c85ea4c9774bd4a5694f87 Mon Sep 17 00:00:00 2001
From: Raj Hammeer Singh Hada 
Date: Thu, 27 Jun 2024 20:09:29 +0530
Subject: [PATCH 32/50] Delete examples/llama.android/llama/CMakeLists.txt
 (#8165)

* Delete examples/llama.android/llama/CMakeLists.txt

https://github.com/ggerganov/llama.cpp/pull/8145#issuecomment-2194534244

This file is not being used for building on Android. `llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt` is being used instead.

* Update CMakeLists.txt

Pick local llama.cpp files instead of fetching content from git
---
 examples/llama.android/llama/CMakeLists.txt   | 55 -------------------
 .../llama/src/main/cpp/CMakeLists.txt         | 18 +++---
 2 files changed, 11 insertions(+), 62 deletions(-)
 delete mode 100644 examples/llama.android/llama/CMakeLists.txt

diff --git a/examples/llama.android/llama/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt
deleted file mode 100644
index a5618cac0..000000000
--- a/examples/llama.android/llama/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
index 42ebaad49..2de496574 100644
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be

From 97877eb10bd8e7f8023420b5b5300bcbdadd62dc Mon Sep 17 00:00:00 2001
From: jukofyork <69222624+jukofyork@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:48:07 +0100
Subject: [PATCH 33/50] Control vector loading fixes (#8137)

* Fixed leak in llama_control_vector_load_one() and allow llama_control_vector_load() to grow

* refactored `llama_control_vector_load_one()`

* allow multiple directions for same layer in same file

* llama_control_vector_load_one() and llama_control_vector_load() now break on error

* removed unnecessary ggml_free() call
---
 common/common.cpp | 186 +++++++++++++++++++---------------------------
 1 file changed, 76 insertions(+), 110 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c76d0e2c3..70349ad70 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2804,125 +2804,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };
 
-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }
 
-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
 
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
+        int layer_idx = -1;
 
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
             }
         }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
+        }
+
     }
 
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
 
@@ -2933,16 +2895,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector
Date: Thu, 27 Jun 2024 18:37:29 +0300
Subject: [PATCH 34/50] flake.lock: Update (#8071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/e9ee548d90ff586a6471b4ae80ae9cfcbceb3420?narHash=sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY%3D' (2024-06-13)
  → 'github:NixOS/nixpkgs/d603719ec6e294f034936c0d0dc06f689d91b6c3?narHash=sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw%2B0Bwe5DLU%3D' (2024-06-20)

Co-authored-by: github-actions[bot] 
Co-authored-by: Philip Taron 
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 5278fb68a..79bb3f63f 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1718318537,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "lastModified": 1718895438,
+        "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
         "type": "github"
       },
       "original": {

From 16791b8f0b4526aafbf5d0e5bbbd2e99c2253418 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen 
Date: Thu, 27 Jun 2024 18:14:19 +0200
Subject: [PATCH 35/50] Add chatml fallback for cpp `llama_chat_apply_template`
 (#8160)

* add chatml fallback for cpp `llama_chat_apply_template`

* remove redundant code
---
 common/common.cpp | 19 ++++++++++++++++++-
 common/common.h   |  2 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 70349ad70..57d03a578 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
     std::string formatted_chat(buf.data(), res);
diff --git a/common/common.h b/common/common.h
index c541204f6..0486ba380 100644
--- a/common/common.h
+++ b/common/common.h
@@ -380,6 +380,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,

From 8172ee9da9921ca53d698c7438c2d792b3f3f2c8 Mon Sep 17 00:00:00 2001
From: slaren 
Date: Thu, 27 Jun 2024 20:04:39 +0200
Subject: [PATCH 36/50] cmake : fix deprecated option names not working (#8171)

* cmake : fix deprecated option names not working

* remove LLAMA_OPENMP
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a7197282..dba083089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS    ON)
 function (llama_option_depr TYPE OLD NEW)
     if (${OLD})
         message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-        set(${NEW} ON)
+        set(${NEW} ON PARENT_SCOPE)
     endif()
 endfunction()
 
@@ -96,7 +96,6 @@ llama_option_depr(WARNING     LLAMA_KOMPUTE             GGML_KOMPUTE)
 llama_option_depr(WARNING     LLAMA_METAL               GGML_METAL)
 llama_option_depr(WARNING     LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING     LLAMA_NATIVE              GGML_NATIVE)
-llama_option_depr(WARNING     LLAMA_OPENMP              GGML_OPENMP)
 llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
 llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)

From 558f44bf83d78242d4e5c4ab98d0be9125cb9780 Mon Sep 17 00:00:00 2001
From: loonerin <132926317+loonerin@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:01:23 -0400
Subject: [PATCH 37/50] CI: fix release build (Ubuntu+Mac) (#8170)

* CI: fix release build (Ubuntu)

PR #8006 changes defaults to build shared libs. However, CI for releases
expects static builds.

* CI: fix release build (Mac)

---------

Co-authored-by: loonerin 
---
 .github/workflows/build.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 208515287..adf67cecc 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
 
       - name: Test

From cb0b06a8a613f7a2ccb7253b2a3c00fdd397ba1c Mon Sep 17 00:00:00 2001
From: Olivier Chafik 
Date: Thu, 27 Jun 2024 22:08:42 +0100
Subject: [PATCH 38/50] `json`: update grammars/README w/ examples & note about
 additionalProperties (#8132)

* json: update grammars/README

* mention broken prefixItems

* add mention to llama-gbnf-validator

* json: explicit type: object for nested items object in cli example
---
 grammars/README.md | 245 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 235 insertions(+), 10 deletions(-)

diff --git a/grammars/README.md b/grammars/README.md
index 2f685eb6d..40f666240 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -126,19 +126,244 @@ You can use GBNF grammars:
     - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
     - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
 
-Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
 
-Here is also a non-exhaustive list of **unsupported** features:
+```bash
+llama-cli \
+  -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
+  -hff Phi-3-medium-128k-instruct-Q8_0.gguf \
+  -j '{
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 100
+            },
+            "age": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 150
+            }
+        },
+        "required": ["name", "age"],
+        "additionalProperties": false
+    },
+    "minItems": 10,
+    "maxItems": 100
+  }' \
+  -p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
+```
 
-- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
-- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
-    - `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
-- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
-- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
-- `string` formats `uri`, `email`
+
+ +Show grammar + +You can convert any schema in command-line with: + +```bash +examples/json_schema_to_grammar.py name-age-schema.json +``` + +``` +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +item ::= "{" space item-name-kv "," space item-age-kv "}" space +item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space +item-age-kv ::= "\"age\"" space ":" space item-age +item-name ::= "\"" char{1,100} "\"" space +item-name-kv ::= "\"name\"" space ":" space item-name +root ::= "[" space item ("," space item){9,99} "]" space +space ::= | " " | "\n" [ \t]{0,20} +``` + +
+ +Here is also a list of known limitations (contributions welcome): + +- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). +- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) +- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) +- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number` +- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073) +- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$` +- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs) +- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email` +- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties) + +And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars): + +- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems) - [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains` -- `uniqueItems` - `$anchor` (cf. 
[dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing)) - [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not) - [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas` -- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties) + +### A word about additionalProperties + +> [!WARNING] +> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar). +> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`). + +If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class: + +```python +# pip install pydantic +import json +from typing import Annotated, List +from pydantic import BaseModel, Extra, Field +class QAPair(BaseModel): + class Config: + extra = 'forbid' # triggers additionalProperties: false in the JSON schema + question: str + concise_answer: str + justification: str + +class Summary(BaseModel): + class Config: + extra = 'forbid' + key_facts: List[Annotated[str, Field(pattern='- .{5,}')]] + question_answers: List[Annotated[List[QAPair], Field(min_items=5)]] + +print(json.dumps(Summary.model_json_schema(), indent=2)) +``` + +
+Show JSON schema & grammar + +```json +{ + "$defs": { + "QAPair": { + "additionalProperties": false, + "properties": { + "question": { + "title": "Question", + "type": "string" + }, + "concise_answer": { + "title": "Concise Answer", + "type": "string" + }, + "justification": { + "title": "Justification", + "type": "string" + } + }, + "required": [ + "question", + "concise_answer", + "justification" + ], + "title": "QAPair", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "key_facts": { + "items": { + "pattern": "^- .{5,}$", + "type": "string" + }, + "title": "Key Facts", + "type": "array" + }, + "question_answers": { + "items": { + "items": { + "$ref": "#/$defs/QAPair" + }, + "minItems": 5, + "type": "array" + }, + "title": "Question Answers", + "type": "array" + } + }, + "required": [ + "key_facts", + "question_answers" + ], + "title": "Summary", + "type": "object" +} +``` + +``` +QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space +QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string +QAPair-justification-kv ::= "\"justification\"" space ":" space string +QAPair-question-kv ::= "\"question\"" space ":" space string +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +dot ::= [^\x0A\x0D] +key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space +key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space +key-facts-item-1 ::= dot +key-facts-kv ::= "\"key_facts\"" space ":" space key-facts +question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? 
"]" space +question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space +question-answers-item-item ::= QAPair +question-answers-kv ::= "\"question_answers\"" space ":" space question-answers +root ::= "{" space key-facts-kv "," space question-answers-kv "}" space +space ::= | " " | "\n" [ \t]{0,20} +string ::= "\"" char* "\"" space +``` + +
+ +If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`. + +Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called). + +```js +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +const Foo = z.object({ + age: z.number().positive(), + email: z.string().email(), +}).strict(); + +console.log(zodToJsonSchema(Foo)); +``` + +
+Show JSON schema & grammar + +```json +{ + "type": "object", + "properties": { + "age": { + "type": "number", + "exclusiveMinimum": 0 + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "age", + "email" + ], + "additionalProperties": false, + "$schema": "http://json-schema.org/draft-07/schema#" +} +``` + +``` +age-kv ::= "\"age\"" space ":" space number +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} +email-kv ::= "\"email\"" space ":" space string +integral-part ::= [0] | [1-9] [0-9]{0,15} +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +root ::= "{" space age-kv "," space email-kv "}" space +space ::= | " " | "\n" [ \t]{0,20} +string ::= "\"" char* "\"" space +``` + +
From a27aa50ab7e07fe46aae619076b6e31d5663e914 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 28 Jun 2024 02:19:11 +0200 Subject: [PATCH 39/50] Add missing items in makefile (#8177) --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index bbfe0f12b..8ae4f1dc4 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,7 @@ BUILD_TARGETS = \ TEST_TARGETS = \ tests/test-autorelease \ tests/test-backend-ops \ + tests/test-chat-template \ tests/test-double-float \ tests/test-grad0 \ tests/test-grammar-integration \ @@ -1070,6 +1071,7 @@ clean: rm -rvf src/*.o rm -rvf tests/*.o rm -rvf examples/*.o + rm -rvf common/*.o rm -rvf *.a rm -rvf *.dll rm -rvf *.so From e57dc62057d41211ac018056c19c02cd544694df Mon Sep 17 00:00:00 2001 From: pculliton Date: Fri, 28 Jun 2024 00:00:43 -0400 Subject: [PATCH 40/50] llama: Add support for Gemma2ForCausalLM (#8156) * Inference support for Gemma 2 model family * Update convert-hf-to-gguf.py, constants, and tensor mappings * cleanup * format fix * Fix special token vocab bug * Don't add space prefix * fix deleted lines * Update src/llama.cpp Co-authored-by: slaren * Add model type names * Add control vector * Fix model type identification --------- Co-authored-by: Andrei Betlen Co-authored-by: slaren --- convert-hf-to-gguf.py | 40 +++++++ gguf-py/gguf/constants.py | 23 ++++ gguf-py/gguf/tensor_mapping.py | 14 +++ src/llama.cpp | 198 ++++++++++++++++++++++++++++++++- 4 files changed, 274 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5bf69ef9f..5bcc849db 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2340,6 +2340,46 @@ class GemmaModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma2ForCausalLM") +class Gemma2Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = 
self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 222a2d137..cf3d09e70 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -150,6 +150,7 @@ class MODEL_ARCH(IntEnum): INTERNLM2 = auto() MINICPM = auto() GEMMA = auto() + GEMMA2 = auto() STARCODER2 = auto() MAMBA = auto() XVERSE = auto() @@ -180,10 +181,13 @@ class MODEL_TENSOR(IntEnum): ATTN_NORM = auto() ATTN_NORM_2 = auto() ATTN_OUT_NORM = auto() + ATTN_POST_NORM = auto() ATTN_ROT_EMBD = auto() FFN_GATE_INP = auto() FFN_GATE_INP_SHEXP = auto() FFN_NORM = auto() + FFN_PRE_NORM = auto() + FFN_POST_NORM = auto() FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() @@ -270,6 +274,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", @@ -303,9 +308,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + 
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", @@ -751,6 +759,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, ], + MODEL_ARCH.GEMMA2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 7b047f241..0bed43939 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -187,6 +187,10 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), + MODEL_TENSOR.ATTN_POST_NORM: ( + "model.layers.{bid}.post_attention_layernorm", # gemma2 + ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf @@ -210,6 +214,16 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), + # Pre feed-forward norm + MODEL_TENSOR.FFN_PRE_NORM: ( + "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + ), + + # Post feed-forward norm + MODEL_TENSOR.FFN_POST_NORM: ( + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 + ), + MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral diff --git a/src/llama.cpp b/src/llama.cpp index 3dc0f8535..988ed4fdf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -217,6 +217,7 @@ enum llm_arch { LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, + LLM_ARCH_GEMMA2, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, 
LLM_ARCH_XVERSE, @@ -257,6 +258,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, @@ -478,10 +480,12 @@ enum llm_tensor { LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM_2, LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, @@ -1004,6 +1008,24 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GEMMA2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_STARCODER2, { @@ -2039,6 +2061,8 @@ enum e_model { MODEL_16x12B, MODEL_10B_128x3_66B, MODEL_57B_A14B, + MODEL_9B, + MODEL_27B, }; static const size_t kiB = 1024; @@ -2215,6 +2239,7 @@ struct llama_layer { struct ggml_tensor * attn_q_a_norm; struct ggml_tensor * attn_kv_a_norm; struct ggml_tensor * attn_sub_norm; + struct ggml_tensor * attn_post_norm; struct ggml_tensor * ffn_sub_norm; // attention @@ -2238,6 +2263,7 @@ struct llama_layer { // normalization struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm_b; + struct ggml_tensor * ffn_post_norm; struct ggml_tensor * 
layer_out_norm; struct ggml_tensor * layer_out_norm_b; struct ggml_tensor * ffn_norm_exps; @@ -4269,6 +4295,8 @@ static const char * llama_model_type_name(e_model type) { case MODEL_16x12B: return "16x12B"; case MODEL_10B_128x3_66B: return "10B+128x3.66B"; case MODEL_57B_A14B: return "57B.A14B"; + case MODEL_9B: return "9B"; + case MODEL_27B: return "27B"; default: return "?B"; } } @@ -4671,6 +4699,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GEMMA2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 42: model.type = e_model::MODEL_9B; break; + case 46: model.type = e_model::MODEL_27B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_STARCODER2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -6512,6 +6550,40 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); } } break; + case LLM_ARCH_GEMMA2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading + + const int64_t n_ff = hparams.n_ff; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + for (uint32_t i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); + layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}); + } + } break; case LLM_ARCH_STARCODER2: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -10923,6 +10995,125 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_gemma2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, 
NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, + n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, + n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + 
LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, sa_out); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_starcoder2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -12303,6 +12494,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_gemma(); } break; + case LLM_ARCH_GEMMA2: + { + result = llm.build_gemma2(); + } break; case LLM_ARCH_STARCODER2: { result = llm.build_starcoder2(); @@ -17597,6 +17792,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: + case LLM_ARCH_GEMMA2: case LLM_ARCH_STARCODER2: case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NEOX; @@ -19486,7 +19682,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } - } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) { + } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) { // google/gemma-7b-it std::string system_prompt = ""; for (auto message : chat) { From 139cc621e90b4f61830515c3c124cf35b3d7a6dc Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 28 Jun 2024 09:26:45 +0100 Subject: [PATCH 41/50] `json`: restore default additionalProperties to false, fix some pattern escapes (#8180) * json: expand ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS charset * json: revert default of additionalProperties to false * 
Update README.md --- common/json-schema-to-grammar.cpp | 4 +- examples/json_schema_to_grammar.py | 6 +-- .../server/public/json-schema-to-grammar.mjs | 4 +- grammars/README.md | 37 ++++++++++++------ tests/test-grammar-integration.cpp | 39 ++++++++++++++++++- tests/test-json-schema-to-grammar.cpp | 31 ++------------- 6 files changed, 73 insertions(+), 48 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 2f233e2e7..881eb49e3 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -316,7 +316,7 @@ std::unordered_map GRAMMAR_LITERAL_ESCAPES = { }; std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; -std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; template std::string join(Iterator begin, Iterator end, const std::string & separator) { @@ -720,7 +720,7 @@ private: } prop_names.push_back(prop_name); } - if (!(additional_properties.is_boolean() && !additional_properties.get())) { + if ((additional_properties.is_boolean() && additional_properties.get()) || additional_properties.is_object()) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string value_rule = additional_properties.is_object() ? 
visit(additional_properties, sub_name + "-value") diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 92f6e3d47..072a230f7 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]') GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'} NON_LITERAL_SET = set('|.()[]{}*+?') -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?') class SchemaConverter: @@ -602,7 +602,7 @@ class SchemaConverter: else: add_component(t, is_required=True) - return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[])) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): items = schema.get('items') or schema['prefixItems'] @@ -691,7 +691,7 @@ class SchemaConverter: required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties != False: + if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ self._add_primitive('value', PRIMITIVE_RULES['value']) diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 06d76edde..7267f3f9c 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g; const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': 
'\\-', ']': '\\]' }; const NON_LITERAL_SET = new Set('|.()[]{}*+?'); -const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?'); +const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?'); export class SchemaConverter { constructor(options) { @@ -751,7 +751,7 @@ export class SchemaConverter { const requiredProps = sortedProps.filter(k => required.has(k)); const optionalProps = sortedProps.filter(k => !required.has(k)); - if (additionalProperties !== false) { + if (additionalProperties) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; const valueRule = additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`) diff --git a/grammars/README.md b/grammars/README.md index 40f666240..886023f77 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -182,6 +182,8 @@ space ::= | " " | "\n" [ \t]{0,20} Here is also a list of known limitations (contributions welcome): +- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations). +- `"additionalProperties": true` may produce keys that contain unescaped newlines. - Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). 
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) - [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) @@ -203,10 +205,11 @@ And a non-exhaustive list of other unsupported features that are unlikely to be ### A word about additionalProperties > [!WARNING] -> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar). -> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`). +> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default. +> Since this is slow and seems prone to hallucinations, we default to no additional properties. +> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties. 
-If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class: +If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class: ```python # pip install pydantic @@ -215,14 +218,14 @@ from typing import Annotated, List from pydantic import BaseModel, Extra, Field class QAPair(BaseModel): class Config: - extra = 'forbid' # triggers additionalProperties: false in the JSON schema + extra = 'allow' # triggers additionalProperties: true in the JSON schema question: str concise_answer: str justification: str class Summary(BaseModel): class Config: - extra = 'forbid' + extra = 'allow' key_facts: List[Annotated[str, Field(pattern='- .{5,}')]] question_answers: List[Annotated[List[QAPair], Field(min_items=5)]] @@ -236,7 +239,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) { "$defs": { "QAPair": { - "additionalProperties": false, + "additionalProperties": true, "properties": { "question": { "title": "Question", @@ -260,7 +263,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) "type": "object" } }, - "additionalProperties": false, + "additionalProperties": true, "properties": { "key_facts": { "items": { @@ -292,30 +295,40 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) ``` ``` -QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space +QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? 
"}" space +QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space +QAPair-additional-kv ::= QAPair-additional-k ":" space value QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string QAPair-justification-kv ::= "\"justification\"" space ":" space string QAPair-question-kv ::= "\"question\"" space ":" space string +additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space +additional-kv ::= additional-k ":" space value +array ::= "[" space ( value ("," space value)* )? "]" space +boolean ::= ("true" | "false") space char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} dot ::= [^\x0A\x0D] +integral-part ::= [0] | [1-9] [0-9]{0,15} key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? 
"]" space key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space key-facts-item-1 ::= dot key-facts-kv ::= "\"key_facts\"" space ":" space key-facts +null ::= "null" space +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space question-answers-item-item ::= QAPair question-answers-kv ::= "\"question_answers\"" space ":" space question-answers -root ::= "{" space key-facts-kv "," space question-answers-kv "}" space +root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space space ::= | " " | "\n" [ \t]{0,20} string ::= "\"" char* "\"" space +value ::= object | array | string | number | boolean | null ``` -If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`. - -Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called). +If you're using [Zod](https://zod.dev/), you can make your objects to explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway. 
```js import { z } from 'zod'; diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 0e21dc795..975658f79 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -993,6 +993,40 @@ static void test_json_schema() { } ); + test_schema( + "simple pattern", + // Schema + R"""({ + "pattern": "^[a-zA-Z0-9_-]*$" + })""", + // Passing strings + { + R"""("")""", + R"""("He_llo-12")""", + }, + // Failing strings + { + R"""("!")""", + R"""("Hello World")""", + } + ); + + test_schema( + "pattern with escapes", + // Schema + R"""({ + "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$" + })""", + // Passing strings + { + R"""("a^$.[]()|{}*+?b")""", + }, + // Failing strings + { + R"""("ab")""", + } + ); + + test_schema( "", // Schema @@ -1062,8 +1096,6 @@ static void test_json_schema() { R"""({ "number": 1600, "street_name": "Pennsylvania" })""", // "By extension, even an empty object is valid" R"""({})""", - // "By default, providing additional properties is valid" - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", }, // Failing strings @@ -1074,6 +1106,9 @@ static void test_json_schema() { R"""({ "street_name": "Pennsylvania", "number": 1600 })""", // Reorder properties R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // Additional properties default to false for generation, even though the spec says true. 
+ R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", + } ); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 3aaa11833..720a949c7 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function Date: Fri, 28 Jun 2024 12:37:45 +0200 Subject: [PATCH 42/50] cmake : allow user to override default options (#8178) --- CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dba083089..e3a0cc369 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,8 +79,15 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) -set(GGML_LLAMAFILE ON) -set(GGML_CUDA_USE_GRAPHS ON) + +# change the default for these ggml options +if (NOT DEFINED GGML_LLAMAFILE) + set(GGML_LLAMAFILE ON) +endif() + +if (NOT DEFINED GGML_CUDA_USE_GRAPHS) + set(GGML_CUDA_USE_GRAPHS ON) +endif() # transition helpers function (llama_option_depr TYPE OLD NEW) From 38373cfbab5397cc2ab5c3694a3dee12a9e58f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 28 Jun 2024 12:53:43 +0200 Subject: [PATCH 43/50] Add SPM infill support (#8016) * add --spm-infill option * support --spm-infill * support --spm-infill --- common/common.cpp | 6 ++++++ common/common.h | 2 ++ examples/infill/README.md | 1 + examples/infill/infill.cpp | 24 +++++++++++++----------- examples/server/README.md | 1 + examples/server/server.cpp | 16 +++++++++++----- 6 files changed, 34 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 57d03a578..6a00d25be 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1026,6 +1026,10 @@ bool 
gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.input_suffix = argv[i]; return true; } + if (arg == "--spm-infill") { + params.spm_infill = true; + return true; + } if (arg == "--grammar") { CHECK_ARG sparams.grammar = argv[i]; @@ -1409,6 +1413,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "server infill", + " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); options.push_back({ "sampling" }); options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" diff --git a/common/common.h b/common/common.h index 0486ba380..d6cb814b9 100644 --- a/common/common.h +++ b/common/common.h @@ -250,6 +250,8 @@ struct gpt_params { std::string cvector_outfile = "control_vector.gguf"; std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + + bool spm_infill = false; // suffix/prefix/middle pattern for infill }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/infill/README.md b/examples/infill/README.md index 74f42d2fc..810a0c5e7 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and 
receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. ## Input Prompts diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 3e82e4a81..ca71dd687 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -210,6 +210,7 @@ int main(int argc, char ** argv) { suff_rm_leading_spc = false; } std::vector embd_inp; + std::vector embd_end; std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); const int space_token = 29871; @@ -217,12 +218,13 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); - } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? 
inp_pfx : inp_sfx; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { @@ -526,14 +528,14 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); - } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? inp_pfx : inp_sfx; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } diff --git a/examples/server/README.md b/examples/server/README.md index e7fb0bf64..4fab006bb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co - `-fa`, `--flash-attn` : enable flash attention (default: disabled). - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) +- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. 
**If compiled with `LLAMA_SERVER_SSL=ON`** - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ae768097b..d7fb61812 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2020,6 +2020,7 @@ struct server_context { slot.t_start_generation = 0; if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); @@ -2035,16 +2036,21 @@ struct server_context { } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? 
prefix_tokens : suffix_tokens; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { - prefix_tokens.push_back(middle_token); + embd_inp.push_back(middle_token); } - prompt_tokens = prefix_tokens; + prompt_tokens = embd_inp; } else { prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt } From 26a39bbd6b0bbd66118bb68569f0276d7fe7df6c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 28 Jun 2024 15:11:44 +0200 Subject: [PATCH 44/50] Add MiniCPM, Deepseek V2 chat template + clean up `llama_chat_apply_template_internal` (#8172) * tmp_contains * minicpm chat template * add DeepSeek Lite template * change deepseek-lite to deepseek2 * correct code comment * correct code from master branch --- src/llama.cpp | 64 ++++++++++++++++++++++++++---------- tests/test-chat-template.cpp | 10 +++++- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 988ed4fdf..3edaa98e8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19613,7 +19613,10 @@ static int32_t llama_chat_apply_template_internal( std::string & dest, bool add_ass) { // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 std::stringstream ss; - if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) { + auto tmpl_contains = [&tmpl](std::string haystack) -> bool { + return tmpl.find(haystack) != std::string::npos; + }; + if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) { // chatml template for (auto message : chat) { ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; @@ -19621,16 +19624,16 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") 
!= std::string::npos) { + } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) { // llama2 template and its variants // [variant] support system message - bool support_system_message = tmpl.find("<>") != std::string::npos || tmpl == "mistral"; + bool support_system_message = tmpl_contains("<>") || tmpl == "mistral"; // [variant] space before + after response - bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; + bool space_around_response = tmpl_contains("' ' + eos_token"); // [variant] add BOS inside history - bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos; + bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); // [variant] trim spaces from the input message - bool strip_message = tmpl.find("content.strip()") != std::string::npos; + bool strip_message = tmpl_contains("content.strip()"); // construct the prompt bool is_inside_turn = true; // skip BOS at the beginning ss << "[INST] "; @@ -19656,7 +19659,7 @@ static int32_t llama_chat_apply_template_internal( } } // llama2 templates seem to not care about "add_generation_prompt" - } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) { + } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) { // Phi 3 for (auto message : chat) { std::string role(message->role); @@ -19665,7 +19668,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } - } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) { + } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) { // zephyr template for (auto message : chat) { ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; @@ -19673,7 +19676,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } - } else if (tmpl == "monarch" || 
tmpl.find("bos_token + message['role']") != std::string::npos) { + } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) { // mlabonne/AlphaMonarch-7B template (the is included inside history) for (auto message : chat) { std::string bos = (message == chat.front()) ? "" : ""; // skip BOS for first message @@ -19682,7 +19685,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } - } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) { + } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("")) { // google/gemma-7b-it std::string system_prompt = ""; for (auto message : chat) { @@ -19704,7 +19707,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "model\n"; } - } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) { + } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { // OrionStarAI/Orion-14B-Chat std::string system_prompt = ""; for (auto message : chat) { @@ -19724,7 +19727,7 @@ static int32_t llama_chat_apply_template_internal( ss << message->content << ""; } } - } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) { + } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) { // openchat/openchat-3.5-0106, for (auto message : chat) { std::string role(message->role); @@ -19738,13 +19741,13 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "GPT4 Correct Assistant:"; } - } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) { + } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) { // eachadea/vicuna-13b-1.1 (and Orca variant) for (auto message : chat) { std::string role(message->role); if (role == "system") { // Orca-Vicuna variant uses a system 
prefix - if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) { + if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) { ss << "SYSTEM: " << message->content << "\n"; } else { ss << message->content << "\n\n"; @@ -19758,7 +19761,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "ASSISTANT:"; } - } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) { + } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) { // deepseek-ai/deepseek-coder-33b-instruct for (auto message : chat) { std::string role(message->role); @@ -19773,7 +19776,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "### Response:\n"; } - } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) { + } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) { // CohereForAI/c4ai-command-r-plus for (auto message : chat) { std::string role(message->role); @@ -19788,7 +19791,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; } - } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) { + } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) { // Llama 3 for (auto message : chat) { std::string role(message->role); @@ -19797,6 +19800,33 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } + } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) { + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + for (auto message : chat) { + std::string role(message->role); + if (role == "user") { + ss << 
u8"<用户>"; + ss << trim(message->content); + ss << ""; + } else { + ss << trim(message->content); + } + } + } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { + // DeepSeek-V2 + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n\n"; + } else if (role == "assistant") { + ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>"; + } + } + if (add_ass) { + ss << "Assistant:"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index d19ba8633..b154038b2 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -57,7 +57,11 @@ int main(void) { //Phi-3-medium "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", //Phi-3-vision - "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" + "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}", + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + // DeepSeek-V2 + "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 
'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -94,6 +98,10 @@ int main(void) { "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", //Phi-3-vision "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + u8"You are a helpful assistant<用户>HelloHi there<用户>Who are youI am an assistant<用户>Another question", + // DeepSeek-V2 + u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:", }; std::vector formatted_chat(1024); int32_t res; From 8748d8ac6f172b99826ab18f01d9a3a165987d54 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 28 Jun 2024 18:02:05 +0100 Subject: [PATCH 45/50] json: attempt to skip slow tests when running under emulator (#8189) --- .github/workflows/build.yml | 1 + tests/test-json-schema-to-grammar.cpp | 40 +++++++++++++++------------ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index adf67cecc..1e344db6b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -799,6 +799,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP 
sde-external-${env:SDE_VERSION}-win/sde.exe) cd build + $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 720a949c7..65486ac5c 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1239,26 +1239,30 @@ int main() { } }); - if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { - test_all("Python", [](const TestCase & tc) { - write("test-json-schema-input.tmp", tc.schema); - tc.verify_status(std::system( - "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); - tc.verify(read("test-grammar-output.tmp")); - }); + if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { + fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); } else { - fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); - } + if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { + test_all("Python", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? 
SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); + } - if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { - test_all("JavaScript", [](const TestCase & tc) { - write("test-json-schema-input.tmp", tc.schema); - tc.verify_status(std::system( - "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); - tc.verify(read("test-grammar-output.tmp")); - }); - } else { - fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); + if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { + test_all("JavaScript", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? 
SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); + } } test_all("Check Expectations Validity", [](const TestCase & tc) { From 72272b83a3878e91251218c981b4c6ec16c33912 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Jun 2024 00:14:20 +0200 Subject: [PATCH 46/50] fix code typo in llama-cli (#8198) --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cfaf6a6e8..1114073b8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -810,7 +810,7 @@ int main(int argc, char ** argv) { is_antiprompt = true; } - chat_add_and_format(model, chat_msgs, "system", assistant_ss.str()); + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); is_interacting = true; printf("\n"); } From 1c5eba6f8e628fb0a98afb27d8aaeb3b0e136451 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 29 Jun 2024 20:44:08 -0700 Subject: [PATCH 47/50] llama: Add attention and final logit soft-capping, update scaling factor to Gemma2 (#8197) * Add attention and final logit softcapping. * fix * Add custom add_ functions * Disable flash attention for Gemma2 * Update src/llama.cpp Co-authored-by: slaren * Add default value for attention and final logit softcap value * Add custom kq scaling from Gemma2Attention * Remove custom pre attention scaling and use computed value instead. 
--------- Co-authored-by: slaren --- convert-hf-to-gguf.py | 6 ++++++ gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/gguf_writer.py | 6 ++++++ src/llama.cpp | 35 ++++++++++++++++++++++++++++++++--- 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5bcc849db..3ef2f69e7 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2363,6 +2363,12 @@ class Gemma2Model(Model): self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_attn_logit_softcapping( + self.hparams["attn_logit_softcapping"] + ) + self.gguf_writer.add_final_logit_softcapping( + self.hparams["final_logit_softcapping"] + ) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unusem diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cf3d09e70..9bfa891d5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -50,6 +50,8 @@ class Keys: POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9869f6fe3..1aeb0d9b0 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -516,6 +516,12 @@ class GGUFWriter: def add_logit_scale(self, value: float) -> None: self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + def add_attn_logit_softcapping(self, value: float) -> None: + self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value) + + def add_final_logit_softcapping(self, value: float) -> None: + 
self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) diff --git a/src/llama.cpp b/src/llama.cpp index 3edaa98e8..2a4d73856 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -302,6 +302,8 @@ enum llm_kv { LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, + LLM_KV_ATTN_LOGIT_SOFTCAPPING, + LLM_KV_FINAL_LOGIT_SOFTCAPPING, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -392,6 +394,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, + { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, + { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -2099,6 +2103,9 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; + float f_attn_logit_softcapping = 50.0f; + float f_final_logit_softcapping = 30.0f; + float rope_attn_factor = 1.0f; float rope_freq_base_train; float rope_freq_scale_train; @@ -2115,8 +2122,9 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; - bool causal_attn = true; - bool use_alibi = false; + bool causal_attn = true; + bool use_alibi = false; + bool attn_soft_cap = false; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; @@ -4702,6 +4710,9 @@ static void llm_load_hparams( case LLM_ARCH_GEMMA2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); + ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); + hparams.attn_soft_cap = true; 
switch (hparams.n_layer) { case 42: model.type = e_model::MODEL_9B; break; @@ -7579,6 +7590,12 @@ static struct ggml_tensor * llm_build_kqv( kq = ggml_scale(ctx, kq, 30); } + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx, kq); + kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); + } + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); @@ -11039,7 +11056,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); - Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); cb(Qcur, "Qcur_scaled", il); Kcur = ggml_rope_ext( @@ -11106,6 +11123,12 @@ struct llm_build_context { // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); + + // final logit soft-capping + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -17379,6 +17402,12 @@ struct llama_context * llama_new_context_with_model( params.flash_attn = false; } + if (params.flash_attn && model->hparams.attn_soft_cap) { + LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__); + params.flash_attn = false; + } + + if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__); params.flash_attn = false; From 32bf2296a2652856a63bdee05c2c10c43adb2731 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Sat, 15 Jun 2024 10:45:01 +0800 Subject: [PATCH 48/50] Add YX simple filter for llama-server --- .github/workflows/bench.yml | 310 -------- .github/workflows/build.yml | 698 ------------------ 
.github/workflows/close-issue.yml | 23 - .github/workflows/editorconfig.yml | 27 - .github/workflows/gguf-publish.yml | 44 -- .github/workflows/labeler.yml | 17 - .github/workflows/nix-ci-aarch64.yml | 65 -- .github/workflows/nix-ci.yml | 72 -- .github/workflows/nix-flake-update.yml | 22 - .github/workflows/nix-publish-flake.yml | 36 - .../workflows/python-check-requirements.yml | 35 - .github/workflows/python-lint.yml | 23 - examples/server/CMakeLists.txt | 1 + examples/server/stoplist.cpp | 10 + examples/server/utils.hpp | 111 ++- 15 files changed, 120 insertions(+), 1374 deletions(-) delete mode 100644 .github/workflows/bench.yml delete mode 100644 .github/workflows/close-issue.yml delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/gguf-publish.yml delete mode 100644 .github/workflows/labeler.yml delete mode 100644 .github/workflows/nix-ci-aarch64.yml delete mode 100644 .github/workflows/nix-ci.yml delete mode 100644 .github/workflows/nix-flake-update.yml delete mode 100644 .github/workflows/nix-publish-flake.yml delete mode 100644 .github/workflows/python-check-requirements.yml delete mode 100644 .github/workflows/python-lint.yml create mode 100644 examples/server/stoplist.cpp diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml deleted file mode 100644 index eb69b82c4..000000000 --- a/.github/workflows/bench.yml +++ /dev/null @@ -1,310 +0,0 @@ -# Benchmark -name: Benchmark - -on: - workflow_dispatch: - inputs: - gpu-series: - description: 'Azure GPU series to run with' - required: true - type: choice - options: - - Standard_NC4as_T4_v3 - - Standard_NC24ads_A100_v4 - - Standard_NC80adis_H100_v5 - sha: - description: 'Commit SHA1 to build' - required: false - type: string - duration: - description: 'Duration of the bench' - type: string - default: 10m - - push: - branches: - - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 
'examples/server/*.cpp'] - pull_request_target: - types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] - schedule: - - cron: '04 2 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} - cancel-in-progress: true - -jobs: - bench-server-baseline: - runs-on: Standard_NC4as_T4_v3 - env: - RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it - N_USERS: 8 - DURATION: 10m - - strategy: - matrix: - model: [phi-2] - ftype: [q4_0, q8_0, f16] - include: - - model: phi-2 - ftype: q4_0 - pr_comment_enabled: "true" - - if: | - inputs.gpu-series == 'Standard_NC4as_T4_v3' - || ( - github.event_name == 'schedule' - && github.ref_name == 'master' - && github.repository_owner == 'ggerganov' - ) - || github.event_name == 'pull_request_target' - || ( - github.event_name == 'push' - && github.event.ref == 'refs/heads/master' - && github.repository_owner == 'ggerganov' - ) - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Install python env - id: pipenv - run: | - cd examples/server/bench - python3 -m venv venv - source venv/bin/activate - pip install -r requirements.txt - - - name: Prometheus - id: install_prometheus - run: | - wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz - tar xzf prometheus*.tar.gz --strip-components=1 - ./prometheus --config.file=examples/server/bench/prometheus.yml & - while ! 
nc -z localhost 9090; do - sleep 0.1 - done - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.21' - - - name: Install k6 and xk6-sse - id: k6_installation - run: | - cd examples/server/bench - go install go.k6.io/xk6/cmd/xk6@latest - xk6 build master \ - --with github.com/phymbert/xk6-sse - - - name: Build - id: cmake_build - run: | - set -eux - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DLLAMA_CUBLAS=ON \ - -DCUDAToolkit_ROOT=/usr/local/cuda \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ - -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ - -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server - - - name: Download the dataset - id: download_dataset - run: | - cd examples/server/bench - wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - name: Server bench - id: server_bench - run: | - set -eux - - cd examples/server/bench - source venv/bin/activate - python bench.py \ - --runner-label ${{ env.RUNNER_LABEL }} \ - --name ${{ github.job }} \ - --branch ${{ github.head_ref || github.ref_name }} \ - --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ - --scenario script.js \ - --duration ${{ github.event.inputs.duration || env.DURATION }} \ - --hf-repo ggml-org/models \ - --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ - --model-path-prefix /models \ - --parallel ${{ env.N_USERS }} \ - -ngl 33 \ - --batch-size 2048 \ - --ubatch-size 256 \ - --ctx-size 16384 \ - --n-prompts 1000 \ - --max-prompt-tokens 1024 \ - --max-tokens 2048 - - cat results.github.env >> $GITHUB_ENV - - # Remove dataset as we do not want it in the artefact - rm ShareGPT_V3_unfiltered_cleaned_split.json - - - uses: actions/upload-artifact@v4 - with: - name: bench-server-${{ 
github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - compression-level: 9 - path: | - examples/server/bench/*.jpg - examples/server/bench/*.json - examples/server/bench/*.log - - - name: Commit status - uses: Sibz/github-status-action@v1 - with: - authToken: ${{secrets.GITHUB_TOKEN}} - sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} - context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - description: | - ${{ env.BENCH_RESULTS }} - state: 'success' - - - name: Upload benchmark images - uses: devicons/public-upload-to-imgur@v2.2.2 - continue-on-error: true # Important as it looks unstable: 503 - id: imgur_step - with: - client_id: ${{secrets.IMGUR_CLIENT_ID}} - path: | - examples/server/bench/prompt_tokens_seconds.jpg - examples/server/bench/predicted_tokens_seconds.jpg - examples/server/bench/kv_cache_usage_ratio.jpg - examples/server/bench/requests_processing.jpg - - - name: Extract mermaid - id: set_mermaid - run: | - set -eux - - cd examples/server/bench - PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) - echo "PROMPT_TOKENS_SECONDS<> $GITHUB_ENV - echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) - echo "PREDICTED_TOKENS_SECONDS<> $GITHUB_ENV - echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) - echo "KV_CACHE_USAGE_RATIO<> $GITHUB_ENV - echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - REQUESTS_PROCESSING=$(cat requests_processing.mermaid) - echo "REQUESTS_PROCESSING<> $GITHUB_ENV - echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Extract image url - id: extract_image_url - continue-on-error: true - run: | - set -eux - - echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV - echo 
"IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV - echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV - echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV - - - name: Comment PR - uses: mshick/add-pr-comment@v2 - id: comment_pr - if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} - with: - message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - message: | -

- - 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - -

- -
- - Expand details for performance related PR only - - - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s - - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s - - ${{ env.BENCH_GRAPH_XLABEL }} - - -

- - prompt_tokens_seconds - -

- - More - - ```mermaid - ${{ env.PROMPT_TOKENS_SECONDS }} - ``` - -
- - predicted_tokens_seconds - -
- More - - ```mermaid - ${{ env.PREDICTED_TOKENS_SECONDS }} - ``` - -
- -

- -
- - Details - -

- - kv_cache_usage_ratio - -

- More - - ```mermaid - ${{ env.KV_CACHE_USAGE_RATIO }} - ``` - -
- - requests_processing - -
- More - - ```mermaid - ${{ env.REQUESTS_PROCESSING }} - ``` - -
- -

-
-
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1e344db6b..8c7353434 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -25,664 +25,6 @@ env: GGML_N_THREADS: 1 jobs: - macOS-latest-cmake-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF .. - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - macOS-latest-cmake-x64: - runs-on: macos-12 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Dependencies - id: depends - 
continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - ubuntu-focal-make: - runs-on: ubuntu-20.04 - env: - LLAMA_NODE_AVAILABLE: true - LLAMA_PYTHON_AVAILABLE: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 - - - uses: actions/setup-node@v4 - with: - node-version: "20" - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Build - id: 
make_build - env: - LLAMA_FATAL_WARNINGS: 1 - run: | - CC=gcc-8 make -j $(nproc) - - - name: Test - id: make_test - run: | - CC=gcc-8 make tests -j $(nproc) - make test -j $(nproc) - - ubuntu-focal-make-curl: - runs-on: ubuntu-20.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - LLAMA_CURL: 1 - run: | - CC=gcc-8 make -j $(nproc) - - ubuntu-latest-cmake: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF - cmake --build . --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - 
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip - name: llama-bin-ubuntu-x64.zip - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF - cmake --build . 
--config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake -DGGML_RPC=ON .. - cmake --build . --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-22-cmake-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libvulkan-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake -DGGML_VULKAN=ON .. - cmake --build . --config Release -j $(nproc) - - ubuntu-22-cmake-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.0.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON - cmake --build build --config Release -j $(nproc) - - - name: Build with legacy HIP support - id: cmake_build_legacy_hip - run: | - cmake -B build2 -S . 
-DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON - cmake --build build2 --config Release -j $(nproc) - - ubuntu-22-cmake-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v2 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - mkdir build - cd build - cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. - cmake --build . 
--config Release -j $(nproc) - - ubuntu-22-cmake-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v2 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - mkdir build - cd build - cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. - cmake --build . --config Release -j $(nproc) - - # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know - # how to debug it. - # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 - macOS-latest-make: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - run: | - GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: make_test - run: | - GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) - GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) - - # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know - # how to debug it. 
- # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 - # would be great if we fix these - macOS-latest-cmake: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -G Xcode .. \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -G Xcode .. \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: xcodebuild for swift package - id: xcodebuild - run: | - xcodebuild -scheme llama -destination "${{ matrix.destination }}" - - - name: Build Swift Example - id: make_build_swift_example - run: | - make swift - - windows-msys2: - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using make - shell: msys2 {0} - run: | - make -j $(nproc) - - - name: Clean after building using make - shell: msys2 {0} - run: | - make clean - - - name: Build using make w/ OpenBLAS - shell: msys2 {0} - run: | - make GGML_OPENBLAS=1 -j $(nproc) - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - windows-latest-cmake: runs-on: windows-2019 @@ -993,40 +335,6 @@ jobs: cmake -G "Unix Makefiles" -B build -S . 
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON cmake --build build --config Release - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build - - android-build: - runs-on: ubuntu-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: Set up JDK - uses: actions/setup-java@v3 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - - ./gradlew build --no-daemon - # freeBSD-latest: # runs-on: macos-12 # steps: @@ -1050,14 +358,8 @@ jobs: runs-on: ubuntu-latest needs: - - ubuntu-focal-make - - ubuntu-latest-cmake - - macOS-latest-make - - macOS-latest-cmake - windows-latest-cmake - windows-latest-cmake-cuda - - macOS-latest-cmake-arm64 - - macOS-latest-cmake-x64 steps: - name: Clone diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml deleted file mode 100644 index 69c9f4f69..000000000 --- a/.github/workflows/close-issue.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Close inactive issues -on: - schedule: - - cron: "42 0 * * *" - -jobs: - close-issues: - runs-on: ubuntu-latest - permissions: - issues: write - pull-requests: write - steps: - - uses: actions/stale@v5 - with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug" - days-before-issue-stale: 30 - days-before-issue-close: 14 - stale-issue-label: "stale" - close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
- days-before-pr-stale: -1 - days-before-pr-close: -1 - operations-per-run: 10000 - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index ae86e9927..000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: EditorConfig Checker - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - pull_request: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@main - - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml deleted file mode 100644 index 3ca4d3058..000000000 --- a/.github/workflows/gguf-publish.yml +++ /dev/null @@ -1,44 +0,0 @@ -# This workflow will upload a Python Package using Twine when a GGUF release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# See `gguf-py/README.md` for how to make a release. - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
- -name: Upload Python Package - -on: - workflow_dispatch: - push: - # Pattern matched against refs/tags - tags: - - 'gguf-v*' # Push events to every version tag - - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.9.x' - - name: Install dependencies - run: | - cd gguf-py - python -m pip install poetry - poetry install - - - name: Build package - run: cd gguf-py && poetry build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - packages-dir: gguf-py/dist diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index 368dbdbe5..000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: "Pull Request Labeler" -on: -- pull_request_target - -jobs: - labeler: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - repository: "ggerganov/llama.cpp" - - uses: actions/labeler@v5 - with: - configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml deleted file mode 100644 index 4aa4b2379..000000000 --- a/.github/workflows/nix-ci-aarch64.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: Nix aarch64 builds - -on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because QEMU is expensive (e.g. - # 1.5h instead of minutes with the cold cache). 
- # - # randint(0, 59), randint(0, 23) - - cron: '26 12 * * *' - # But also rebuild if we touched any of the Nix expressions: - push: - branches: - - master - paths: ['**/*.nix', 'flake.lock'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['**/*.nix', 'flake.lock'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - nix-build-aarch64: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install QEMU - # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 - run: | - sudo apt-get update - sudo apt-get install -y qemu-user-static qemu-system-aarch64 - sudo usermod -a -G kvm $USER - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-platforms = aarch64-linux - extra-system-features = nixos-test kvm - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: Set-up cachix to push the results to - uses: cachix/cachix-action@v13 - with: - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp - - name: Show all output paths - run: > - nix run github:nix-community/nix-eval-jobs - -- --gc-roots-dir gcroot - --flake - ".#packages.aarch64-linux" - - name: Build - run: > - nix run github:Mic92/nix-fast-build - -- --skip-cached --no-nom - --systems aarch64-linux - --flake - ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml deleted file mode 100644 index 8955f38d0..000000000 --- 
a/.github/workflows/nix-ci.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: Nix CI - -on: - workflow_dispatch: # allows manual triggering - push: - branches: - - master - pull_request: - types: [opened, synchronize, reopened] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - nix-eval: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-latest, macos-latest ] - runs-on: ${{ matrix.os }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: List all flake outputs - run: nix flake show --all-systems - - name: Show all output paths - run: > - nix run github:nix-community/nix-eval-jobs - -- --gc-roots-dir gcroot - --flake - ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" - nix-build: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-latest, macos-latest ] - runs-on: ${{ matrix.os }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - 
upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: Set-up cachix to push the results to - uses: cachix/cachix-action@v13 - with: - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp - - name: Build - run: > - nix run github:Mic92/nix-fast-build - -- --skip-cached --no-nom - --flake - ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)" diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml deleted file mode 100644 index 3a6a96e26..000000000 --- a/.github/workflows/nix-flake-update.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: update-flake-lock -on: - workflow_dispatch: - schedule: - - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00 - -jobs: - lockfile: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@main - - name: Update flake.lock - uses: DeterminateSystems/update-flake-lock@main - with: - pr-title: "nix: update flake.lock" - pr-labels: | - nix - pr-reviewers: philiptaron,SomeoneSerge - token: ${{ secrets.FLAKE_TOKEN }} diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml deleted file mode 100644 index 2c3c1ebda..000000000 --- a/.github/workflows/nix-publish-flake.yml +++ /dev/null @@ -1,36 +0,0 @@ -# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes -name: "Publish a flake to flakestry & flakehub" -on: - push: - tags: - - "*" - workflow_dispatch: - inputs: - tag: - description: "The existing tag to publish" - type: "string" - required: true -jobs: - flakestry-publish: - runs-on: ubuntu-latest - permissions: - id-token: "write" - contents: "read" - steps: - - uses: flakestry/flakestry-publish@main - with: - version: "${{ inputs.tag || github.ref_name }}" - flakehub-publish: - runs-on: "ubuntu-latest" - permissions: - id-token: "write" - contents: "read" - steps: - - uses: "actions/checkout@v4" 
- with: - ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}" - - uses: "DeterminateSystems/nix-installer-action@main" - - uses: "DeterminateSystems/flakehub-push@main" - with: - visibility: "public" - tag: "${{ inputs.tag }}" diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index 4e0374fc6..000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - 'requirements.txt' - - 'requirements/*.txt' - pull_request: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - 'requirements.txt' - - 'requirements/*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index a8d46f31d..000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: flake8 Lint - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - flake8-lint: - runs-on: ubuntu-latest - name: Lint - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: 
"3.11" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - plugins: "flake8-no-print" diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index dbe41f1fd..43ad31045 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -11,6 +11,7 @@ endif() set(TARGET_SRCS server.cpp + stoplist.cpp utils.hpp httplib.h ) diff --git a/examples/server/stoplist.cpp b/examples/server/stoplist.cpp new file mode 100644 index 000000000..c0ab9e7d0 --- /dev/null +++ b/examples/server/stoplist.cpp @@ -0,0 +1,10 @@ +#include "utils.hpp" + +std::set SWordsFilter::stoplist = { + "<|endoftext|>", + "<|im_end|>", + "<|startoftext|>", + "<|im_start|>" +}; + +SWordsFilter stopped_filter; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 7ef2a519a..3614c72d2 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -8,9 +8,11 @@ #include "json.hpp" #include +#include #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -411,7 +413,107 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { + +class SWordsFilter { +std::map scache; +static std::set stoplist; +static size_t strcmpn(const char * a, const char * b, bool & nostop) { + nostop = false; + int k = 0; + while(*b){ + if(*a){ + if(*a == *b){ + k++; + a++; + nostop = false; + }else{ + nostop = true; + } + } + b++; + } + return k; +} +static std::string replace_all( + const std::string & content, const std::string & from, const std::string & to +){ + std::string ret; + size_t pos = 0; + size_t last = 0; + while((pos = content.find(from, last)) != std::string::npos){ + ret += content.substr(last, pos - last); + ret += to; + last = pos + from.size(); + } + ret += content.substr(last); + return ret; +} +public: + static void yx_simle_filter_init(){ + 
char * fname; + fname = getenv("LLAMA_CPP_SERVER_STOPWORDS"); + do{ + if(fname != NULL){ + FILE * f = fopen(fname, "r"); + if(f == NULL){ + LOG_WARNING("failed to open stopword file", {{"file", fname}}); + break; + } + char buf[1024]; + while(fgets(buf, 1024, f)){ + buf[strlen(buf)-1] = 0; + stoplist.insert(strdup(buf)); + } + fclose(f); + } + }while(false); + LOG_INFO("initialized stopwords filter module by Y.X.", + {{"stoplist_size", stoplist.size()}, + {"file", fname == NULL ? "default" : fname},} + ); + } + void yx_simple_filter(std::string & content, const std::string & uid){ + if(content.size()==0 || stoplist.size()==0){ + return; + } + if(scache.find(uid) != scache.end()){ + content = scache[uid] + content; + scache[uid]=""; + } + bool cache = false; + bool g_nostop = true; + size_t max_allow = 0x7fffffff; + for(const auto * s: stoplist){ + const char * cont = content.c_str(); + if(strstr(cont, s)){ + content = replace_all(content, s, ""); + LOG_INFO("hit stopword", {{"stopword", s}}); + } + } + for(const auto * s: stoplist){ + bool nostop; + const char * cont = content.c_str(); + auto k = strcmpn(s, cont, nostop); + if(k > 0){ + g_nostop = g_nostop && nostop; + cache = true; + } + max_allow = std::min(max_allow, strlen(cont) - k); + } + if(cache && !g_nostop){ + scache[uid] = content.substr(max_allow); + content = content.substr(0, max_allow); + const char * ctx2 = scache[uid].c_str(); + LOG_INFO("cache stopword", {{"content", ctx2}}); + } + } + SWordsFilter(){ + yx_simle_filter_init(); + } +}; +extern SWordsFilter stopped_filter; + +static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false) { bool stopped_word = result.count("stopped_word") != 0; bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); @@ -422,6 +524,8 @@ static json format_final_response_oaicompat(const json & request, json 
result, c if (stopped_word || stopped_eos) { finish_reason = "stop"; } + // Add stopwords filter + stopped_filter.yx_simple_filter(content, completion_id); json choices = streaming ? json::array({json{{"finish_reason", finish_reason}, @@ -460,7 +564,7 @@ static json format_final_response_oaicompat(const json & request, json result, c } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { +static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { return std::vector({result}); } @@ -481,6 +585,9 @@ static std::vector format_partial_response_oaicompat(json result, const st finish_reason = "length"; } + // Add stopwords filter + stopped_filter.yx_simple_filter(content, completion_id); + std::time_t t = std::time(0); json choices; From 725ba0b3526f25a17929e830fc519879967d8dce Mon Sep 17 00:00:00 2001 From: Aliebc Date: Sat, 15 Jun 2024 17:50:00 +0800 Subject: [PATCH 49/50] Add YX UI for llama-server --- examples/server/CMakeLists.txt | 2 + examples/server/public/avatar.jpg | Bin 0 -> 16277 bytes examples/server/public/index-yx.html | 8594 ++++++++++++++++++++++++++ examples/server/server.cpp | 7 +- 4 files changed, 8602 insertions(+), 1 deletion(-) create mode 100644 examples/server/public/avatar.jpg create mode 100644 examples/server/public/index-yx.html diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 43ad31045..e291644e0 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -26,6 +26,8 @@ set(PUBLIC_ASSETS theme-snowstorm.css index.html index-new.html + index-yx.html + avatar.jpg index.js completion.js system-prompts.js diff --git a/examples/server/public/avatar.jpg b/examples/server/public/avatar.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..c3fe2ccaafd6128e47bf61297963b492900aa513 GIT binary patch literal 16277 zcmc&*gxp&1W_>~-@r3OE? z{YDPK4@$59H1ue}Cy3TI8vK36UERnFf*9LLUu18D7&*W{FL|pNdh5D9_4c*$w1a$o zeMKByoxE(U-0ei%Jnb`9?_Gu71(f;tXE8ggEJ{<6f)NkZ_x^_{*wv8lCpUVLC;|DPY^ZF?BRt@iPO2Gj4l25oGF zS!Tb_x)zFGK(EM&n`QY(GNOsETZ78Hna~`%;^SB7I7O88^O_!FcJ+gqv9(3ZzdOcEU+&jd24%ZA_X0`-kI!W(C)qx!h6tsPQ|EdOmfykDA-a_Cne zw+Vlsli<@c&a=oM>f&jhfv@V8ZV}l2dJTL}oJ!nrMJT>CFAw_qV`pWAGADu^f|NRD zXPffEb)*@R#}uea^>hjw@AJd17G`+NAqANmBBE=vZ$y~zJ;BU+UB#xGHWyE*P6zy4bf?*F=%IW|^*0fP+$AQr-6mNH zcez*DA*5;g9e%Sa$D@VUdaD#G77I^A9HSA&ul)+Y`j2W5uh8q&dg4DDh})=QF$~dk z)X+5Z6@7hdwJzqbVFX-NtMfiwUq_oz`gUBvF)K4GZBO*YFGkT4suq%^`j^mXiJs}1`Mz7zbZ3fV5t^@pgK>Oxm8g3e*|y|&bmai`w% z;lq5P;dx}(hki8XPN=PZzxo!MU(ifA!8`g1ZWb}4~0h2NOeG0H>A1Wx7Dy`cy5-nFgY8ZIv_O;$0Id&ZDK z<&RCx-#li1khYhmag_Cf2H#~aePU$gt_gPidV3Q!GX*^qT@`GwE@{EH_#wHcBruV- z>^q_6wKg%n0LIh94l%1JdK|pC!17{vc1P}nV6>;u&~yJ(z~5I>DchB4aj_r+{T!KU z2`SjVeg*q2_Tytg;Jo~wJZl2y^wfyVVp<{j$x|c5jy{2X+VU2}z4qo6%;fyxNGBfg zrcQ%t^EabL%ftNEY~gH~+DSOI-*tMZ%S1`&&J**~%H#$^opB6}L)fDs-mv36{Sj=V zGmN4UMh)RJa_ZhMJz(-Hs^iIi_&4z^{$@ao?!A()Z4XsdS}#Lww)~u|Dm;g8tHU zdHYnvnHiuGE+0Qtc)CyP$|p|P%`{P0y+|8ADdaxpk-Yey$seUl(vmk)7BgK(3h>+F z8n}``B<*kh0uCR^wS!d-I=0o%@rsQ=ylJK${&VEgCx$vP2zqATV!plV$Z=`kSOP^N zFOY1^%x2N6%%l1|8?*_MJ!3X#|JNa=UeqXx`;Z-SSC8$vw&(M8l91`eb$;!7j_eE1 zRqPk>Q2&lbf0Gg$W-|c-q-@}!GHN73OI*s7QM`Go5a^M7XOf0%c`$UH!(^=Nog3_h z!;BJ#u*BVzOY9Q$7Vv|u5))|Q&)luLfZp(5LTDvs;bka?AIx*q+YXnF`YRHmkR`?A ze%dB`$TZ%BI2r#n!=reG&j)gU6x&mzN-5y`GH=hY7Lv+gGhL83ZpoQ@LSV+rQ$vj( z9$eF1S4|-n;hT91pCg!%zSU#?!L@F=A$n$JI9u5B;|>T(aoPB`$6McC-|mJFXcMR^ zaj&vc@vvfa%f9I;D*U)z9P%^iVEHSW)*8hxIf0>45R#IVN+hZ$oIdw)Hf!8hvUW2gqp@YSc<9RAA`TN$XOFoP4qeW@v z<1S|5HEj^H&2zPf%PE3CPBn(gNi8}wMcIlLGxMvfIZ8|hB-kPAXT-mOTuI(_K*U|f ztbVBO6Y`muBdC2~;P!=hMx9mu@_h=ayZO1pbSkqKv_3N`u6mra{BhAh-|R*~6d$72 zm5iAVmAS>Xw=C&?fMswZl-MN7&_lO%Op^{?;a%V8S>njK^@luHCpb@N8qkEXdoHl& z6VsQGQg6X6c03nv(S5Xt6@X@X{e1(c{tJDAKp;{Obc_rX3hi-k?-mCBU7d^KLpgw6 
zt~PQKMt*LDcf7Rt>52-++Cb5tXEry@Ct2iX@;(5q=ZnioaiQ0;XmY?_1JY%rP*7&z z5GxGzDmr*P|HKh=QO=4XN9kCYHDTeD!krMaqLSFVux1TnqTW;Rbc5CYjhsJX<;N53 zRfkK|$Y_B}GBtekx(S+>U2){l5qdpW%e+1(pb$aJ$SFx4o0z=$X9uI*P+85QLm~Fcr^(O8wCNJ6&iblTg*|;#Jg0(0Kz%jCxw zm!YaB;FQETCunnEI#xraS3tIRhKYNII;uy`%sBN=NTXWl zORlGmheUqp>(>vbZ;18Zg78Vntjt(SE`X@%3xSQq!AafORt_B<5NFxqT-uXkw#}DnVlj zC?uqM88y7PvsIj3!KlGQ4N>sVzI&WI{4zNJEE-w<&?pTauybrwW)#K8z(!V6src@} z)U5%F_8BH9dF1+RWhjUF#2&j)@Dt#>25cM0?7S#94c_a5|+ewd^QI~U)W{3l6iyytHGB)=s$g~?`GCg^9{~- zrjGVA?6saAjHO3bxOATe>03EQ^VLIWzBkJScHWmBa&i9oR^y~naqEhP3eXScb3z=t zR!?yg^=^hy?U%-&s(b<6A*+FHHE?znH0cF?Q13k(zZ+lpn_1MA ze_McaP^CK|vRWIw$)X0i*gDva)=xIjv^&v5=vjVWDRw9-s;MSMk;&EKriis2|F>ncCTtLnXemdFiGgOs-xu5xql^_w>=#s%%*EXRxc ztR5(l4j~zI`;KcMrp>p6cP5e-c}0*@?DQhl>EjIyxk71#*6HiR@$~Cp#PA3~HEO#% z!NYMyZ-y1XDT*!}{q;?6c1fjD!YHbpObwDwcE!fsJcR~6Qqx5I5yqbL_!v;X3A$8|Mj53Q8UJ$tuRi{E z?n^E(N~rtw4Wyl3Db(zV3MF|pSnv#Tr;U|#J4#QxX5Jxq)()k2)%#Bi`2iY ze>=ohKO`$J9_&Wqqd>ET&QZ}%{-bcd*q>w$C~l$9V#8yyFFG__lr5|lPfa2$$f{Zg zHj9d9Cpt@HAP8*9)E99R$r5x8W6b4`A6YMx{O>dd*85AqXhDhiYGL~H1XJ7GdP zGZ=N^%G8(KaO73`ii%-y-=#cGTtlH(fmU*!S%a?=p=M>gIu;nwZ$aqRlYgD=GUx#iaXprpn;6PJLUv^W#aQl+EuiqndSyJ zoE?7RriX3#y^&gNw}(A?i$66ki03{lVH6d1Ng-x3a7aG$>Afc_@0FSra!N%_jxQdF z`jhvDZmzGKu^SwRwLTUD+pIUffZZ!cG zd}pMd1pUs$OHsntC}L5I?8fWM931ErsGHCliCMaYy>#%3Y#Mg)Vb(1d??L~y;U`?jNa}5ZJ!&gn!ez2VX05@ZrT#C|c)3>rf-%{}&MQn6U06D42TLxi zyD%-Y@GJ1k7)<`?c~YqvW}%14^nFIiCNC;vC~%}&lZ}w)J6f-M9VRaK;bp(P0ZIlj zQe4(l+7!5FY=8A#K;6K~WKBq~7xwBwEDTA(ZhE`$M*f^i@+MK2t$ZeP*k8AQV64SG zE@1D~u53Y#>+g@*#6!u$6WLI(K(trlJ((cEEA`-_76&~dG5M;dy
s%X{ z{S}e*Q+w>WgJ@o4ExV4~=@seVuNaDM_fsSLC9K8q1B#>T^Vd?E0(XY0_E6}!3<;;N z#V<>|66rf+j~KF=5N+vWXr;|F6kkF;WohUe)rKGfbGr#RDIv!zZVO%p7CWW2^Kl=% z;b`5DSIWXpjusc$%=Nmc!X2EnKGc?49B~j{1+_x!Zn59RhVk111jcp~k`{d58C(3J zt5u_QTk`4ljW{lo+!pzRi%dNKnQ;7Un#%G2Em30k+p%SzQ>w;vIeQ?4y#O&uEt2kca6DgKZySc^uiS@>PuPrQoq;%O-J_{SZ&tOMb+$ZBs>Aom6zrULx@%YiB91G%h z=Vh_K%cgndy`+;0W25 zMcteABfRdqbjL3-l9d5FmE|`CmNU|DRNC~t*2Pe!AKtDrMDk>*Q{Hl;KdiO&Khm#{YD>+~ zy39&)-ZADB=Vc)JOEOmBAj*?rmY*ViGte`=`xDM95y>bpwAZ{V55#?S{%-y!&tqi* zqfiVC%4BMBz#;dgLpC47301miG3)tEF|D!JHuZr#o}#lS_Gpe0zRk->;B9HP-rB+B zwG-(n>vatcKdG&?q)i>wCe1Q1z*oXwR{6SpK%AY)mqv}Q+tC$slqijx@c2UD9i#Gn z-cC7>SsJ6`-vgg#tRJ=<-yzVqYhwiRwN2Cv)h8IgL~EpABv+rA3lywkg`*Q@-hT}^ z{(5O?AS`37Dql8mZ>il&WMHY<2`p6f_b}DviV{pfF+x4?{sg1K-_p5+%6;B3xt&2F zFoI$yBWyb+A*c&Zu`w+CJ1@alt*X|R0XpJ5^r}Ws&)AW!jlCbhQt^GOsw{Y>&=LZK za!EA!zcVHmF6=PLcn@ysqS~(tdoC3*nfJM!{(fv_mHW}+Yc@kJH zjqEyM&MLHAQy}4FKFbD`B$a0Bo^`mr>VgV;>oKHO!ZEIEO^vj&xwhl0r3l=7k8(sM8M0-H zKk4INje6i<645JIHvxa&#AgS&ZytU|P|nyL|Gdut)3F8Z_&b#D)&#vmP+Nanlc82` zw|y5)X6az}G&Zk|H1_;M**5sqjeU+BhF4c~9>0m6Z|Ady616ASWCm;C-Zw@@hPBH3 z)Eg_c<>M?Yx#u%Yc|@pRoj{NDBWpcn#S4q>;~Ef^pi!1A#tDoJ_OQQ-6!u`x zxd`u4C?q~>d~iO^k+jO1(DgOTEVQ=;NHBuC&M zggXMDb-BRY^P66bRoNO(p4>0c!Cv|)in{OET2~JCl56Y=^}LB%RX*s|6EgrO+t8H= z$U~4%zIWyey?{5L{@WDU?gQNr$$lhQ*n^?i5NqVaHiReX^-NK^e9=tVE#oi0_?zrY z!?FsNA-Vwg76i&hDy2)~v0lG^a#Da--93j(IGDBDVxtVNPYZ)9p=v<6nCwlH6H`Fzgtd zjONP}wF|#`10B$s|8CMCAH}Y`V{k^y?#=lZJK3K~IPjX6-_wc5fzGXUlD1~%rX4Pz zWXiknLax*=B<_|1{OB`6xwOL-Qa`ws#S|;i#;9<*G+zG_K|b?6U(Y@Le-GC^3&5Do z7;mejZUcbD2pqxfZ%IQ9TSNyFFjI0jG{0 zg}AR%LsbH-?~Xg%BWva|1IrLfA{|em8v+@(6lUf_9ZY0=Jtz~)d{%9xluWjsvGZaIN>jUD_ zD~$;BjVfLe#c!OjhW6a&X0~*rg((*(6eis_-hN|dsC+hO3>0NQ zBS%(x&8B=#IRqj_r@k6$#=hLrspwB*7zU0KvCmWK2txIHt9%4?a(?68l$c|ed0u&F z`4`T+%6h#&%)#H^ry4=N8)!9mlR(L*UF2Vu;Q;ga#M9yquOV7Zcl`1yx$t>P^H|vO zYV>Kf{U8_A$U{_9*uJ;7`M`e-Ep}d%Ua9GpR}JX@2DOFU2U%!2P{IpeYsD<TryfrG(kLfg`h3tp7P>4&% z4MMcixkWN=ehVugox|@_T*FySfuJX+wMo^Jm6+>ju|&(sBTqskso57Aks(pJ_bx~A 
zuz=%t_MjPIOkVq1DDnoeDe$XLYYkVC_)-%21Or~%tbbN*G1bq|MuT2Ft7;3!Fg|!JVxNms}SfFl1)E(ze6ZHsrW$Z0^INByj&;{N=^gKJ28K zwz%7{R^1nko;p4i5e^8BK~S&j+muike>bA3`nDb$8}d-l&<#h{_#{mPwS&)I9Ycnk zl;~R9^UTFD^5f&`_49vr4jcU0uHHGcM{ZN@M%M@ z5ek}bUO!}?rXaEq7<2epo0^qnd{;&`QWBEAk{VA`GqxrP}y|BI+ z1zw1n*|5(VwKE9*&!pC+QtS5KTq+u4dpH^}^J^ZoSDI1UsdjZJ>|%hz47do?(~o+y zWV8z`{9@*?Sfhfsg=YJ)ea+jRrMqhPq}b}K1w1K`3DYm>8&Tp;V~gQT!FrM>yL&MK z?dU!8+Nt2OzJ{6m;w8o|C<^=wgYD)_RaYpnwX3P?OBhMkQ7v_fTt~-g)hXE?YoVLThtt+V=`CO!Z6EKZfp*x!VZ;X8*7Xg$O;W?Ydr-b13voNfrXSPss9*EMHzWoj2O?%K+ec1)Pk8?6GwLQgVCr`!v<%PR5ertrwCOG&?0V@LjjJO9W#{gva7LeN+`DlgtbA;tyZ#FE0x zAeeCc4h;WxZD;mKaOhzr$rd@%S)%I|qDZm01J2_!54e2);Y6zl0wXsUrD9I^g>Ozh zw^I2CbWCntd|(1WT+}2kfH9Ab3~=5$6SgMD41J-;gTP;V z!w4%HO3qyx{SbxmUM+DoYbbIW6ImLnujF%_FEtx4w<)Mu`Y%Br^(X@tL9$_OqF&R@^;c1^ zNsYtsT(5DH4+@CN%9Iu#>1&nOoTvH>`YCC48{#$&*jp~_*}M}cckCVq4|>7KWnKj0 zCQ@9CFgE{L6WpL*YW`s?Rb=%2SenD2ljhyfKmRq=+Z7`&5AX=65a88dnIa8iDugrN z)N`7H0|@)wYKuQVTyr#4slOk^YI1$HdKHH$bRJQE6L7H(X z%U#`!$)MrS=ly);XaTq=WL}yy%XP4@KBR;4=+S_$vg zj`NxMx9F?g2d!lf@w$fun*=AL;O7XQb&K~t3<`O2`H}kZT=Pw3<*QI$@YHs{yi<{g zRA5m&yZJLz%8fFSkcWH3Su5G?^{Jni6eekvEb5;rK13rrzs8P0k4&X=AIOX5@RN1Z=X3 z-4ovM*MT^I)ud?Ud-q)`An8ZuvYlYLeeen4sgxTqnU-+X&%ynirp68AUy49HL zx_gK4>N!v?OFtErmTN2}wW|uA;FRuV1-)-x?=G%?E6Q_OAB}xjOEeD+TGs{%VFQ32 zPyI_Cu1XmK%>!;W#&2s^RRztFR(KuDAag3PB>iEE*skt?FWd4JxN2C9}zz}$;p`k`-qi}F6%-+ zOyYnt4Q%@im|<`1)RqP>?Jb1LZ3(cjsxBo>O3}jL7ndrzGD=$}f}>Qu&Rri5-%2vc z;E{yF>EAcfqXT%ld|a?umx7dwd5 z%B~qy2Za_>&6vHKj2-#xgNr6@XOguFc77#199wg1RBQ_Lls@O#cCheoR;HA-Mv0-{ z63`IT;t(jzb`CwOU%1qET64?6k< zrjOZ<@^u6N1RCSn=lE1S;j$t@YB2$1zT-wXZUji)nKytOBJ`LSFitJ`TmA51IF)~w zbU}ZJGlr4y+rWZ2$;9GGqgeHND+7JOSkTkrIeA1eh$4l7tPf`sLch!;_#zgm<`>!I z_eGfe|9crB!cs5uP-WF@wKaV9Y&u$u`5b79Ik1<++bnHbD6`4u@FuB?2sbFS)Qc{vHk(y=0iIIy|NO%`9yQ@3WT3EFqGtO z2|m5RHUEPvYPrI3o^Zh+1VAnR{ov|PQ*ezyP;8Dx%x9vtgSXp2EFL2ovh)lUI0w*a zu+!-l_j#ouzkXn@(=M6has_P_gEr^pGefq~WyQx`gIIwsUAcj?gh*^O9?Ic11tHe) zFu>w*emJIsGi;uvN>TseHY*8!0MONE9<*5mj^SiJkN;$NO$t~ffWt=qdBx1S&) 
zcBWeN`=kjM@hI-(f8OS`vBp*m*}?3Rko>0)-wcfGx#vD!@v4#wTI>Mx6O)^0XvO4; z^T_nDrrFo@6gD7emg6qT-VeoW`EV}2^cnp5^HZg0(|={nyBNgT&9)4N>d_`NuQ6}3 z43wPrMf9^4vMD^A+>F@dWOO{}dX|oECj}^jx1eXgj#7p@0!xKv!I5$6*;;tx`$oA= zbN-pNuosua0$$6tKZf}H*>Kf2DkzY;-k^ml$Yum@@=oMYNyz!`JQ2Kn7s&bB`$kp} zpU|S7p*buGCKiW{U=$EvJpvf~(3srbB?3LWmz6|?(_>p2q#iTYamtGJ~xGt#LR$FaN!X zEZM0e`JGs;ky@3$Ee)iRJkQP@^D!%=NHw0_6-B0=EciWsr2G1M*P`uak}NUpy&R;x zKSNigB}lvdZjr0J4?1lV=W2;HLsD+`sT^uLc}hi~tZFis4P2<&djMuUIry$1s|2Jz4DEAdrJ5X2?w@$% zp?w8YZ*4kh<7q(7W>{1+c3Xme688_j1H81!;+Vvh zs#-66LN*BHZ@!cE%1Ge}6jE($0GIV;u<;Lx=9VK1>r@%eF;MyW?d+`fMk^h7cTaUgOXfy zXUBXwJCv#2Ig=1Z#C{0$eX{;LRQbmX51Y|^FMy?0B{@s5yc9)&MoUii!f93G7>at| ziN`6g&suLtsDxjZ!y~DiD?DkA-!i01)VPn$UH>VED4^CM|&; zkNqEKgx8bbHM_L@S9;jp!{|0JpwgYC^>f%WUHn{S;@9*kkb4`_8Dv~p(alM`ktWx< zqh`EMMLV1!!(RoRFOzEweYqtW=a0G0`L?g?5Ge6|89N_Tl}}HJL=o_ARtUyuQyV}? zl9BYB&M4?gGRSI_Tu2mKz1{ovTro|5hZB^@7sAL+2QI)Ox9mymr7u@6-E)W7D!wUS zNqB3s*Y^Vx-V&2dp*op!N`cR;?1LCJ;3)l;wnJn4=>h-z6C3Qgde#OqOA^N;dCtUl zdSH7I_Z?2w6*Jl7C)%md+f(l~W~hcYUZw{6xhYn{;qDNyvuuTe@RVz_?CYy?gcmi; z-^solj5J{xr0&WFuII-1EsTCB061CF-YFR&lJq!-UK?hb)-sS(8bAF3LYA0Mq5Kp->23DVg020hwOos3|R85gmca?}M}E=5oHSO$Zf}m>rp6 z=?y0cQV@zL$DO8G%RzS|3IjWCLr~6a5S+kS74M)~&I$d~GEFfwNI@rUwN^xoGJx&{ zuoOr>>4F9{2Luhyun?ukYGhFKD_kLWU;;wfab|Mrl!yBZyL_ zhm!f$Tx+#Jh#>)+EqE3LzVlIBtQeRR?Vi4oGi6Aoyh-8t@{J_x@C)a|Vc5&^5m+WO&DG zPGTOB0rKkH+ETt-6-!I)!M7hwK-?O=BEe1j8r+b=v-|eJJPDB(q@j{hRDDmkyd@CU z=ef1GBb!hrz9b-0|FUdDNeFV5W;1oeS@M@tC@7gdtWF922Bc@%A#W}p+Gk2QOG*y$ zb!*C3yF$=U+6&Fpd;yTdC$zueP%&v@KDCxlTzWrX&!wK1h154Ej|e5a5VX>3R`BB9 z&YD899ON+j>5s3vF^T@*AXz54y&6!9cv0QX1f}2R6`F6hM}hxynXhI994rJ=nhOHR;h8s(U7I zp+D>vv|Q^M(i=SK*o>gr{dGLYN>XEx=5|IQiMj^zEpyr+9~(|KP25*FH73w5S6clA z9x(THY|Ga`>CpYM7&ju3lrdeJAG^o5W^dxFU|jtaO;Y&Av@O-0)4&!>d1@pPKf4Vh z(5h>u3Xk=SI+chal`3q1d7_Qmj-uNwLqo>8-a$o-+|g;S@RxZR=j7 z3^LiY%iWPtJQsm}jiB6hw%kl65#fw0=8Mi0?2W0np0 zRz9z)W3rEzu8B)X7~4a_6@mSnW}(Bv*h`5uG!d%=(6^u+EU~NxZPC(X-=ZiW4QOr2 zwxqao)M;#KJ7v-aMSkVtT9e$~j+^7|gg;hb{ZPNH@mb#~6B2L?wwLaCrmhY6z01qO 
z3Tx~xIh7SVrSTxYFNi&0}zip$f~u+V8c_MoIgDr(CoYUu1q(*r|B! zkx`6E*VC1lB(K(|*+1)_Y=Q^OP)g62cp-^1O=F_2r*R=Ze|xb4iTKQxijWOWx)Pbq zOB7!ke%6Gnv;y{C_6^5R2+g71cQO_ve8NNRp-yK!47?uttI_VI+(t*)LKY2Yh1OR& zZSfIoPzRi6>Na%cMHU5hZ`$@_ji8;oHMR$yIR7bnm#K24zdhfd7#dolUQqr9yc5@^ zEmyOX2;$%jzh;ToXKjf$4E?`1zhQO&7a^W%g&QzDTnC%4LjLa!`Z?i?`+vPR&vZJ0 z`|@ym$qw}Kv(B;7ytEYiS6jZ|dUl}sKiICd#bYRLr+PH|^J+9XHS)P$diS=2>(GB1 z&(S99yTFAT1x_wCSyjx#3`s)V!;I095nEI^#fC9QUWHoeTgimZ7*0xRBiXrRrwvEZAh!fnvNe~P(sS7TQAp)M|y zV0hya0j_BO{f|4mHGEzc$R*WbtTPg{@|MR z&IOjRJtH(HwBZIw9Ot6Yt~-~)MSi)^QYp2EQFelJLN!pxC>5>f8L^M6+#~W&00yvdBq`AlGpEk z@saQwX_s4UU4Cf}Zza9hKJ-mb%Kv{HK)y3F(2DcUd!vljNA^-Nl5j|!qmBE>oRzrS z30YdshA-t$F5c4!1QPf%z{y5EUy?4lh9D$@OZwK(t(HT7>Cgzjpj$}W%|1fUAys9) zNDplb`K>Cz>xlpx=IrBn9q=R-0zHS)mIy^|HLL1?erfX?TNYeuPgFeNfA|?tLw!7* zlw5WUN_dmwd6D-9Y8A4R#%#bHHo(Z~xPV*vi#<1OkMy?z^k`{Zo*F;Q2{7GUzG{lD zb+1a##zoNi8rC9V$j0)Im;eKe0;m-+0YjGh7aa6SW__K>mIvx6M)569rsT7&Hy#yvXqP|xr9pNkhYEf2DvQTn~v zEpXImAm1qH{mWIu+1K!h4te=v@zc6bQPjj}$_ z=FcMkEpw(Otv6NOK>Ws*FTGa1S&7Hrpu~Co(I;aEx9*vNdKy+piZ%m((!H}a|D&4m ztIS_oZl2iAI2oN^(_G2aB!(?n36oOjQW2GSHTJ5X?=qyU+%{2PmWS7RT{Q$cp6tv?zv>2@ht#fh`IE1f|DL>Nq^@7YwFzA6!J>YV; z2a0?57L=sRhgKtKjU!=5TH|u}UCb>LDV5275@Fs!yq5}|)ghk_x_$Ofg$vj9NVg{) zQaPRFDV?^Fu36I{oD$u3PN5z4uYPD?gG?Ir4Dh6HbN(v+NH;cyU9DnNB$QJyd$`>+ zt@~i5w|Gel1H_@gGZA1IHrYAD6}$|!t9z)-i6w_zqn3f{=lOkI0Hf5+B#K)Skk6kw zbnI_69ix)rKorq&P3Ei5g`DOmoM7P-fJ2>|99#6EaU#13fLMT?{+q90Py9z&2-w^D z4v1M->6>k2gF@(=yFg9~%F@E!f{3}LgG%#CEx=LNzoJWWwHYcS)HGL(z&@PAAHR%K z^t<~qG$_A>V!)@Ssr=cV6%Ia?BE0a^3{;44Lg)TW2;|Gv)i zmJ<(&HSaszewRBn=l%UvxJ53wuwV$Due$n;Y_rpcC$D$YWy@hUtVQazav_} zCHd;+$#jOcY;_x)dgUb(krI@@z%@O6)k$2?9F)I{R~Yfp$#FUjTG#S98TX&aC$G5P zy=hS!k^-w-X+Qd9^+{7-xWvdBMJ{uNjBg@!$4SBkCc9a~*^nW19-8Ub#N?XP?ak!V zIZb^AWe{@98=^`=Vrv`bzUqP%Kr_6Al(>Q{pWv45OOz71>Mn4f=7)sr=q{4SZ48l2 zoafRhP@~iiE@C_k-~=JGU|k7}^=rBlcWzV4Xo~R)S6nLkgWBX^u&?fil=#%wHLm3Ar9R&mGZ>m0f^B~< zr%4!ha)*{vAfH#PZM3dF?asEvkxwJD=o&wkeJVD$g@qq?LdaBiP){PyeL&#fsfj0B 
z7d7DvZpf`Uq}Iqq(7z8OgM{^UB&-zPG7_^JKY~dPv>D~gH)DL}1PRQ_bQF-W3laUo z?cyE-G@dye$h?4S!uq9i*&5l3ppR2qqUj|}>oXS}w9`$uCHR;LDhk%pPgGuOXI$S3RB z;mGk}RGXNZ+|t7k4Nxi8{S|_QO6%S~EAm~H=_>}6S`NPEWnJ}L;v;?aniI`z)a;O@ z@C}jvOwoch^~;VhimKA0nqdUJ|Ro-c}a@g=A5H_M#-=((prik6+^h$lFPL ze&i|dN~-s|dE7+w;P}Jpb79Jp=_`p3_v_#80_3b5o0ro9R5 zprmh^RMIzW|6DUTM-z{;-H=lwyht|R9O62A)_V_xooKA0g2=|#XhV+wk$boJAEtO$ z`St(BEjOsF(fj+VfigooP|IlYr;R!0w!@DXR~NbOY132q6`#BedCRb7T>bfJ41#fh zb01RXVM)`z@r2p@Mw#PG;R_8s|7e#}q*vh)37x8CBp>IXfX=)9%dc3thiA%bdg^xc zM$SBo$}5;Xu}BYIa&uz+VeSZ{G_(e8F|Nw7a7IP5^Zu%>G>YG`BJT^F4t@T9x==wU z3v~jQKD(UmzY;MjJgFcRMql=6VGKW1aDUa?AH`kRg*mh4klz^iP2l)wvMSX1pBqC% zb`T4K)v&97zcF|kr6qZFb`cCUF7UA!`||7Qo4;hKo1fhY&n$L*D<7DPxX8>x4{q=U z3y=zZF2r8&sIzO)P@^;2iFUk)Q3F&RHa3PTHZLATVla~E56Dko(w>gvReBpvLaP3D zolE#*w49ML87#cZhUDGcgnf+6 zzsR_odl*pAM!rU>P)ax*gH+?}n2{oY zx`!99haGo7P`lbK%ZWO;`tW(RK64tO1oM5%3tB6~iTLEwjE-$#Hj0U7;|6e~}au0}I3IY-g T@mn`Z6VXu7QZD_^GW`DlMZW%v literal 0 HcmV?d00001 diff --git a/examples/server/public/index-yx.html b/examples/server/public/index-yx.html new file mode 100644 index 000000000..e58f57678 --- /dev/null +++ b/examples/server/public/index-yx.html @@ -0,0 +1,8594 @@ + + + + + + + + + + + + + + + + + Llama.cpp + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
ChatGPT
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
llama.cpp
+
+
+ + +
+ + +
+
+
+
+ +
+
+
+
+ + +
+ + +
+
+
+
+
+
+ + + +
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+ +
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
+
+
+
+ + +
+ + +
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d7fb61812..4cd43137f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -27,6 +27,8 @@ #include "theme-snowstorm.css.hpp" #include "index.html.hpp" #include "index-new.html.hpp" +#include "index-yx.html.hpp" +#include "avatar.jpg.hpp" #include "index.js.hpp" #include "completion.js.hpp" #include "system-prompts.js.hpp" @@ -3076,6 +3078,7 @@ int main(int argc, char ** argv) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); + const int id_task = ctx_server.queue_tasks.get_new_id(); ctx_server.queue_results.add_waiting_task_id(id_task); @@ -3305,7 +3308,7 @@ int main(int argc, char ** argv) { } // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/", handle_static_file(index_yx_html, index_yx_html_len, "text/html; charset=utf-8")); svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); @@ -3320,6 +3323,8 @@ int main(int argc, char ** argv) { svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/index-yx.html", handle_static_file(index_yx_html, index_yx_html_len, "text/html; 
charset=utf-8")); + svr->Get("/avatar.jpg", handle_static_file(avatar_jpg, avatar_jpg_len, "text/html; charset=utf-8")); svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); From d9b5678b5b4eab92dbb119c18e5450a544e63761 Mon Sep 17 00:00:00 2001 From: Aliebc Date: Sat, 15 Jun 2024 10:45:01 +0800 Subject: [PATCH 50/50] Merge with conflict --- .github/workflows/docker.yml | 116 ---------------------- .github/workflows/server.yml | 183 ----------------------------------- 2 files changed, 299 deletions(-) delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/server.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index bf94b2024..000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,116 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. 
- -name: Publish Docker image - -on: - #pull_request: - push: - branches: - - master - paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - push_to_registry: - name: Push Docker image to Docker Hub - #if: github.event.pull_request.draft == false - - runs-on: ubuntu-latest - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - matrix: - config: - - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete. 
- #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } - steps: - - name: Check out the repo - uses: actions/checkout@v4 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - - name: Free Disk Space (Ubuntu) - uses: jlumbroso/free-disk-space@main - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Downcase github.repository_owner - run: | - echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV - env: - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - - name: Build and push Docker image (versioned) - if: github.event_name == 'push' - uses: docker/build-push-action@v4 - with: - context: . 
- push: true - platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" - file: ${{ matrix.config.dockerfile }} - - - name: Build and push Docker image (tagged) - uses: docker/build-push-action@v4 - with: - context: . - push: ${{ github.event_name == 'push' }} - platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" - file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml deleted file mode 100644 index 99feb28f2..000000000 --- a/.github/workflows/server.yml +++ /dev/null @@ -1,183 +0,0 @@ -# Server build and tests -name: Server - -on: - workflow_dispatch: # allows manual triggering - inputs: - sha: - description: 'Commit SHA1 to build' - required: false - type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - server: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - 
run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libcurl4-openssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - - name: Verify server deps - id: verify_server_deps - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server - git ls-files --others --modified - git status - ./deps.sh - git status - not_ignored_files="$(git ls-files --others --modified)" - echo "Modified files: ${not_ignored_files}" - if [ -n "${not_ignored_files}" ]; then - echo "Repository is dirty or server deps are not built as expected" - echo "${not_ignored_files}" - exit 1 - fi - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - run: | - cd examples/server/tests - PORT=8888 ./tests.sh - - - name: Slow tests - id: 
server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow - - - server-windows: - runs-on: windows-2019 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: libCURL - id: get_libcurl - env: - CURL_VERSION: 8.6.0_6 - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - - - name: Build - id: cmake_build - run: | - cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - - name: Copy Libcurl - id: prepare_libcurl - run: | - cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - - - name: Tests - id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} - run: | - cd examples/server/tests - behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - behave.exe --stop 
--no-skipped --no-capture --tags slow