From f702a90e245499283d6de0b287701c723cda2a87 Mon Sep 17 00:00:00 2001
From: HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
Date: Tue, 25 Jun 2024 10:44:48 +0200
Subject: [PATCH 01/50] Update control vector help (#8104)

---
 common/common.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1dc532651..0ca7b4430 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1538,9 +1538,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (implies --no-mmap)" });
     options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
     options.push_back({ "*",           "       --lora-base FNAME",      "optional model to use as a base for the layers modified by the LoRA adapter" });
-    options.push_back({ "*",           "       --control-vector FNAME", "add a control vector" });
+    options.push_back({ "*",           "       --control-vector FNAME", "add a control vector\n"
+                                                                        "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*",           "       --control-vector-scaled FNAME SCALE",
-                                                                        "add a control vector with user defined scaling SCALE" });
+                                                                        "add a control vector with user defined scaling SCALE\n"
+                                                                        "note: this argument can be repeated to add multiple scaled control vectors" });
     options.push_back({ "*",           "       --control-vector-layer-range START END",
                                                                         "layer range to apply the control vector(s) to, start and end inclusive" });
     options.push_back({ "*",           "-m,    --model FNAME",          "model path (default: models/$filename with filename from --hf-file\n"

From 3791ad219323389106dc3fd80814eb5bbb7b80de Mon Sep 17 00:00:00 2001
From: HanishKVC <hanishkvc@gmail.com>
Date: Tue, 25 Jun 2024 16:57:35 +0530
Subject: [PATCH 02/50] SimpleChat v3.1: Boolean chat request options in
 Settings UI, cache_prompt (#7950)

* SimpleChat: Allow for chat req bool options to be user controlled

* SimpleChat: Allow user to control cache_prompt flag in request

* SimpleChat: Add sample GUI images to readme file

Show the chat screen and the settings screen

* SimpleChat:Readme: Add quickstart block, title to image, cleanup

* SimpleChat: RePosition contents of the Info and Settings UI

Make it more logically structured and flow through.

* SimpleChat: Rename to apiRequestOptions from chatRequestOptions

So that it is not wrongly assumed that these request options are
used only for chat/completions endpoint. Rather these are used
for both the end points, so rename to match semantic better.

* SimpleChat: Update image included with readme wrt settings ui

* SimpleChat:ReadMe: Switch to webp screen image to reduce size
---
 examples/server/public_simplechat/readme.md   |  37 +++++---
 .../server/public_simplechat/simplechat.js    |  87 +++++++++---------
 .../public_simplechat/simplechat_screens.webp | Bin 0 -> 21376 bytes
 3 files changed, 72 insertions(+), 52 deletions(-)
 create mode 100644 examples/server/public_simplechat/simplechat_screens.webp

diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md
index 2dc177825..21410199f 100644
--- a/examples/server/public_simplechat/readme.md
+++ b/examples/server/public_simplechat/readme.md
@@ -3,6 +3,13 @@
 
 by Humans for All.
 
+## quickstart
+
+To run from the build dir
+
+bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
+
+Continue reading for the details.
 
 ## overview
 
@@ -14,6 +21,8 @@ own system prompts.
 This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
 or potentially as it is being generated, in a streamed manner from the server/ai-model.
 
+![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
+
 Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
 open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
 
@@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
     The histogram/freq based trimming logic is currently tuned for english language wrt its
     is-it-a-alpabetic|numeral-char regex match logic.
 
-  chatRequestOptions - maintains the list of options/fields to send along with chat request,
+  apiRequestOptions - maintains the list of options/fields to send along with api request,
   irrespective of whether /chat/completions or /completions endpoint.
 
     If you want to add additional options/fields to send to the server/ai-model, and or
     modify the existing options value or remove them, for now you can update this global var
     using browser's development-tools/console.
 
-    For string and numeric fields in chatRequestOptions, including even those added by a user
-    at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
+    For string, numeric and boolean fields in apiRequestOptions, including even those added by a
+    user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
     created.
 
+    cache_prompt option supported by examples/server is allowed to be controlled by user, so that
+    any caching supported wrt system-prompt and chat history, if usable can get used. When chat
+    history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
+    wrt same, based on aspects related to model, positional encoding, attention mechanism et al.
+    However system prompt should ideally get the benefit of caching.
+
   headers - maintains the list of http headers sent when request is made to the server. By default
   Content-Type is set to application/json. Additionally Authorization entry is provided, which can
   be set if needed using the settings ui.
@@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
     >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
 
 
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
-implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way. You may also want to control the context size enabled when
-the server loads ai-model, on the server end.
+By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
+the implications of loading of the ai-model's context window by chat history, wrt chat response to
+some extent in a simple crude way. You may also want to control the context size enabled when the
+server loads ai-model, on the server end.
 
 
 Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
@@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
   internal n_predict, for now add the same here on the client side, maybe later add max_tokens
   to /completions endpoint handling code on server side.
 
-NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
-wrt the set of fields sent to server along with the user query. To check how the model behaves
+NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
+wrt the set of fields sent to server along with the user query, to check how the model behaves
 wrt repeatations in general in the generated text response.
 
 A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
-using the providing settings ui.
+using the provided settings ui (for settings exposed through the ui).
 
 
 ### OpenAi / Equivalent API WebService
@@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
 * the baseUrl in settings ui
   * https://api.openai.com/v1 or similar
 
-* Wrt request body - gMe.chatRequestOptions
+* Wrt request body - gMe.apiRequestOptions
   * model (settings ui)
   * any additional fields if required in future
 
diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js
index 25afb2564..8e0df3b61 100644
--- a/examples/server/public_simplechat/simplechat.js
+++ b/examples/server/public_simplechat/simplechat.js
@@ -222,8 +222,8 @@ class SimpleChat {
      * @param {Object} obj
      */
     request_jsonstr_extend(obj) {
-        for(let k in gMe.chatRequestOptions) {
-            obj[k] = gMe.chatRequestOptions[k];
+        for(let k in gMe.apiRequestOptions) {
+            obj[k] = gMe.apiRequestOptions[k];
         }
         if (gMe.bStream) {
             obj["stream"] = true;
@@ -740,11 +740,12 @@ class Me {
             "Authorization": "", // Authorization: Bearer OPENAI_API_KEY
         }
         // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
-        this.chatRequestOptions = {
+        this.apiRequestOptions = {
             "model": "gpt-3.5-turbo",
             "temperature": 0.7,
             "max_tokens": 1024,
             "n_predict": 1024,
+            "cache_prompt": false,
             //"frequency_penalty": 1.2,
             //"presence_penalty": 1.2,
         };
@@ -800,51 +801,55 @@ class Me {
 
             ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
 
+            ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
+
+            ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
+
+            ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
+
             ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
 
             ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
 
-            ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
-
-            ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
-
-            ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
-
         }
 
-        ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
+        ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
         ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
 
     }
 
     /**
-     * Auto create ui input elements for fields in ChatRequestOptions
+     * Auto create ui input elements for fields in apiRequestOptions
      * Currently supports text and number field types.
      * @param {HTMLDivElement} elDiv
      */
-    show_settings_chatrequestoptions(elDiv) {
+    show_settings_apirequestoptions(elDiv) {
         let typeDict = {
             "string": "text",
             "number": "number",
         };
         let fs = document.createElement("fieldset");
         let legend = document.createElement("legend");
-        legend.innerText = "ChatRequestOptions";
+        legend.innerText = "ApiRequestOptions";
         fs.appendChild(legend);
         elDiv.appendChild(fs);
-        for(const k in this.chatRequestOptions) {
-            let val = this.chatRequestOptions[k];
+        for(const k in this.apiRequestOptions) {
+            let val = this.apiRequestOptions[k];
             let type = typeof(val);
-            if (!((type == "string") || (type == "number"))) {
-                continue;
+            if (((type == "string") || (type == "number"))) {
+                let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
+                    if (type == "number") {
+                        val = Number(val);
+                    }
+                    this.apiRequestOptions[k] = val;
+                });
+                fs.appendChild(inp.div);
+            } else if (type == "boolean") {
+                let bbtn = ui.el_creatediv_boolbutton(`Set${k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
+                    this.apiRequestOptions[k] = userVal;
+                });
+                fs.appendChild(bbtn.div);
             }
-            let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
-                if (type == "number") {
-                    val = Number(val);
-                }
-                this.chatRequestOptions[k] = val;
-            });
-            fs.appendChild(inp.div);
         }
     }
 
@@ -870,6 +875,23 @@ class Me {
         });
         elDiv.appendChild(bb.div);
 
+        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
+            this.bTrimGarbage = val;
+        });
+        elDiv.appendChild(bb.div);
+
+        this.show_settings_apirequestoptions(elDiv);
+
+        let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
+            this.apiEP = ApiEP.Type[val];
+        });
+        elDiv.appendChild(sel.div);
+
+        sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
+            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
+        });
+        elDiv.appendChild(sel.div);
+
         bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
             this.bCompletionFreshChatAlways = val;
         });
@@ -880,23 +902,6 @@ class Me {
         });
         elDiv.appendChild(bb.div);
 
-        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
-            this.bTrimGarbage = val;
-        });
-        elDiv.appendChild(bb.div);
-
-        let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
-            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
-        });
-        elDiv.appendChild(sel.div);
-
-        sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
-            this.apiEP = ApiEP.Type[val];
-        });
-        elDiv.appendChild(sel.div);
-
-        this.show_settings_chatrequestoptions(elDiv);
-
     }
 
 }
diff --git a/examples/server/public_simplechat/simplechat_screens.webp b/examples/server/public_simplechat/simplechat_screens.webp
new file mode 100644
index 0000000000000000000000000000000000000000..ccea44396051686b97220b0f5b6b9beb63706114
GIT binary patch
literal 21376
zcmd?PW3w<kur0W4+s3nP+qP}nwr$(CZQHhOYu-6?=F|NN_e)l~QmJ&WbXB@KOHo2p
z)LjVxKuttYUR9o*!1}-Wy*XeGASD6tJ|J&`V763AQ4uNe!X-jI5~PLgH%jM?u!G$V
zFF)=>Zp<y82D@yVBrZNd9`Q5a_mk|J|Ed+2`))2Kj#!z;9CoaC;W1!2_(%CM4*+$(
z5BN{p;+{XB^&jY0#E;$*{#dWs*W6p)R^M$O%b)D2UOkVbuiYE*^WF|0@vo)t)Gzth
z+zEUU?~h;hAL<9qBk23v<J_a(Rv$qxTkl!ZljNVqU*0!-JzZ@c|6khA*zntD-*w+f
zPsI1v=iXP{z29M<HE%y(lAqY0#2fG<-}~On-l?Cj51DV;&)3h}TitkHJ>9Y2a^I$(
z`ybnv!*kR3*>B&I-jxrnZ_KaKpWDZuzusBi*59*V?CbL1-(S`~`R#x3-3Rs8&iD6s
z7VoWDagpp<-8MBF*nLj*{|y1v<5(wIC}OAKR>4IbvQjH1y}Hh*X9BwRy)()L8~%Dp
zF{O>buw4#_CQXDck%u>S&Nft^j(|meh;+5ovb+6kc`qiYDOW`Gt+SQz((DUwLzB`C
zV~fG)TXIeQSAJC+@vXB7S!e1wD2;3mw$T&bcKea~v5Lu$D)2RgF=08th|IRLU+0ww
zz@VVS?-6aoBBrmb9CQQtZq*MZx0fC`<_=0ns02Ev7s7$LtEZq@_E6O|nWW_8ZJSrs
zg3{9$c^z|!VfloLU{edq*-d{eUvYII6fsz0<v9mYZCKE<+5r>VDVYbm@&`ti5kC4y
zR{UNnpk_UTISkI|NGOh%q5?A8s%;6FWQ6=7TDl*5Tz~p>h=rTNE%3l-k<6W<H;c8)
z+ESOppCE5oyTZolAs{G#2qhGf3M1n9;NP8cT(oWHLl1!u7)P8t(_c8x(NYo=@tw86
z?EW1@&3dDG;b+^Y-0!?MgM3n}-mMqnk5$7|FHGLGT}-WAje+z!f7461^jYKHTRWmt
zX;6Wt+KM)iPOe_{AzcC7cH{TP_p`DW*jZ{6Sm5w4$51)dBrly+M5}#ZBtg|qn!g(2
zhXEGfJ|e0{Iz1BmCme7cf!#bOSN@?>*?Tfkzi>mzheD$E8>NExS+Vnk*wn9pxb$Oz
zHBi)bO;el*Qd4cVbOkZFZ868j{Y&2}nx1`Ot}qJyH?o%+&7UK=X-}9wcd|~;XMCPc
zPb~yTzyL3yvXi2Fb9&PzZ#^7yOW8jcbF<4q(J;Up?ka1yST&WybH)~A7e*0YRD^TY
zJ-zZFS0+$p9>>2*CMJe0z$x*Hk>Njeaeb?`1X8081kle}z9EIlrfAdv<kPaCrxX|J
zWFNkybtWeQ>LoZ|`R_eXfw0jdbD%W7OQCJIQfG9Xg}A<B`QIiVEE$`7aSzY8j!#)n
z-Hso4CSog1(NA~+oi`U%O;iV1dpK_8X{v-AdPIM`w#OPC;tRn7jqNhr_+#WKKwB_<
z9G~sjoN{pp$)0c!q5;~!UM|22U2&<8qa$tqR+yv)`2N0hts#7EK`dgGNJ->uy@8<h
zV2ayojuA}W(cxfSUE?!guGbPugZa&RklmHDzJ}2}2Z61cK$(l;*htXIuIDuIo$$XD
z49}jhe-6KO+j{nPjFE{Z<Qj!pw`&WacOeG0Q@x`js-?K?Pes0VnJUQ%8$Qx=CZVq3
z(Hz9LO!{xUbPe_x`~Bi2%i%r)pI<$GqjYEA6$g$$3i4D(9eSV?BmRc2^+Q$cv&oN}
z&h*PFxIYv3EbzW2^2cbsAeWN$>weBDoXp#R`%p?mniFLf$EgXG{bkiG&)Xp;5v_xh
zbx0I+M_}H^E?|b<eCm)YmR@KK>1u!D<wF|Y9k!K#t%=8cMlBq%{2=jwh!rrCcy?L;
zw$!k>;U7UzwOK0O6Gm`R`6T6hv6cTdPu$sh)m;OiJgEw<5#jq>N!?uL;e?(lY2xoP
zsVNcfeDwkiE7dYuW{(D+1v@uQ53@yeKFW{lU~qGMM***sAdLf!sj0X1v2dY<2-9eF
zxZ-Y>=x>X!VayQEV7U+)gxp&`maiBw$&?ufOS=I;(#zDT0?o0ywe+o65>aiN08-xx
zd3NSF_{W*DYcdL*gJ^5oJI@SKdH`U9JTmOq!QIi>ztsO3pES<iWBUn<KF5q|Q5;yu
z@rHbBC`Jf=xkALjS72-U0$Jaq@F%E)fW`IXd#00}f$MJ>o)R+`azkAh;))tdGlu(q
znD()4IS%y77Y5~V)+e@x8l!iCCVs|&@rD!8qCauC+PY*TL0GC!ue;V(uneAa2V%MM
zKrB*s{ITvtR#sAX7YRsn5e~s~o%#L9PNO-%tOdjZ$utIWq1907GH<iJ(mI_D29`8T
zJgk#Ne5i8Jaw>hucfg^smXWvjovbh#=&j0&YJ<_k079oD$ZL@LOR|W2o*HXbEGVs*
z4J+2jDI{iJIn4>r1ScW0%$*1xv#W1r=N|s0DUX`5E#gQ%WFo-;{42Nv%P#K-D@j+|
zsK;&N<YEPD3_lMFptej2wHt-vUDN(mrWFcV7ed1u_3gR##+imbhV;S+=`#i}X_#Rk
zYMzRlHtzTXQlK_Q8<}yUs11SOm$xm#{om8}>>#Ne<@CLuJ$Cudl|p!9j9Me{H{bfG
zCNEPdsg<$0Nl)mvf*&UCunN2{0!=%<%eBLxv6Ygx^&T%XL_O?}=Lb_xNzz<*aQ3*?
zp`=UgUku1VV1ma=BsW_;`P62*viYS52S=BWU%zymKFl1}E;;9i)(V|y1&wB%;y<%v
zd`2~7d_8HHpRohBOa<~%b27V6@1&H#en!RWM2mAPD(0Wug*m47SQJ_~czXXQ+SHxh
zyaQUs4FamebLeJ>)~wa18L2ozbEtHu7vv_D^piND-I2$+x&7ShF>0UM|MgiJH6d`|
zAU_be9erob?%2rA6vAI^ISH&r$_Q9R$pwnQ>7xZ9%ZkG=pVmMKY`^StQ0+fRtF($$
zQ8e-G>H5T>wvm;yYgyw<b;I>?d%aGX(_+D^i;HE9jT?3Dby-<P7B%h4h#Qf}Vfx^8
zth01@TRKg2qsTbV>!f!|!xKFIEZ1!nuY0(a|K|E%O7zfl(fB)DM@!TK9p;VL<kxOw
znd+o0XN0l(oGUh#`};URZ*G_KY>*}2DmIQyx;$APAc6<oK9wB-_lQ?E){{q~vheqy
z5rrz%g)p*|lD2MK{@*;};i^!-UzV{4k!sh+CMcErT241$1b!=Zo;CyldL9vbs`gj?
z+Wx&z`f}!I&danvZCq7H%BNW333+x7N_>a{<$y>jFJ8WCRUFrFVfZhd|9iyqCPT(O
z-4tsOpbB*D1PrV`5px^o_x;qPe1&W!7~SM2cgQQu!n6N>j#3DRL3&mB{%4zgLrXs=
zm$3Rwk+rco6>QwVk_(!eeS)VHc3Uaa0wi2`KKwiTKBAV>nYL9PK)7OZ2{MSd_$<5u
z`e%Aittwc^NUN-ssL(8eJ*^{1-KQyj7cP9gT^&zJ;0Zdcb*ulX^yTGC`wVODaf!!j
zp*6$3Ip|=v64w<_X@1VfZDq{1`O#Y7k8Zv(I&`fs%8-G`vGGUqZ5X&Y$P+$eQK0*T
z6n=~RR<{je$vz-8S6~nXo&nT~Chs*}fO!X9?@jP<O#@qZH_ohGj~YB(a+nuk_Rn&a
znqj1NBroksWq7dctdylAoWGzVcIq`D1?-e7LK-A`aYOwn8=x%nfMd$$@GXJW8<_$H
zSwaR~%r_D)KO5G3Iw*Ws?C@TAg`wvuNx~IxF(mdwg3E8R-`^X>#NXcwW@0A93^%Tq
zfZP<X!eSe!$Gln&s7L=fl`-=CL%u)2@1LtiXob9uiiFK#zHXo`{!a`doCmR4^rPq;
z=5b6e(*!oRSt6_Z9EtTquK30gcS6(X|18S|mVgs~2Yj7zBktXqVPXVP9i(0Zm~945
zZ(B5vc^2q+$r6cL;eY#k_&oxi{EdUB{Sn~l{h#L7Q8V(7B{cnXnSBSIF)1T$v>2`2
z7o}X)+AXJau~qV=HvKWs7Se#NHEk=V+@TuqU2<AnD6IJ}|Fv^uLwkuyH6?Ec((2>k
zfh-0HM6tT_&2gdUBx?%E$D0K9=@36)#TiL({CBc>vGTrHee`DYA@YSidpIX^#ekKz
zZxYYHmKtzP>f{NG6xvpHb@R}~$~hE5<wuIgWk5>N@&S*q0td5_PVr<SLhu(L;9i3G
zqpprRxl{bsxbEag(mbgCo_3p~p&jX>X1{JxA#)L1Ch!%Ou;hc(LUFh+F7Fs%Y<;ME
z{h4{a*o9RN4+Zpp>QMWR#}@O}T_4FycGJVT9_L(9xIm48%=k$dT+uPEbnV-F6*v43
zC+&bvB8p0)?ErddAS*?9&pmZoSo+zDK1P>{JywuOG$HNVA>$2C%XX|hn!<U4SyyI8
zex1MwH%JY#DelYbR>3Eq&HSk^uf^&%NtdC(;AnI}9Lft`NPtnQlA-W)f0kl9%DUH5
z+Kri13S2p{;{tvsr21`sa#9LBBbayiilp#9T9id86!!9ZLZq*0L=qF69O?<1orB6j
zw;e9R{rL4WIZ>e(P*|vf3fXQ^A4M6w+gakH(NEoYey{MIIW-9n@vt+)1X*h&%^2;m
z+yn9ft#y?1+X;{r!w13Q8z_D(*^m_l<iF13;M&x(r|;t>UT*Y>!8};7r$x4Oif7QI
zn*ck!KZjzHsgTwqXFbon7KPHo0c;noSkK=0YCo2C9|!O$^+mW((fQsmA>-<JYfBFv
zf3*>wXMFjOA3ua`MNisL@EMo3Rw6wr3Fm%Yu53H`YS>sPSIo;4!M_?=cFP<dTHO4}
z((H5Nj)CWl_Qqca;xSw*V1=wuH*NMG&+@*E$@E78>c@{Jxg?dwGZKV~&Df<WlR$wc
zyxfjIxaPv(KG<7h`l+bXI01w@Z}NhuVRnviXT-jjybO*ZCmOwF0V;^r;6hyWmw*-Y
z{QwLW%|@bm008t;L79eDedA)AFBQ~*DtSdB{*cUQ0_d`@R6u0(btz5W{ckNq2*u-B
zzqhh9PL%A#y@oc$WS;8JtA!Oh=}bq1*kClKX@t0BDfCz1KECPd2eKUyD#gVOor$Tp
z1V#c-{}S6ZE1dXdvv8>HO{Pb`-$xZkX|}O&(NAhytPFijjQv5p8ObQ5nkxObIGkMy
z<>lLdjVx+kf=RY*og&?@MIAPMac?`5)v6uLwhL{0nm(^2Ln&t^?*W66or0NX2*m2a
zj}aX47Hgw|PP26WjBQVLRpbrH%&|B(yg_ev=W@cwLh~U^B;wDi*mFVD4_3zJch3nx
zg(#}mhby8D1`>K>H}1cq_XB!yZPp6%Z6o^Mv31y%l$6N`i->7w%wV%7_q8RK-^i=J
z!RTDC*uN7=gwflEmXqUj$91-i_V>r){zdHwZ&+-(8h_RZ*RZ9ZGE51Bit%2ZQPNRg
zmxYb%+o92A@(HkUZQ!DJw&%OO<V4EYUeO61(#xqcMKDcIaHD=Z2SoNBcO7iY`aVcS
z9`&tOw8XTXw%a<Yp{n7k>_gR7Oj9WsfzY5w=2q)C^&dW)>!j_y?EvZ-PHS&Ftn175
z_aW@>hn1mLy)^>!IaA!4>NU;`_4QK}3?z1jHS0=rTr)@JaG=(1oz9FR5t7YCorI>!
znpN7}uw)y<&H_p<DOyZaV1hOZHoPL9zZ|}a1EdW|s;ia`&v}^Ccpxk&p+yAXc2tKa
zdq|hb3G}X;KT|Kr`1n;uwq-CFv@|;t(@*W$HT-pP<b%$<LI?Y@kk#$oj?*!>eF}sn
z&s(EH$Ie@VZ#s|igWWmsHJuqU9FzlLDjIWK&XNb9vAZ}tvvooM%1>IcGEf>}*c;mw
zwR-arvlG~bX}P>|z3_w5WtUB){U+_aM-9<>-e+JUIWCZl*D)P2&q?vHtlIY#p4zL_
zLaaEXD1tEV<n!fM8Nv7jl*vIZ*E6M`@n5$*dhAdldrE3U#R~>bGr91sdo{VplvnDu
z{b{`6{a<h^0Cg)PT{(x-dXtH>q#pT<?3|wqm+|l#Q>T;gmtL}|S6saQFg1S4e^pM0
z=b`k<&Yu7=bA&o>7h+Dex4D_;`L6ogQ~lJ=<<~%^_JNBXCX<lLUX%kjae+}^ZjvT>
zr3r>ifa6f^8(r9Y@H!aD5x3U|dMEX{A?bNAt!ZTZj5Rw+vy+m2XhmsqjW&5F3yhyg
z1OI$yqFvl6j?(P$jrFeT#}_G@+JJS%!vepW4|OO&+4yR}bvd}pdJ>{HyDK0J$OVqj
zw8Y9L1d!I5W{|J>L85<8wux0GUJVUNzjLJ!oSvc+flAyJMO9pu>Dt@TuB|)10o8S4
zuN6s8gQT`_!l6ZY%A{O!^7z42`zrlbIv$*bFeo{F02p-|NO%hVRF4JA(q=+4xyZs+
zkI>vc7u^Y0H(`j}GJ#wsh{yYAvYF5f9|{<Kv?%r$+}nEDvwM+2lXSFsL+Wzd5?o*h
z`+-9dFM!~jT@L;V7u63G%@;KP#6Bbjd$s<iByuPbYDvqVk}%PX88lpCrC!KUAtrqa
zDy)_=hQ;GB=K@~w1fML+VE@r-3t~vjyN{$UTs+DPx57LLe<1)8^L4mEtR(uRN0n?#
z0ini56b-7&mz{W#LBaEyHVu+o^p1V@bi}|3)~mXT75?i2ZwxLn)`av3(Kih;W@2qb
zIAspShO}7h05Y3Qr65`eS7Cu7^O06!?`*s-$Bi9f^jw-?>Q8hy1a$RR<Z%9gh_ba$
z9?L)5c1aF0Ca_f*mwPT8%xXRzcNe?n0H3nD9BR8i@Zwv#k{fZ>a=R#i6ru1;H{7r~
zzC*>TuK65r`i37FV32<CTP>Lb)4M@ik+N%HEEl3LIS>9Gq}BsLZ`v@CS0G!O?_LqM
zcGnlF5%`|yS0^95z)dgCNMs0-TtL7wP=~bH=zB>luHxroq*yj5be`$R1;Y?Bg^KIz
zicRkdq4+Gu-LJ-)Z#ro$8oVZ}1@sX)-Zl^uU%w1DZ^L;yX`1jtS^v10o0g!tShO!@
zX88BSi&KP!vf5Zz-_r*NRp@!x+l^Swc8G2u%75W`zv*ZJu^OLrFeF3MFVX&93>ESF
zuR(_{V8Bo2uh+p<d-%SYH}bK0RD@*d`gkEDSpLbQ<i^;1>)mHH{i|>yJ9*Tt5wz4j
zU9rfa<rx9;$pOT7fM$~g4VCfdOL$QVJu;Zz!2i6}qP~gccH5MAgiTQN-Hw(l(*+@{
zDN~Gn`L^nrpI$Q2<>#@8C6m6cE^-Z8A>JSdcy3z=jQiht$`f0{Y6crYc=o~?g3nqH
z)jSuFWJWEDP`O&jIQW4G4txzJ7q+du^*R!y^g))Y6wJEo!9;wGG|<jy%rX3aDx4VT
zssk9RU-n1aF2kG}S2HS|3MJLx{QAQbgx&->t2`MJVah{duz-Fs)U3b@lQ8j!7pNZU
z^RX4Tx^zRplx#l5Qyu#Sx77_{@{bi~H(pyA6A@dtNiPT+YTyqpDFrd4)TT>@pPJNj
z?p9;kW>Jyn!<u@{X_Q1DSp-H=)3bnjO3_Go7NA+G?-j*{lfL<SDxO<(g3iauskPJM
zz3RItZeq2ejZQZRypn^2Lc>_$%ep}uGKrbP__I`+PUQe}%{X4W#6rmX>e<7_lv?rl
zG6bwr(3wunsBjyOTE2!Yv~apWE4@x}-C?64yc-iiq*D@yX`@+2Tr@uFd|g_4k=pEj
zUfsPbmHc)ztdNf<Bjvuvli36Ow-auj09TzLbwy>)BNYd4kJ1hvix)b_*%3AOXBBq;
zNyYO9VRv?hOl3ImoDit3T~^7dm}6bddTNP)=k5=uFZujacOj-Pf<;5^8}(>23SURH
z?6b-$JkA)w;c9EV_{lN0)ez&qn}&b3q_S7aTQC(Ldvz9UKS}#lZSqLKQ9*r{c_!*F
z3nUz(Eyx8vXPA->>^%mM2I)5;XFuW|$^h0~Um#Tc%>ziMdx@2CYD%xqWaMytzVea1
zuH->Ksfb?<(~SnaCDAE|TvpkSHK(i~cfkpPyWr&1G*xMzge&#%n{I>k`Zz}Mh4z+X
z*&8Hlf0wSeVx2H2o^=`C%BFj%lNyz2gs_R2Fd1VGXm`vJuqhL_v%PxqPdkzFp1{wg
zBw8-*BB?nE0xUQHtu<URHMtrMT_dcsK@U6252RD{GQo$c931Xy)e95QEFdFHD}S14
z+?amNS}HWX%aa8#gK34xPY?~>2)c77DFVV-a)M7Phz$n;Xf5DR<be8l(8aEXiKgPC
zuvEhQ79skiZKi*#l8H3{-{WNp`5{ZJr|uXAu<4Y%zqg!*w<u+o@os_jR)TY>n{m+f
zneCu2=t&MCxWq=4goWEqbE$Kr*GfibWqb@IZDkUJ@$<t))9;Sa=1Q8Ay@Rjc?358#
zk)QXX^?TtuA(+4=S3?#P#l)I|a$&H{ZGdD)hw+r;?}a1Iz2KlfY`-5P8&1fB^Pg0c
zOPDB0a!pC1riel_r)ez-MRo^onnt%#WR6OhyW?mXhX#@{qigg0<TJyFUm$Jf%j(Kg
zZ#T#i)Vh?2fT6&J8>NHOYh4cMaVR^G*azM1nHI}}c(WEbM@pS_3qao$<a&FQOd3T{
zpYJvHEtp`K^V~nf^hSfR5qeA>xE4u{5D_!D4&Xapi#@>gBo&zl!ki)Q+Wx*M3jLo8
zNe!(hU^4*($+MU(#`QB{wm2D3%l$Ol=DYmAkow2=I%Wt4Y9Di1K1Vn&PF(_uFN%fT
zG_Y|v^&#JhP9{JGM3{P*|Af(?_^Xfcm6!#&)#_Cq^khYwMuMj>aGEcH@;IMjlSX?N
zG38d9ss^q+ofwI|Mz!{Z9MtI+78MielY7cE(q5eVByPGWd@4EF>Tq2mcg>sAiOdrs
zx%h-pBQ&|yBkIvS%x)WV(pbS_A`H&lCs6ltbxp^3=s@j)4K^r8$yT=d`Cr&jQm}D6
z>2yM|9T-L3aN>kJTsEr_I=2I^f^#tRVr>0zvs*l+WBC17*m+$xw~eJ7Mm#}N_nRex
zv5i1CC4Z^N2;Q6+A<8(IQmx25(%%Ojbf7*wB3Zm|AwU1X4O?Kx`<L0)iim1K^H}{5
z{>B(Ko{>ABi6%2E_K2F2u2y&!j?@~Qa(x+x<|-Sy;WYDP60s*Hk~ho!xu5A(Pd_?!
zjom0ecOop6(QEFO$_{<txh4tI4#2Xo?i{DB1cbgahwnsm^dF`j;b^QZKw@<TTDzHu
zVz-b&50!6QWa=mLaz0;^{5E)@x?BJW*xR{YD_&LNbdZFStrR%XcO8<;kfT-+QZZxK
zb^VsZQux89`sxLnp`{UNZ9XU|50wvg;)9#r#G;(=(cb=L9bYZ2lazx7WWxL%CTdYT
zhqE9yi^TG4PR^Y)`xdK#8ccSsbF(<$6Hc)X`a-aO;HI1mC2k;;haT1)k!>%o?XwM4
zGiU&!kV-RTl2zMRi*d7E%2gkd9#pZ>Ogx%Ba>+`7u~j*`NBu$VH<stN@u|kSuy336
zs}<2lM-<pd?<zrX{MFCH32qpC!#YZokC2=;tUtR#rAD<`XzNvq+Wu<G3TjBoi_<bx
zQ$jc~S#79i#{}%b(-8I%m-*1bdM*}@4<Be*4eV+tZ4bBAPswpqSbb?p+ulM58nJ-J
zUjl>hrWTIF(4G{5u0@*FEHqg`Fw55LUUMJx%^yOhOc~V}F?>>hYXD6)OhtIK1Xvl|
zA|nu=nn-qh^T$4fOR;+&xDC0huOinfDhp10fsk1P$P2!*fBzHh$#h*h@at5$25sd#
zI4Q29r#`<F!9;y?Sil&HcOvsNHFbaHJks6+TgCLcU`*H;;mra^nPEY3aZ7KxluE%g
zt|S)TTB76J@u1gp?zl4K_V5sc{TC!{UwXh(B(u@&z=+vtyHsw#*-At%86k9Q<EZP1
z`OPGE{GkB57yrWWDVkS<<eUP=sZ}la93+q`X^kNYc;WS%+ukA|RycoX-yov_Z-39<
zNmB{o+S&kxr-y=SWMx%~<Qw=7BB*%aP_hG90jkxDWRu$7TOH(4JXv5c2Jed}GkE<$
zzIH|zAndUj=zBv}yfNU!{iLT9*rOrn3ZpSTT~eK+gY;C>wdzaqepXfOKlJ*NZ*UBw
zpp0BrjaWhH`YVoV`kv9%2b<Ht`vf)Tt_$-l;oe}=Ekioovd^LA#e6NIhU4wC%?51N
zg_EH{YbGQVijc*AIxf^0*jWma=tIO)Ff?Qzns_Nq*R$GWv1#CBdKd)*+nG-|zRx@s
zb2MXN_5|raf3@%ZM+eP%sbYeA2zk9SEn6~WS3<+uU%^{#Zr@=%6+RRMic}KGr!zQd
zqvd0p&N~P$+}Iuu<dnuk<7P?rC|BWM=X}cVQuiv*X7HG3Uex5q$>a+4tk@TghtKAL
z`3SQhk2~d^wQi95EyGoi@2JwuBt@wn7w+Tiji!eq?cA;6y!x<E`J-5@V}YDV>>wiX
zMAjp?%^}{qJllEV<H3UKmCQZ2e2gECD^t-9sOt$LFo12j>s7|&>00xSY7)tD$NW%6
zQ%r?NL>zDj(;K0wtJE3kqA0E|4+rv~=H|^DR1)M!CrsIwC$0OhC+_0W2$F8I>;+~>
zE@!psQTiFhyrDIxr<GM2_gFsbMpLpxf|1<=xP7x}))-j~BTBJU)@8;0D6}0;T_P4D
z6ph&Thg0k3{pRUWyP<i$Z%=r=y&n(}fVlrCqoZI<Gk!JOx@$i=y{cZPWBQMY?ecQC
z5#ZAUh*HmsBV-NR`^pZP)Xd-MXFL)FJNXiBS*+AFLt);U7#G%0i!AUezhmfgtfT5G
z()&re`px9qV!eE6;Bn`uF%$Vd=5LKKGcV!-hXu2x%kQGjuQmZep+c={DVVmN%qQ(d
z*|tQ#^N3pKEvon0O5yYY_vDlD@QPP=mJff4wv!w0vm7WUXD1pw45*OXkxdL&od+p7
z90_mriLBO*>lh0N;|sw2X*yUeut-KPWjnqp;5uMmtCD=9>;P(EF_TbV{5qfYp=5=B
zRLPGshU$^NY3&G|XX{dKZzG1}@E9mXK#%0_3QK^WvBJ7=vyhR<_Gds5tg>0toi0#U
z-$@v?^UZ|a11ZudC>#P0V-<h-U>AoZKhKb{toc%hQvSmalX(YTX#@g~GQ0)vmr(T|
zPsTW$gba~#J}2PuZ9_wo*b=v2Vu5ir;w%zwkRA;VyUALlgN{Fr)cP<p)qS`Gj4#gG
z48B%}U0xn?OR1D7NM~XK7f~?aVk$9MbgPs^jFvE&lV$Jo$k9w`&u-&Ey{{)cKsP09
z9(NrsZm%1d_Si_YSVhp!gQZIOkT}evRgmAqMxE*oiHQ#1A5hKB1=~RYb1E9cvWuY~
zDflp6`U`=@?ix~Psbt+8o$#{eC$*qJGqZjVBU1-DBS6Z8rp_%SgC{Sxx!yH=TfO8W
zkyz}x$tpIt_+ExX%h!57JmaTXzMyog2V2M1mxg#9w;0#tx~Rg3re|JgojnYZ-{IWr
z{sY#KlH~Lza9-%K460tpC&O3rCd4wztq9Q5ch<e;(|X%uQxLMR@sweHlw01V@F$XG
zxIzOuwoMtBxr&ZCP;rS5HSS9AOo-MnQ;q%ayQi>{LXJMoshWp}DSR-@Berzz@>gs@
zkN-%+W-mV1vZtoK;V-k_pR)Q@-ETi_T@CAHM>b0No>gYs>$Q+2kc5LKf;*&OZL(mJ
zvC|jn`5!3u_21jyF+BpF!lK#Z_1Yz5088?3@MHh<lpKkE8!jjW>c&^ZNd!Rk0BA22
z?%87$X57;r3J$N=b3TcHD%rs8^U%G{r;k?dBw&>;;=@1Q$+1|9UjZ`woyTy^I$=x6
zj5^_AlWrPI%<wM7D}`yE;5sXDp-zxlo<2dG)CZCmD<MbsU6&SmGwp-viH;Ty&Y?#d
zk>voa5z!(*7<mF{FSo?*0|KtsT^F}3(}{I<rrskSAgRv18(?)yW)Zdi{lfB(b>Rr`
z{!8+8T_Bqjxmzz*2;HWT88+gf(l0==L)XR~AHZ~n6x!d@C};5RSp^$4c#zmO4}nSH
zOmQssKZ0n1EZ6GXig2`n_2<*x$Z&3Cp^GrxM*XCTDsN;a3S^k?pfioggfK2uqCJPM
z(;NN5_L}BwG-CN<Rplb)%qp+*lX|%(v+AcY%mb?ZZ5n}LXlDYa+baMNrxLT2HXrs5
z=1v4t=VxmpZ;1t_J{>K{Oh{y8V7mb{EDWtT)f|rI$V@LS6BnfbD#-_&&t0@V5&J+u
zXx#$No?TSgZQvj{6U^u7LFFvvj7+ALPZ1IBHt45c$6kR?nBBlFw^x7(ti=>2NTBc^
z54n=Wn4oxm0Fr`9o{D{rD2KYMoskS<=IEe6RJFuSMbBN!6Y_y~q>``9yFeQvk2<gx
ziJ21JOBmT7tRzD0MRQ)E)~Ew+-<2RBsa_XUZ?;Gt-qVo%x;mUKwxnaC6t9M)Am1%E
z&-+zyCS?QPBfgLjpByqxCQrT#mLd7~qiSnu)uS*tw~V`_kKG6|ev&3nIgOF5&C7?|
zsZ=voJ0L{W$9@+BDZe5&^U0o7AB^7`9bfybBOE(=8seJv6kG=CijkKt$ZCj8c05H5
z-bJOF;*3K?yG)h+4DX$!S6th0oE$K}B*PK;V?BQlq|>+Zcsry7S^%4jLz?*Cm;lqP
z<J0rj($f5`tt0tX>|1javJY?6g;a@IakE{XY)c9PZV`*^U;I(x+;Ha&nv2a_oJhZN
z@6$Gb&hlB%1t;e14oN|lODxI$NH92DKt8Z4xDsg#{POTqDHGgPeRD*lD~9k?4qohN
zF0o;CzHX7i#4~j&UW~})a8U4D>u<12Ku3b+SPwpOYixAtzp(j(bX129e)uVoH^?2P
z?o;}L952bqQ85vGT1x98h3NcoLZ6K~9L26Qq&@Xu%A7Z*?j83(nEF6$k!hM$TU}G5
zbbB=BvvCaIiX(V+w`0-Rt&Xvu7lWh`O!^9<N27ZdEOTjyQhlq<>ZCHDzdCfAjip(>
zg%JNeTpXBoi~I|u3rbMsp-cnJt>-J4o4cvrS)W`t1DR;?@JLgUMoRUB`1?Vg5pYHI
z@8t-zJ6?C1Vf+~O?&+oM&w1X5_&kpEfo7o6W!U_wxq=D@HlBGeYtPAPJTp<bs+&HW
zZcFxpY|L>4)5d8mybL6OYG#c0J&G7L&Q)KtF$N0wg|N4nOh?)Q)KpduBLYw3eg8ON
zNa+)an^fYDZWDzUOXEprGiHKiOzpA#V9FULsy<ZxM39wWdHMSo9f1%EdX9K}aH&18
z>uU57q&(1YTTgk6Pj@hO5bvBhI+4{E_Q8EPkdlp4Z9+32C@Z^e&&_)O=`N*Vs5(=F
zsMmKUrOR{J@@+^jFdJp(xUAR=okW|R9Rtx=U4;W99u*b4r_3gszkr|i4_!*e837}c
zLr&T7Q4iZGfFG5tu)<CD#*6U9?6z?^*n|OVy%k)J0(tEYSnU1C4bdG`he_J}RCKRw
znvPuT0~>9JScuGPq^^FNXeE|;Q+~S^rZNzf;3?&(o6|JMo*_cpNzdVr19m-Y0sfFa
z7NwE-RUYPL0PbXUS!ll-2>t8=O{PkKH$Pqchr)%PeaPXt<_gAE{t#)&yaAF&pz!<Y
zscAPY+-gw*v@Z8Rb~|q8x!baWoQ<XEgDJaXi7E7$SnJ}MZ`3~#hk<Ws=u)TNAn}PB
zA=dzxJjt0TIUQ;CsItNC(rw$|8ON2EkFOZBoJI)y$mH0r9z}ELehGG_=~kwEv?(_K
zh7qr;G~!A)zyq_6aOeyjAOV8F6^$w)BQ`_I8UKi`>xC&36Q^;CiER*T_#n7dB_hZx
z`!G>+ud7lrgMpj%m&%3LwTa7mMam4*eYr%u5kpxFWh(*>?*~7#$vkZ#N-&}ee(0{b
zy8{((=#iaQUGsJ+3v7ZOqwW1gS}+aqYxCv)Q5@VfCXcgB2V^dD$7GyFaZ;_DSXN^v
z@ZjghK{_lEK`d7@w^m{gR?VG^Bf-oLVm$=bqLY!6!)wwk$pRW^4_IiY+VV9Dip&fO
zr1~8(Wlbwn65g~PCyTaovc~anHmx_<Og=1jeXE#1o+vDk`hR;yels^KG7{q5y-m8D
zim#&s0_RVD<_l?Zo>U>>oL$TOHsJtd_^EmbnCj{MCDbq7>0Wg?R=s6nDl$2Ls6m`~
zMEU#=rYr!+?bM6QC-ShM;K!KEe8o@7-KnWTch6041ug`0aqAQ}q^x*9w+oTGD1fJw
zteo}x261{>kxo)rSdx)eNH*9#0N9i9?GOCE!m930KPTk0r3rN>OmO|~Zb{AFtByMz
z4b%`HQ;6}(H`UP{BzQA((3N_Kzj?Rx<1X(!G34-GAj6uYw)=~}oLcv{j{C~$x314Y
z?w*)l=w3uPx5oV(a{U|G4y4Fg*p9gTs+%;%yn9iv0==l=B`s~fc$g#%-FDDGAeLBW
z*m$WCi=6pGCK@xuN5A5iI(w}cIM?mbAzS2BNwm-h=^CB6nMt4G(<v>qUTNY);t62q
zl@yTwsTJ6vBzE~emQSNA41IV&sK1)kMU>+GAl~@Jo^6H=?|4|C6r2!$-c4^*Ozm)m
z?k{<b{y-5)P_icLADT{skPv|15<uR6A+WoCuEMHzGPaDGU?*yLas4sjf96M@)Mt8}
z1-Y~mD!LR(#QDD>z|C{qv<cw^xC{K7!WNu@Rq2~m6`LYh>gE5Mg@{a6?TaODIhK~)
zx#oF@2)H}{kyB8Wdz{|k+jUTno_#0fprkmUhwOXiT-sc*<wVWktNN#n?V+OPMfAtx
z06*fm#(;|4^6@_L2h%;Sfp9-}jEI96!)#}LJf}-r51%AK8t!Sb<*CRApLFilKfp9W
zM65!qH1WfOJnz>4UM<(X`v5y1>aV2M*@S0js^VNdz+goevmFz{SGc9OLkfCD?B;{U
z_Ts>az{v{<geTz+Q}bc)-VJDa%%|RJ`XoM^w%Wa~O>L(*scoyCnIoZjfZMO%igc_;
zHR-P(`u97wO)|RKlkc07up@mI34|2+c$r@KZAnBF)SCSx?4LQ$tw2cTltB#&8<Fia
zG5wKLr6?B^L!;4$eHYgb`OM7t0abwNJJgH55E5c(a6fP*%n8}4+CI&Q4FZ%I4BG66
z-@bHlwB|wDR7h$gy9sXQo$>4A5lH=3p&y$dRFs7U{(jlFoZ>CwGLm`umLbkfN7NS}
zvc)_}`5RC*dLxf_MwBmpEK28EQR}XLDn(h%{Ct=zD5Kh?TVU5G-YE59SOtXqtkX72
z=-(c|3TfEUoD-;`CE=Y<GDKdWc+Ih3e7|B=edGW-?_A7{e#Av9uDnMuTn%ccV}Cz~
zEKd?PExre1s(B50RQ-F9E%B)$aSy4vwhwf0`A?w}_^rue*wInABsUFfg+Lr&7Fbb9
z#b>{|oWA5>1#qd<B{07Tkh@pE=pL}m_r1Wj77j=?Z-Yb_gwlJPaOH_YY}WRV(~c)?
zbUKv7m3#tzz};mQ#t#K9>el!PyFiqY88*<^ix5tI&(?(~x=|<6XYL@Ap>f6Tr{gk7
z6Jf$oFaqrN91yuoiaaqt6}t;<7NmG$z*zL{5|+OB<wb&|vE`;0=$T#bc0}+pmdT}x
zxeae-L28u|NX|<0pu`3BuURA9h#XGsY5*a}g)@Wu^ZNXj;?r%miJoT>KNBzVWAkfO
zzUi%xAbc(yt=pOzTD*PkMk|;lqdistub4W^Jya6NEpy;NT-f#fcU;ylw;1``R#Cbf
zJ6<>Z+2m69kfGT}qQ~F#!EAH)V!o<kFj{Qd2xshHLhpU_mTkBq9|bqp*^7+31CNm8
z0kMRg%$JjK1fwlqHMj2Cc-0D&K!@K=0zkE`%lf;3mJZ)_pmS*d>3rab^P-C4gc4cK
zI)cW@F2@AkDv68Lgra+DD?Jn`<zw<@6o~Sr{__zbBZ=efW5g@lmz``?*rkVRcQ}L9
zo0SafNldh@KCdztm*ShWf{Xxq;Sk~*KaW`(*u~tP^~bx?&-?Cx{EYMaZ$rtdlkh}c
zzAG}xVvO!)S|;K;U0CivE^{<1g|VFJWQ31WjyN~Q>AuWHlKS+d6&BPHWwQQ>p5eXh
z&kqF_TRSmUK)g8d$cn~p$<4(?v=~v}2>79A(mIvmwC&?hZa(#eC*F#o+)B5H{P=u`
z6ceXsL@Scae~>7kxae0f{iBsJmb?^Alui)3WJXAD45I1@gw+~ap12+!qr@}RHsR?D
zvbez|OI|)?paP2c=DYByZb^%bZbk*E>Lx%l=UOHbL0SK(7H&nwZv8p2hOuH!m<Cu$
zL6tKxu_~%lbaUz6=%}dwnkCAfm{2_hV5;gCpiHV<v^*)(<jMajg%t>egq*6j@fyHP
zm$7Jc$k1;R5x*XpRn~$rLd>w*P?-Rwq1Gtbi;kFon88W-8buQmGnu<Vm9S3uw@qW<
z%q(x(S+xh^<fp>K%vpL91$5N?#^G!wfn+;PG;{9m@~m|Sntg~mt|ah2ti`0v5=nHv
zc$rw?14zW_R1F!m^{NAlWG#kJGE{(D(+~p<2s`=f2etfH$jOIetqWs}l-a6tTH~-u
zo+Wqh*+2_C_}M~HoBUcjn5&%=2&~+Dm<O*tMFdjY5XCzkXTBmguWHg-#_>8P-(5D<
z#0Z{bNl>Ced>WgJ@e_zI=bAc4Y<h>;4l*pH*$>V_|FuVl2L*|p*m&q0=6lz2=eCV!
zKzt26jEaLX;(mWolB_hksvpLp9s;Znv&;bJf;E<Mqf%31(01cv)$doPG6yUaZ_QUa
zl63mCz#{L$%5k+_1<kGfa~cx_VA3Sq`yJ}f%D|tU^j(7PDWjw&(H^0?hjCarwuhSb
zO>eO}*0-XoU9uRQQPG68eUs#Gp19Dyr<fazQ97Rt?FN+!59I;wExCL^Pw5!_^K~P}
zmDQkwDfIoy*gmw315t*rwPEXcwtox{T@#|L_B&Sf=N8p?L810*B?$k0)mMzGH{8Y|
zM$44r%=TKv`oJn@B<D2ZeKR9ZEm9qztwtSQt54aCz<?A2fzbE27ETr<P5?p(@EhIx
zSl3EADP8SF=L*Yi&ZHK9S64av3Y=*26|GR#PfA%m;#FtH?`rx%T5pz`Ftr+#BEWf{
zLW2YQgTw#@I{sDTqZm$zivUyXMYK*HI$S+Q<@RUb$1dZzUUDaDnhr|_#{@lr?=;KF
z-BYOmQ+hI6x-RjoM@&-2Y~!u92EcS9Cyf3*&f<cs;EUdNCjXtr%#k?(7X7iVp&iwC
z)zj`=CTRq*S^oZbejbQSry^SbR9R&(gR)jlP_QvE2_WRqaD0!uL~=7S`}gm5EM5H7
zWqTo>`5yk1Fw-ocVxC^UMZC*6A??6KV_XGc-QLDTK{T6nt}rx?lpK1@HezmM|LD^}
zlWNN)fF4}h8_Ogu1h8(g5(9E60oO&sh&1=Q_FRm==yLpaPOT*;9u89*2Ym?u)>!?q
zfn4o~W9uK(HexQm*_#XWYSJt^31D;$(C2)az!-?OxwVH{w03jfgy|`HBg<-oZ`U|g
z`DUfG^wB`kU1+`G&<M)ZYaCo5)Qos0g6~iE)Z>2Y5}0+1aA-h<{mG9AFKT~2<QV;F
zMhD)P0`1*?*Q37AAf|-v<n#b)VNJ|^bS8w_TDi)gvKY%#x+Oy#cINQ$zv*7-Hl_bM
zuM?m+AsXC}l%#E_oz2b>?@elK>pC@{r8Tb{{2jZHQhmx?5{BfBv#;~+zX2tdVyjgA
zPc&DDP7pF1>_1u$SBw7Hf9yMD&jFq9qwi!W&|Nfe?_5ZUY4N2V4B2Yhu(NrMkq_5;
zhgnfBlL&~9ixHnj59UCK2(i8+M&J8vG&WGv06^FXZx7PRLGd0^6+eUO#){({p^TDE
z*S=DGRp7jX@W@O}1d?}pQk6m})thLrC|Ha<blX3vi|+z??PL|C+cD`*qr@&?VrpHj
zEX^u1dbA2dx~F0z6J9*AjBLUUbI(iN&)QGbg(1H^7(tAGnxdcql%Z7Lrqnv)5qjM5
z45P$eH^;|4Y>&rZYQssivyQ?deK6Gj<g-Y$EGIm2WHJe_BPStn)-ZWBUR;=FBDXe&
z4HM}E0os{RhVWCE^&%K>i$WK%(p|#ITrxi?KgMO~ane_%Ehf=MW%9T)uz|*@;sEG$
zlpj&vyZR#Md_2{aK`cDH?L9Xo7YSRfm@yd>fFi$PQ%rW@MEth5Yx%T>ps`BG;B(4r
zC$KSyZT4MVB-|q_-$%jBirHD(`L?wKUq!-~CIee~nmJ3I3~C~)H(Tgqq+5$Q8(pz*
zBkY4GTZ`NoC^Z%XzY;<DIyl*5gDQ!Wv49GF>0bo`UM%jpF?gR>z@Kns-qbfJ2TpJ~
z^H2vuO*23>IJjgFmK?%JV3(UDq-zboo=NYj%y$q`?bthSPaaGblb*I$E2j@Y+M>98
zJ72Mjl5uPXH?{4ovKYy$xB#?zN%|UE_m4`C&MFe5Lo|}wl2gm-NJb8M7jc@%b2WZq
zenLZyIR`cdXv;R->9#8c9A!6B{3+t7FNtRyNrWeF+h1lxCd-NKrAn?<Yr;?*RY3bZ
z{<dAIMOcM87EiR`6L?ucaO$4$XmX}_@Z*($d>Z%Ah&#@J8shryg;mggGNi}!{8Wwa
z7FRv?g*$&I<8R$&P?{?gd<q@NDiApsf8^9z&~5X`=^W&~`yDHsTA-zx)^a}aUAs!t
z4t+zBb2~x8Ie3Zih<C5dX&M0lP=OHY|GD9Dnn9OFan%FNbJwFvRTr9&7s1RU)fpin
z9D2%BH*cBj>if@F{E>yf{G?MBRWY$=)A#eq-$K#CJgJf|=qd<?3J_AYqYYCxTgyGO
zLop(<h#%au>Uf9&1;ey{OrY}CVvP1J8}8%Qd{X5x^+6HO?E&#w%c9!qA+swAV*UN4
zU}r-2Y<LnRVooZ3w`x(okVFCx9X*}lwHjm?eEH^1oAbm@;bvvDnLcSbP4Ma{0%}l1
zgnU0SNvc$pKzfyZu}^e55os(*xNB%P`AutA4jNzO!Fx)vgpfuc#T>Z}wok2J9a=$8
zkmVB;Rz-mAM+3-+-<V~e+5m5!Fyd$T&W{IqKK7juOnR9N^H=A=^5WQ!A9xQ9tps>5
z+yRM=B-_bab5l6b+{JYH!RfKWIjjN{<d?Z9dd1bf!_(&ZsR>mihB?#zklcZu^~NTl
z5>(<f%jn7#5I#w@<*0aDWuH<s8i?LBpvvo_3%FD3GB`AJhJ2F%Z_pWIw6r5aDRF55
z0|5Zy^GpW@%RGCp5z^Td^(^NftA1Np7<MLI64Nv_<4m&FLfiz@h|SMDGz0&f5yeKA
zatVR?^!mII#~^Wp98cYduFby6TQ6g}$<P@#6}EE$yaFf`Cq53T^_ZnPX9Vn9rLOOO
zAMu64#!K*Y0uu;-8nx@WM!m6DbK*bunH**4@5#Hysg`W)%<=<Tf1B_f^oB}T^Qz;8
zSb*r;t=8-#8F?%9%cJc}*2G>4It?sq0?@$`BU3x<{JolhOT*qah6CbZkAlU$E=d{&
zDlopv>2@`au(`1AumYEIEo?S=<j(;xo?F}@K}2dzFV;SlUuRDuM0S&rtEc^Dm{?Wm
zhfyjwd0m1B>8)8n;2SPZBC{RdRV$h0NH7GNiyM5UqwEfjtBJfvXC-?9q{>Z103;&o
zL+6cxqB$}3mW^%l_QJb_^{|$jg>=^y@WIz(mzV=c93K-^Dr<HC(5-2koB#JYlA=+u
z$4e^GAD6%gBhX~)d+Yq0RBg@AwWDeJh4&$MYID1;+5*3tvErnI$)#$(WV%IrTE**}
z=HRH<`s3@j*LValNjWb5hj4hc6>IXc;KWgGRJ#v~LFDRx{&wRvNo(iFMW$tnN_7>~
zn*U1Q+U5LVtP14;;q^}RhFYc<M;i!-+I#bmc?`bUO;V)s+#vm{mE}oE_sa(Z1<;kg
zoP9nvW;Di)^=9i}7YxU(H0}b$HTS{zw>79=(l{9)e!}UxQtOd6d6xpnRtHh!yKcCi
z8w^mOhmO(V_Y>93{J7wI@;qG}xY4(G=BoW+BUT#WQs7XWDtf#RK?X&F+Gh!S$78|`
z%Q^Vy)SGWCn)WO{yU`tjPWQygmF@Cr#4$hF$&S|k$P3Mx2r=l`1;HTZT<@%rbw_(Y
z|5a3<AnNVplT4Ze8!bYFB<%mQ^6LNL<pzm&%v=Jml@JeV1>mQCsLuA0$>v`TfYm9g
z#B@@4qdgH?OP*l>BQt-H+YAMdR?cyQXMKp652kQi5nIcW#>s{9YN+<xvx?JX>lScx
zFlQGe@T^rpoeKR6F2@w~I+9fyJ|ksQE<SryzH%O!ZDdL962Brn>JL|51Uzek;7<LG
zu^+BN{w-~|k??^{9W{Etg8+ErYM)0>0z1uMv$$9n6=&?e0H|Cvf>sJ;bMV|nxp#!0
z@MQSBkpZB>r^l9XTGq^p{MtH{V%t!BWhS&-{kVH`DWSkil})$d3qMKkBJj~J%E56k
z9=Xy|!ZS~<B(S&_R0~DO&yxM}5zq}ls+KJj7mg`Er-2!vx(6Lpg}Hp^A0L=EN73y&
z5{kFto5J*@^ll~cxfDewe!Rn9iyCr)=g_Rw%Yd|~Ib^ao1Ey;`Az(lm?6RKfa*Izo
z92vm**lc|&vo^k~ux4iENolo3jt($!s2|l{$navq1OQ8g`efO|f!A4Q`JDcgue0?^
zz<n?F%Z3M?ZKn3=uW$LX3NZ^eKxYV-EKa^dDPFI%?TI8I`tW^ZMUU?X;O@>$herwu
z)kWoXx0X~}=t*PD60p2iTs$a}2+GZHH;7wNj87D4Y!U^9ZH36QL^=%eS-|dc?{PSS
zn`C5M)USKM3<+9reW$6HASSNIP>}#&KZ;t?DT@5?5HT^tk6I@e;}PANKQIF){yaTi
zPe>T#Y@Q34N?Gcy6K!9Rj{uU1h6WbpsBJ~9MJNCl8xC5_Xa7yf7Q$i`paZMfSkiLJ
zqJk%?&_YcJo{t{?xzH8r$q$ulcF~-$SwK%g4e;u-g^EU*corQ-+~Gy9mD<ULL2$dr
zRrwyd1AR#4>~~C5#Kwp?tmB(b?aK`vqDI<2+7%Dz7DGpNIRX)}c*Ad}d{F66;1B#(
z2^ya$%^6<2GC{RE9lyyGraRWXJvZ<&mz}ew&@&niu;rrfs%U_Uz;M+p1S&2Q(ve)C
z;JtJ{=BQcG_WYkFCSDaaZ*!2ehw|*|GMV81&sb5o8JWCwqwFXwUay*IPXRcRr)))Q
z%Q`BnzH<6aiG@S|Rf=*uW6w5D^!gSCBG|GWWQIvzrsm<Zbr4f!ZEu+J?$UrCGy7kI
z(Fi^76l@w!>OaZlvpk<XS~r3@O^{AGD_Z(VSXxN|WtzE!d#jU|9S%V09?NV9U%Y!<
zz=VioO0Q6SL?64h-*TE_;2IzvuZ9LyO$`8<bOX!5@OTz-hkM6;oGriGS`!vyRO4U#
zF-uKn<QLY$X6mVC*?>_!);O)XQ=n`;m4S4~e|K_}e6be!`MPm}Nfwbu68GR6*h=fx
z-{0F(cD8^`kk)QOV+RcJFGQ4H0+zCWcrA-=4el+XP<<a>zJEd7=TmW1MVLS!CDnUQ
zRHce4t(B-hZvO-TMiHNtWl1+TVt&7C_Lf%L6#9vIJ(C+3S#-2<=Ru~e!RK#ow6I{W
zh{Q6e`?r{R4B)ds+e)Ym<tpmkpr|A|r@Z%^wASefifsCjBm6PF95DTm|DOO$6143u
zlSj4U4G&m|Dyl7OGXrUBJe)zV8E`mR_p)3;_QMen8cOEsRp+dZMxM>HMMwE>`lNQ$
zqouowX!Y7J`-)3;sGPRg%9p_Ie0)c=+ro?3PlG7$A9+ShV-?;NwmaOBaW@rcQ){Y3
z#gbNI{$>MTU%Xhz@Aj1UOv6<A(A1#`K9AGPekPYPeyG$ccsh{4l8|-D0;XmhkRU9A
zgilSHGT11%tQ4&9Q$Q7wPVHd}+-bqpF*s1r1<wuH@|j9mjW+<z4R&wXY0CH-U`Q8q
z;ygUI7EK{s5&u8%1j(TpA;RhH!fTP24F%hoK7!xwtt<AQ98@FFal(OeV@0(cG^kA4
ze6L5jN-{KQ5>AwK2XpZ8(TI5$3&^}J1~zx)A=KA|x=RZlif}O~gFBzrL4X=uIwxks
z*!{gRmfoMP5$@axZCe9n#7ch1=(uU*w}bNKSi?_V-=nx1kW5}>k1^H9?mRU%UdAZ(
z2%$4@q*F~T?(V^3Z%R&!&4&Tl@-e|&5kp5g;Ks2}m8=Yi<8Prj35ZadDA-6$OJ6&_
z%vU*})Th4RPYi{-Hnrz5yhXRJ-w${mHoznS&}W3liAXOg2moYP5YpWlFW6Z^6y!~F
zCK_^Ps|^FFb)cvHz($rU61O`hP1Qm1O*6fL#X3>UBtZrne%m@5$mVkT3G9S2GSLZA
z8_=ya1nVz0uLvzE@W=EsWI?wi!+p$>LpTJz<B-%P2OOd2qsS{?Fa~9%YUF8q5mP`-
z??_Jun_mHM;_#rNvk*=;`quzH9#`{MlsJxnNFZTt)W>bx-0F>aeo%O*G^XOQ85#DP
z6=1|FIN7LEAplwo*K0sSnuo7v@%3rA>R>X9QygnRYg*h&vI`g(Vkihc!xt|IfIa_f
zAKO}I#RHYdj_{`gy@}^#csA9WrzglUx0GU@-4}Fjg6R+yODJ~7z8459Q#o*b*-tGW
zTPvVmmyOuS1Q;hlQc&kudmHgtLh<axD{TTBV-#*2C|J~|em7?kF=?l33AS)$EEFP0
z`q=>;=fFfk#3w|L<E1>R;(e)#FuT8EG<OGhYiS;w;$SLtk^weR!y*j2r*G3>tWjk8
zivIYUTSdb*3WMo4*n!GQo~uXFdw>#2xg_vY$OXgKS5Y0_&uYN7(5ng3w(tMBBOho1
z&YBXX{K6Z10cFJ^?LJHOHpox6Ir+=+)MApE7cgtidF{OBa^fTK?f_iqMNeQC@q!xw
zRY3sHNb`N!JwZH3w<nPzn&k3s`;MlUmNTrv0$#A^3L~nkC;6IyiMr)mWtA=?oCV1U
zVpVL_JjIXj31$_`@z<HB*EvC?QM?#LraDh2C(5%HW`^qJE;mtEs9R9@82!Gc90b;+
zW2kse=kNQt>~(C^AYpSGGtIsxN^BWV0H7undGL)uIaD|2YrDzVsN=daCpXJlwTZTX
zNWZz|{eiU3<eaP#d~5TS<OvuP=_A2mOk2J&2|BNNN~epY&k-y?66HwYEZ$d;;AXeR
zV;NQ+ED5t2cbwVZJ$JDE;dF<%o332C3{oBFCLtE(WUgqR4v6$aVzD6!uI;+|(dPBz
zJ{bh)h3z;Ld-L-QK>z0KAV)q3E8U}HvwD^LDW<E%zy_AACX3p~Yn8m;H4KcKEo`XD
z4zU;xfq+u3OnrujNDG^Rs{5E7Qubn}JcDQ|SJks9(4kJFsZ9ArD8K{9Rv(A#p=`QG
z*N#{oEp7$gHws+k8D^581kt7mg;Wq4A3Sbdb|M&yl8(A+kE_GMU@`E>90#DccJ$Wo
zn!c)k%vR7G;NdTjUI+Eq8WXKj>I|eo@qXZECoI$xa*qp&-#}p9lI-k!fa<?@h_O1|
zpdhP`@-w#pG$w++R^`^t{g>+?(%>`8!E!jhBrJS*-xGdyWF#Z2i@;HRzV~1D)ajmN
zfnYXDD-uW2Y}ox^bkl5mir!2o{QK-e9}#H~>~gEkw1-q=c{qd<tb6<FbUQI7g`D5O
z`!4neZpujiEC~>vgfWj|_y@WEFOOIH&+oVYt(yzej?bjMoC+ec!5HHrw9RFljd!ck
zGoD`s&vi7I1d$zT7j2=g(Df~D+||A)Bprqhsk{CySsNCfnjFaUJ??k?4s$f*6n0}K
z@~0mRZ*6fULGnuo@qv<F03tz_Y^@Jhgu_Z+zs(@|R0A&D6qy$xwoATP_K6%3iVF>#
zmE_i~5mG<XN9^f@iV1aK86Is16XFEs7Genf-d4!x-OA@gs>(udTF{0u>8fYDeb%Iy
zUS0}EfsSXrm=F)0MdAn3Ki^s4bRMhBUk);i5zt^rF{g61+E^xz^1|*)`v#zU^G&Q&
zd$<*4AZ8dE7^9?V5fVetwVe(Q=D7KlamB8Z3P%AS%#+@Br;u|QXAubslFW`RgyBlV
zv~=_A?slkg1c&0jO;jeH%}fXkKWou0fRQy+402t40+Xgvmj0boA99DTnhlqf%i<5k
z+LWLYq^VO3o}~<<V2C8MvFJE|&=}iui#c_TBSq?$%2L7PO*A|)U4tHe$#yKp*u0{f
zpw46Elt&8Ng@hWDC-M?#eh};Q!k6`$`4JLSaXKVI8z|_Bc=k90O*%#MXIz(<{b}mV
z6y)*NMcr$YZYM?gp_bEt*f+sMmo%;uThTZf;eQU^b&9#htwkvw`c67vieu3bcMVjk
z&L39Jh{F;O*L{*G#?d)M?XGbhE$Xy9Ps%>bT1wWU&y$=cCGe%hc5upW!`uRoplZ0C
zC?+G-e#NBUJ;y&Az3PZ_tE_@CCrnvayo>Cl`Zl#qVa2Uru9;DWao9i9=(LoBg$(xQ
zso&_>i&gFC)G2lkf=r)2yk-@nf?moW30ut5Pw|mSIQZ_~m13T<j^+&a#=igZe$H#W
zyo<yVXOX!n)(dqIR<JuGaTwCE=`6DD$V_4}yna!iE2brHghz$;!N0KVP#yJS-4a9d
z_|K9SSM>8F_>~VR!RbTDgJ*tp$P9gS9?)0Y<83P$*ziXTZqnR*I!L&-UHeW~#WV)I
z7y+GyTF;78X^zEDEj<RjcMHmm6zj7ZgZ5eKl|$`X@86Z=ABB=BSljsy@=zRFDaK8j
z$CQju7A*Y)t<v*<LLLi-I=gPpSE<lAC`lk?+@W~W4?Fj3WOkV%R~mh-35*qFWOZZh
zyV)s>wBK6}0q!e*R_A74^tgOE*9&xRGtBbWoFSM;8~$+s7v=?}ePHE48=Jct5Bm(&
ze~s6v@;8G1bdQ~ASQ2s2(P;PP3{V9|<RvOWx=DmA;K`70_tUy@6!jGfNx9;Yk>!o$
z2zPU^>0LNoO=TCBe<{`HMMh)37Pwa~Y-I|l0Pz8|OzO6QdktRWY{I!kr0}*7T_NU}
znZQOVKd>V_zyUd2`OQL+xRb&%Js{NbX<`ihHtma7M-OFsC<v*^g+AWoK-y8;)2IP`
zTzy%px{-PRfz^1W-9dYE0~5+^q`@qYw)ZB3L`2g5Iu5M$d9DSOSP?TeRibpk0AXEt
z;FFly?K$1$z)z0z-U5GgOpka6s4ETmB$?l+vn`}cYMov1u7F@IW<-vb`9hRrI#<2^
z7{=hdQ}?MfFbSmYJrt+WoXZS!BO@saV8Ds+_)3^uyM|EMiLKOS_rw9iC7LQ!1dWN9
zh{IDt{AU-Ob^O6f;A<+e%F4+XZR#(-{|es%Hn=2rpqKEmA}2kNV=LU$S*`h%83#u#
zqdZFGz~pPSrWy=~IJk=u!6l*#cbd?<Ui0;t$6%Yi8~|245C+?@)BrMpYACtmiUy(q
zTk^MP9H^<wfGknWb82;GurRo&vJ@L*wqooe#2KTA5}xGY>tdXe*OQfsW-LLPIB_0v
zhv@;9Or0jGK|pl*4YFfSQSe9f`EG5!b`@i&S8rzQd5nT&CV%+K^6xi>lDOjS8Dl^R
zQwD=S?0jl#s!P44QSe9f`EF^Fwx&rcrk8;@q@K>OQBr3(Jz?uM<@UQX)SZvv^Gjr+
z!pm;Q2_|}uR@F`{TiWM_wQnGiQ)+hxr0plMv-zarS`!%0s3AlfTo|(5MZ~U2oAU|m
z7yN+aFX4XrKff}t5w7$R_5ms$WU>~d9%S9%A0Qoe3bL)NA1RX^TU2m$jVJS96JGtZ
z9|9SR4!2bnv%~^mSaAMEIi{S4h9nTY3!7ZmqL{66cpOLLml6E<kHCbrr=Z_PDsU=K
z>!wL%jZO5_E)?~?#-F`jAyfRSE4+|{`{I-xS4<hXfXLAchu?Z|ORthJ7hA7;SMF!v
znzmQJrDN_<5#s_x8skS8DXH+_tpY2<5w9bsDuv8dmW17`_7!ClB8TG>)m!%{g(d3-
zk9?YQz()YHedI8vOQ`4dNdNYj5ZFf(@PbmWY69$`e^}er8#%*6_k8RzG}*i`S+?3I
znW9l|EG7+9f0Um_vQ<X!J~iLi9xXkl7+kFlnKS=HJl<1^%l<gzB<*OxfTZ=)4438Q
zP(BJmpI_nCqZ(Gd7?fJ61=DrI<y-Cnt5M(!B8ci}!0&fWv?z|Ou?}@{m^>m3D=Tq0
z;yz$;K3v+7g(IKnwevSE4w&_s<EP%1)^$wG+~ZpMqvdL(*9;?}7eZeMD}7r*)FvN3
zO*wlWRLK%2tO@ZI9U6^<On(L^`qS=-!#sC|M3NM<lc0s1Z{et<Bri58$Hdh<7@1(}
zfz+^a+r3RL?|84W*zm^`*T~?43x1zyDY`=}F<c$$!afX6X{E<T(SN7ul*|`fm)Ps~
zkR-Uj{Vn!+1!_xrpm8z$+O{J@H;7T?E_$`QJ02g3*n4%L0&MKwI`EI^BjNl6np9Kl
zz>VXAm2yTRaJUh&w{r=cGz|CampKAKgAOi$C#32J(uqBXtXmUlNU@>6;y;IcE@57@
zN6-9=TS{v_;bt_QUD?C<bPSI!l@byV@K`O$pP#Y%Mo=iFzB|Amk;STZHBcJB&P5>a
zRkJ|SeX`#=D4LpbdsWC=A!l4n+K4(FGDI2Wtm!l&XhUEvOc07ouj`}qhuk<1Vbnja
zcAv+-LoY5j;ksY^OFDIy6oT2;l*cr$dr6*yW^5!hkO}-2U{UGYH;tRa0ks%l3oSni
zgj8key5+t*<Ul19=7-~#5&ZZpRhzdBN6r_O1y0Y@G@Nq1rr*x3LMkYyH<Q}Pf)dx>
zzQ_JZyNMQFpMD&hjF8YadFCv0dP;KJZ%z*|l`GEYgWZlmxRVQ7A%k^!vadm1(dr>V
zVHoLEiFjZThHKws`_qmpF@R+DQAwt;q*S_JR9bJvkMCW3*wS~#i3P;~CiMADayMSC
z`^U4LVhXH6?2P`uARg8Ao?Z?k%r&(MAU(82+ZPylJZ~8`xvu^!&icB>hvYt#vxBKb
zov!A}BGso_fCuUwLt>e|b4oIim>E;*aLh__^bc1e4y$b`Ate^p*CiUsMl`N98}1+N
z%8A)NHn-dZR-?ccL4aB0=|qdRh9YpLlhYf>)F?Yb&kQ=wjgisb-7S4bNr2EIUyLWp
zC0-f40OxVz$K=3lzCR8rIShGK9pq~t#77Y~3wkwYrVBPuFvMYOzLR<{9=~kz=_-*2
zte%C)iAF~3_rMDokFk<(B7o7>335RWKqN0$2F<6Cj_czvZI5!sto?TBLsJWZ_JYOb
zzw)qd4T7MVLUi+N0lgobrfD4|`~OU<{So;TG-?owke<;2QYh7(buiSZY>%nJUeGiS
z9sgN4r$_TJvXHvkv6Z~q;yOm#untqMOqn7pZ{Hl;6#Du6f6VfakCQP?je-C86}T%b
zD2pXB2eC=0ICU=FP&w#(r~~WQ4RCxaI|7b2ncGr(8Lshe@HV9&NQmIu=0miGDfEw)
z!{UyAJZ?JwHvy>B_Cx}xN(U8#+)of`nkG*g=0;Dkb0ZPFjz}z@39qN&HnYoZEGerV
z*8PX2AW9Fw?|Hd@2~GFW#~n0GmliggPU}YNdz8QQkiwao)1uW9p`Vqe{{8f}tc=cX
z)<p#~w^X@k_ygvRI@kR<Use{jT(g4sAwC%nPdRGu;C5=vTRHur*KsAmcbvi`Gr;jP
zdv)t`K2#>PaF``_xs!2st6%_o4gjl=wL}mRuniREBsZ%(yO-)%k)3Tym!z6juc`6`
zfv+O}C^-;-b6=ih6{-SkljO%c9+Y?n`6z~VI%gP}mS?aX|E5YX9gA(TaLGvIpUF%3
F008Q#p-=z-

literal 0
HcmV?d00001


From 48e6b92cc378c937e59719f2c0f482bf76c9ca81 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 25 Jun 2024 13:56:49 +0200
Subject: [PATCH 03/50] Add chat template support for llama-cli (#8068)

* add chat template support for llama-cli

* add help message

* server: simplify format_chat

* more consistent naming

* improve

* add llama_chat_format_example

* fix server

* code style

* code style

* Update examples/main/main.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/common.cpp            | 60 +++++++++++++++++++++++++++++++++++-
 common/common.h              | 23 ++++++++++++++
 examples/main/main.cpp       | 55 +++++++++++++++++++++++++--------
 examples/server/server.cpp   | 12 ++------
 examples/server/utils.hpp    | 29 +++--------------
 llama.cpp                    |  4 +--
 tests/test-chat-template.cpp | 20 ++++++++++++
 7 files changed, 154 insertions(+), 49 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 0ca7b4430..da6db4dc6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1444,7 +1444,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main",        "       --cfg-negative-prompt-file FNAME",
                                                                         "negative prompt file to use for guidance" });
     options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
-
+    options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
+                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                                                                        "only commonly used templates are accepted:\n"
+                                                                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });
     options.push_back({ "*",           "       --grammar GRAMMAR",      "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
     options.push_back({ "*",           "       --grammar-file FNAME",   "file to read grammar from" });
@@ -2604,12 +2607,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
     return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
 
+//
+// Chat template utils
+//
+
 bool llama_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & msgs,
+        bool add_ass) {
+    int alloc_size = 0;
+    std::vector<llama_chat_message> chat;
+    for (auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()});
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+    }
+
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf(alloc_size);
+
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
+}
+
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass) {
+    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<llama_chat_msg> chat_new(past_msg);
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return formatted;
+}
+
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl) {
+    std::vector<llama_chat_msg> msgs = {
+        {"system",    "You are a helpful assistant"},
+        {"user",      "Hello"},
+        {"assistant", "Hi there"},
+        {"user",      "How are you?"},
+    };
+    return llama_chat_apply_template(model, tmpl, msgs, true);
+}
+
 //
 // KV cache utils
 //
diff --git a/common/common.h b/common/common.h
index a5c738f8b..de90eec51 100644
--- a/common/common.h
+++ b/common/common.h
@@ -365,9 +365,32 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //
 
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);
 
+// CPP wrapper for llama_chat_apply_template
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index b97b7b793..cfaf6a6e8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -39,12 +39,12 @@ static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
-static bool file_exists(const std::string &path) {
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }
 
-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+    llama_chat_msg new_msg{role, content};
+    auto formatted = llama_chat_format_single(
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    chat_msgs.push_back({role, content});
+    return formatted;
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
+    std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
 
@@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }
 
+    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+
     // print system information
     {
         LOG_TEE("\n");
@@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-    } else {
-        LOG("use session tokens\n");
-        embd_inp = session_tokens;
-    }
+    {
+        auto prompt = params.conversation
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+            : params.prompt;
+        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+            LOG("tokenize the prompt\n");
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+        } else {
+            LOG("use session tokens\n");
+            embd_inp = session_tokens;
+        }
 
-    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
@@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss;     g_output_ss     = &output_ss;
+    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
@@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
                         is_antiprompt = true;
                     }
 
+                    chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
                     is_interacting = true;
                     printf("\n");
                 }
             }
 
+            // if current token is not EOG, we add it to current assistant message
+            if (params.conversation) {
+                auto id = llama_sampling_last(ctx_sampling);
+                assistant_ss << llama_token_to_piece(ctx, id, false);
+            }
+
             if (n_past > 0 && is_interacting) {
                 LOG("waiting for user input\n");
 
@@ -848,8 +872,12 @@ int main(int argc, char ** argv) {
                         string_process_escapes(buffer);
                     }
 
+                    std::string user_inp = params.conversation
+                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                        : std::move(buffer);
+                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, params.conversation);
                     const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                     LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -864,6 +892,9 @@ int main(int argc, char ** argv) {
                         output_ss << llama_token_to_piece(ctx, token);
                     }
 
+                    // reset assistant message
+                    assistant_ss.str("");
+
                     n_remain -= line_inp.size();
                     LOG("n_remain: %d\n", n_remain);
                 } else {
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f9a86961f..ae768097b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2606,17 +2606,9 @@ int main(int argc, char ** argv) {
 
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-        chat.push_back({{"role", "system"},    {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"},      {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"},      {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
-
         LOG_INFO("chat template", {
-            {"chat_example", chat_example},
-            {"built_in", params.chat_template.empty()},
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
+            {"built_in",     params.chat_template.empty()},
         });
     }
 
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 63fde9c9f..7ef2a519a 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        std::string role    = json_value(curr_msg, "role",    std::string(""));
+        std::string content = json_value(curr_msg, "content", std::string(""));
+        chat.push_back({role, content});
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    const std::string formatted_chat(buf.data(), res);
-
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
     return formatted_chat;
 }
 
diff --git a/llama.cpp b/llama.cpp
index 49bc93c02..33e6cb722 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -18818,10 +18818,10 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
         // [variant] space before + after response
         bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
         // [variant] add BOS inside history
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index cef9a650b..d19ba8633 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -7,6 +7,7 @@
 #include <cassert>
 
 #include "llama.h"
+#include "common.h"
 
 int main(void) {
     llama_chat_message conversation[] = {
@@ -119,5 +120,24 @@ int main(void) {
         std::cout << output << "\n-------------------------\n";
         assert(output == expected);
     }
+
+    // test llama_chat_format_single
+    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
+    std::vector<llama_chat_msg> chat2;
+    chat2.push_back({"system", "You are a helpful assistant"});
+    chat2.push_back({"user", "Hello"});
+    chat2.push_back({"assistant", "I am assistant"});
+    llama_chat_msg new_msg{"user", "How are you"};
+
+    auto fmt_single = [&](std::string tmpl) {
+        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
+        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
+        return output;
+    };
+    assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
+    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
+    assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
+
     return 0;
 }

From 49c03c79cda17913b72260acdc8157b742cee41c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 25 Jun 2024 13:59:54 +0200
Subject: [PATCH 04/50] cvector: better prompt handling, add "mean vector"
 method (#8069)

* remove completions file

* fix inverted vector

* add mean method

* code style

* remove inverted pca hotfix
---
 common/common.cpp                             | 22 +++---
 common/common.h                               | 17 +++--
 examples/cvector-generator/README.md          | 17 ++++-
 .../cvector-generator/cvector-generator.cpp   | 74 ++++++++++---------
 examples/cvector-generator/mean.hpp           | 48 ++++++++++++
 examples/cvector-generator/negative.txt       |  5 +-
 examples/cvector-generator/pca.hpp            |  5 +-
 examples/cvector-generator/positive.txt       |  5 +-
 8 files changed, 133 insertions(+), 60 deletions(-)
 create mode 100644 examples/cvector-generator/mean.hpp

diff --git a/common/common.cpp b/common/common.cpp
index da6db4dc6..c76d0e2c3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1263,11 +1263,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     // cvector params
-    if (arg == "--completions-file") {
-        CHECK_ARG
-        params.cvector_completions_file = argv[i];
-        return true;
-    }
     if (arg == "--positive-file") {
         CHECK_ARG
         params.cvector_positive_file = argv[i];
@@ -1278,11 +1273,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cvector_negative_file = argv[i];
         return true;
     }
-    if (arg == "--completions") {
-        CHECK_ARG
-        params.n_completions = std::stoi(argv[i]);
-        return true;
-    }
     if (arg == "--pca-batch") {
         CHECK_ARG
         params.n_pca_batch = std::stoi(argv[i]);
@@ -1293,6 +1283,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_pca_iterations = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--method") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+        else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+        else { invalid_param = true; }
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1626,11 +1624,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector",     "-o,    --output FNAME",         "output file (default: '%s')", params.cvector_outfile.c_str() });
     options.push_back({ "cvector",     "       --positive-file FNAME",  "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
     options.push_back({ "cvector",     "       --negative-file FNAME",  "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
-    options.push_back({ "cvector",     "       --completions-file FNAME",
-                                                                        "completions file (default: '%s')", params.cvector_completions_file.c_str() });
-    options.push_back({ "cvector",     "       --completions N",        "number of lines of completions file to use (default: %d)", params.n_completions });
     options.push_back({ "cvector",     "       --pca-batch N",          "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
     options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector",     "       --method {pca,mean}",    "dimensionality reduction method to be used (default: pca)" });
 
     printf("usage: %s [options]\n", argv[0]);
 
diff --git a/common/common.h b/common/common.h
index de90eec51..c541204f6 100644
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
 
@@ -238,13 +244,12 @@ struct gpt_params {
     bool compute_ppl    = true;  // whether to compute perplexity
 
     // cvector-generator params
-    int n_completions = 64;
-    int n_pca_batch = 20;
+    int n_pca_batch = 100;
     int n_pca_iterations = 1000;
-    std::string cvector_outfile          = "control_vector.gguf";
-    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
-    std::string cvector_positive_file    = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file    = "examples/cvector-generator/negative.txt";
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile       = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md
index 5182e906d..be4dd5250 100644
--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
@@ -11,13 +11,16 @@ Related PRs:
 
 ```sh
 # CPU only
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+./cvector-generator -m ./llama-3.Q4_K_M.gguf
 
 # With GPU
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
 
 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
+
+# Using mean value instead of PCA
+./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
 
 # To see help message
 ./cvector-generator -h
@@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
 <|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
 <|im_start|>system\nYou are in a very good mood today<|im_end|>
 ```
+
+Example of using the output file with `llama-cli`:
+
+(Tip: the control vector works better when applied to layers higher than 10)
+
+```sh
+./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
+```
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 355905cb0..d4e126ac2 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "pca.hpp"
+#include "mean.hpp"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     gpt_params_print_usage(argc, argv, params);
 
     printf("\nexample usage:\n");
-    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
-    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
     printf("\n");
 }
 
@@ -223,23 +225,30 @@ struct train_context {
 
     // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
     // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
-    void build_v_diff() {
+    void build_v_diff(bool transpose) {
         printf("build_v_diff\n");
         for (int il = 0; il < n_layers - 1; il++) {
             auto & diff_tmp = v_diff_tmp[il];
             int n_elem = diff_tmp.size() / sizeof(float);
             GGML_ASSERT(n_elem % n_embd == 0);
             int n_rows = n_elem / n_embd;
-            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
             ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
-            // copy data & transpose
             diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
-            float * arr = (float *) diff_tmp.data();
-            for (int ir = 0; ir < n_rows; ++ir) {
-                for (int ic = 0; ic < n_embd; ++ic) {
-                    float f = arr[ir*n_embd + ic];
-                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+            if (transpose) {
+                // copy data & transpose
+                float * arr = (float *) diff_tmp.data();
+                for (int ir = 0; ir < n_rows; ++ir) {
+                    for (int ic = 0; ic < n_embd; ++ic) {
+                        float f = arr[ir*n_embd + ic];
+                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                    }
                 }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
             }
             v_diff.push_back(diff);
             print_debug_tensor(diff);
@@ -263,8 +272,8 @@ struct tokenized_prompt {
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
         const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
         padding_seq(ctx, tokens_pos, max_seq_len);
         padding_seq(ctx, tokens_neg, max_seq_len);
@@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
         fprintf(stderr, "must provide at least one prompt pair\n");
         return 1;
     }
-
-    // create templated prompts
-    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
-    auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
-        return persona + suffix;
-    };
-    for (size_t i = 0; i < positive_prompts.size(); ++i) {
-        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
-            // TODO replicate the truncations done by the python implementation
-            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
-            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
-        }
-    }
+    ctx_train.positive_entries = positive_prompts;
+    ctx_train.negative_entries = negative_prompts;
     return 0;
 }
 
@@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
-    // prepare ctx_train for PCA
-    ctx_train.build_v_diff();
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 
-    // run PCA
-    PCA::pca_params pca_params;
-    pca_params.n_threads = params.n_threads;
-    pca_params.n_batch = params.n_pca_batch;
-    pca_params.n_iterations = params.n_pca_iterations;
-    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    // prepare ctx_train: build v_diff (transposed only when PCA is used)
+    ctx_train.build_v_diff(use_pca);
+
+    if (use_pca) {
+        // run PCA
+        PCA::pca_params pca_params;
+        pca_params.n_threads = params.n_threads;
+        pca_params.n_batch = params.n_pca_batch;
+        pca_params.n_iterations = params.n_pca_iterations;
+        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }
 
     // write output vectors to gguf
     export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp
new file mode 100644
index 000000000..16be5ce3e
--- /dev/null
+++ b/examples/cvector-generator/mean.hpp
@@ -0,0 +1,48 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+
+namespace mean {
+
+// For every layer il: average the n_samples rows of v_input[il] into v_output[il] and
+// L2-normalize the result. Writes into (and renames) the tensors held by v_output.
+static void run(
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running mean...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%zu", il+1); // il is a size_t, hence %zu
+        // calculate mean vector
+        struct ggml_tensor * t_layer = v_input[il];
+        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
+        GGML_ASSERT(t_layer->ne[1] > 0); // no samples would make the mean 0/0
+        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
+            float f = 0.0;
+            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
+                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
+            }
+            f /= t_layer->ne[1];
+            ggml_set_f32_1d(ctrl_out, ic, f);
+        }
+
+        // normalize output vector; a zero-length mean is kept as-is instead of dividing by 0
+        float norm = 0.0;
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            norm += f*f;
+        }
+        norm = sqrtf(norm);
+        for (int i = 0; norm > 0.0f && i < ggml_nelements(ctrl_out); i++) {
+            ggml_set_f32_1d(ctrl_out, i, ggml_get_f32_1d(ctrl_out, i) / norm);
+        }
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt
index 3e9951752..45b9384b3 100644
--- a/examples/cvector-generator/negative.txt
+++ b/examples/cvector-generator/negative.txt
@@ -1 +1,4 @@
-[INST] Act like a person who is extremely sad. [/INST] 
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
+<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
\ No newline at end of file
diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index 36eadaac2..6ec3141af 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -290,7 +290,7 @@ static void power_iteration(
         }
 
         printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
-            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
+            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
     }
 
     // get output tensor
@@ -298,6 +298,9 @@ static void power_iteration(
     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
     //print_debug_tensor(output);
     ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
 }
 
 static void run_pca(
diff --git a/examples/cvector-generator/positive.txt b/examples/cvector-generator/positive.txt
index 880236787..fea736225 100644
--- a/examples/cvector-generator/positive.txt
+++ b/examples/cvector-generator/positive.txt
@@ -1 +1,4 @@
-[INST] Act like a person who is extremely happy. [/INST] 
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
\ No newline at end of file

From c8ad35955ad2c68db172dcd0e857423ab128518d Mon Sep 17 00:00:00 2001
From: Brian <mofosyne@gmail.com>
Date: Tue, 25 Jun 2024 22:03:25 +1000
Subject: [PATCH 05/50] Gguf dump start data offset via --data-offset and some
 extra refactor (#8054)

* gguf-dump: add --data-offset

* gguf-dump: add tensor data offset table

* gguf-dump: refactor GGUFReader for clarity

* gguf-dump: add --data-alignment

* gguf-dump.py: Rename variables and adjust comments

start_data_offset --> data_offset

_build_tensors_info_fields --> _build_tensor_info
---
 gguf-py/gguf/gguf_reader.py  | 29 +++++++++++++++++++++++++----
 gguf-py/scripts/gguf-dump.py | 29 ++++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index e48bc00c3..20432bd25 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -69,6 +69,7 @@ class GGUFReader:
     # I - same as host, S - swapped
     byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
+    data_offset: int
 
     # Note: Internal helper, API may change.
     gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@@ -88,9 +89,13 @@ class GGUFReader:
     def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
+
+        # Check for GGUF magic
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
+
+        # Check GGUF version
         temp_version = self._get(offs, np.uint32)
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for
@@ -103,12 +108,16 @@ class GGUFReader:
         self.fields: OrderedDict[str, ReaderField] = OrderedDict()
         self.tensors: list[ReaderTensor] = []
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
+
+        # Check tensor count and kv count
         temp_counts = self._get(offs, np.uint64, 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
         offs = self._build_fields(offs, kv_count)
-        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
+
+        # Build Tensor Info Fields
+        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
         new_align = self.fields.get('general.alignment')
         if new_align is not None:
             if new_align.types != [GGUFValueType.UINT32]:
@@ -117,6 +126,7 @@ class GGUFReader:
         padding = offs % self.alignment
         if padding != 0:
             offs += self.alignment - padding
+        self.data_offset = offs
         self._build_tensors(offs, tensors_fields)
 
     _DT = TypeVar('_DT', bound = npt.DTypeLike)
@@ -193,18 +203,29 @@ class GGUFReader:
         # We can't deal with this one.
         raise ValueError('Unknown/unhandled field type {gtype}')
 
-    def _get_tensor(self, orig_offs: int) -> ReaderField:
+    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs = orig_offs
+
+        # Get Tensor Name
         name_len, name_data = self._get_str(offs)
         offs += int(name_len.nbytes + name_data.nbytes)
+
+        # Get Tensor Dimensions Count
         n_dims = self._get(offs, np.uint32)
         offs += int(n_dims.nbytes)
+
+        # Get Tensor Dimension Array
         dims = self._get(offs, np.uint64, n_dims[0])
         offs += int(dims.nbytes)
+
+        # Get Tensor Encoding Scheme Type
         raw_dtype = self._get(offs, np.uint32)
         offs += int(raw_dtype.nbytes)
+
+        # Get Tensor Offset
         offset_tensor = self._get(offs, np.uint64)
         offs += int(offset_tensor.nbytes)
+
         return ReaderField(
             orig_offs,
             str(bytes(name_data), encoding = 'utf-8'),
@@ -233,10 +254,10 @@ class GGUFReader:
             offs += field_size
         return offs
 
-    def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
+    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
         tensor_fields = []
         for _ in range(count):
-            field = self._get_tensor(offs)
+            field = self._get_tensor_info_field(offs)
             offs += sum(int(part.nbytes) for part in field.parts)
             tensor_fields.append(field)
         return offs, tensor_fields
diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py
index 508ca8f0a..a73ca2776 100755
--- a/gguf-py/scripts/gguf-dump.py
+++ b/gguf-py/scripts/gguf-dump.py
@@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
 
         markdown_content += "\n"
 
+        markdown_content += "### Tensor Data Offset\n"
+        markdown_content += '\n'
+        markdown_content += 'This table contains the offset and data segment relative to start of file\n'
+        markdown_content += '\n'
+
+        tensor_mapping_table: list[dict[str, str | int]] = []
+        for key, tensor in enumerate(reader.tensors):
+            data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
+            data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
+            tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
+
+        tensors_mapping_table_header_map = [
+            {'key_name':'t_id',         'header_name':'T_ID',               'align':'right'},
+            {'key_name':'layer_name',   'header_name':'Tensor Layer Name',  'align':'left'},
+            {'key_name':'data_offset',  'header_name':'Data Offset (B)',    'align':'right'},
+            {'key_name':'data_size',    'header_name':'Data Size (B)',      'align':'right'},
+        ]
+
+        markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
+        markdown_content += "\n"
+
         for group in tensor_prefix_order:
             tensors = tensor_groups[group]
             group_elements = sum(tensor.n_elements for tensor in tensors)
@@ -370,6 +391,8 @@ def main() -> None:
     parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
     parser.add_argument("--json",       action="store_true", help="Produce JSON output")
     parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--data-offset",    action="store_true", help="Start of data offset")
+    parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
     parser.add_argument("--markdown",   action="store_true", help="Produce markdown output")
     parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")
 
@@ -377,7 +400,7 @@ def main() -> None:
 
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
 
-    if not args.json and not args.markdown:
+    if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
         logger.info(f'* Loading: {args.model}')
 
     reader = GGUFReader(args.model, 'r')
@@ -386,6 +409,10 @@ def main() -> None:
         dump_metadata_json(reader, args)
     elif args.markdown:
         dump_markdown_metadata(reader, args)
+    elif args.data_offset:
+        print(reader.data_offset)  # noqa: NP100
+    elif args.data_alignment:
+        print(reader.alignment)  # noqa: NP100
     else:
         dump_metadata(reader, args)
 

From 925c30956dd17723c3a25297bcd0a609aec60663 Mon Sep 17 00:00:00 2001
From: joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
Date: Tue, 25 Jun 2024 08:13:27 -0700
Subject: [PATCH 06/50] Add healthchecks to llama-server containers (#8081)

* added healthcheck

* added healthcheck

* added healthcheck

* added healthcheck

* added healthcheck

* moved curl to base

* moved curl to base
---
 .devops/llama-server-cuda.Dockerfile   |  4 +++-
 .devops/llama-server-intel.Dockerfile  |  4 +++-
 .devops/llama-server-rocm.Dockerfile   |  4 +++-
 .devops/llama-server-vulkan.Dockerfile | 10 ++++------
 .devops/llama-server.Dockerfile        |  4 +++-
 5 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile
index 0010ffd4c..7bef07a05 100644
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
 COPY --from=build /app/llama-server /llama-server
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index cec436452..3bf1670ec 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile
index f88cf20e5..4b1cdc320 100644
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -43,8 +43,10 @@ ENV CXX=/opt/rocm/llvm/bin/clang++
 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 
 RUN make -j$(nproc) llama-server
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile
index b0fa0b8e6..2bc2e45d3 100644
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -5,15 +5,11 @@ FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
 
-# Install Vulkan SDK
+# Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
     wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
     apt update -y && \
-    apt-get install -y vulkan-sdk
-
-# Install cURL
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 
 # Build it
 WORKDIR /app
@@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile
index aa93369be..a53a5c999 100644
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git libcurl4-openssl-dev curl
 
 WORKDIR /app
 
@@ -22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]

From dd047b476c8b904e0c25e5dbc5bee6ffde2f6e17 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Tue, 25 Jun 2024 19:20:06 +0200
Subject: [PATCH 07/50] disable docker CI on pull requests (#8110)

---
 .github/workflows/docker.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index b3efe0084..01f1a4522 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -10,7 +10,7 @@
 name: Publish Docker image
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
@@ -22,7 +22,7 @@ concurrency:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
+    #if: github.event.pull_request.draft == false
 
     runs-on: ubuntu-latest
     env:

From 84631fe1504de40427dc4b4cdac92fa7ebf65955 Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@users.noreply.github.com>
Date: Tue, 25 Jun 2024 20:06:20 +0100
Subject: [PATCH 08/50] `json`: support integer minimum, maximum,
 exclusiveMinimum, exclusiveMaximum (#7797)

* json: support minimum for positive integer values

* json: fix min 0

* json: min + max integer constraints

* json: handle negative min / max integer bounds

* json: fix missing paren min/max bug

* json: proper paren fix

* json: integration test for schemas

* json: fix bounds tests

* Update json-schema-to-grammar.cpp

* json: fix negative max

* json: fix negative min (w/ more than 1 digit)

* Update test-grammar-integration.cpp

* json: nit: move string rules together

* json: port min/max integer support to Python & JS

* nit: move + rename _build_min_max_int

* fix min in [1, 9]

* Update test-grammar-integration.cpp

* add C++11-compatible replacement for std::string_view

* add min/max constrained int field to pydantic json schema example

* fix merge

* json: add integration tests for min/max bounds

* reshuffle/merge min/max integ test cases

* nits / cleanups

* defensive code against string out of bounds (apparently different behaviour of libstdc++ vs. clang's libc++, can't read final NULL char w/ former)
---
 common/json-schema-to-grammar.cpp             | 246 +++++++++++++++-
 examples/json-schema-pydantic-example.py      |   1 +
 examples/json_schema_to_grammar.py            | 184 +++++++++++-
 .../server/public/json-schema-to-grammar.mjs  | 213 ++++++++++++++
 tests/test-grammar-integration.cpp            | 245 +++++++++++++++-
 tests/test-json-schema-to-grammar.cpp         | 264 ++++++++++++++++++
 6 files changed, 1150 insertions(+), 3 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 10b9b3d1d..07d0e952d 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
+/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards. Non-owning: it refers to the source std::string, which must outlive the view. */
+class string_view {
+    const std::string & _str; // referenced, never copied
+    const size_t _start; // offset into _str of the first character of the view
+    const size_t _end; // offset into _str one past the last character of the view
+public:
+    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {} // not explicit, so a std::string converts implicitly; end == npos means "up to the end of str"
+
+    size_t size() const {
+        return _end - _start;
+    }
+
+    size_t length() const {
+        return size();
+    }
+
+    operator std::string() const { // copies the viewed characters into a new std::string
+        return str();
+    }
+
+    std::string str() const {
+        return _str.substr(_start, _end - _start);
+    }
+
+    string_view substr(size_t pos, size_t len = std::string::npos) const { // pos and len are relative to this view; the result is not clamped to _end
+        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
+    }
+
+    char operator[](size_t pos) const { // bounds-checked against the view, not against the underlying string
+        auto index = _start + pos;
+        if (index >= _end) {
+            throw std::out_of_range("string_view index out of range");
+        }
+        return _str[_start + pos];
+    }
+
+    bool operator==(const string_view & other) const { // compares contents by materialising both sides as std::string
+        std::string this_str = *this;
+        std::string other_str = other;
+        return this_str == other_str;
+    }
+};
+
+static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { // appends to out a grammar expression intended to match the decimal integers in [min_value, max_value]
+    auto has_min = min_value != std::numeric_limits<int>::min(); // INT_MIN is the "no lower bound" sentinel
+    auto has_max = max_value != std::numeric_limits<int>::max(); // INT_MAX is the "no upper bound" sentinel
+
+    auto digit_range = [&](char from, char to) { // emits a single-character class: [c] or [from-to]
+        out << "[";
+        if (from == to) {
+            out << from;
+        } else {
+            out << from << "-" << to;
+        }
+        out << "]";
+    };
+    auto more_digits = [&](int min_digits, int max_digits) { // emits [0-9] followed by a {min}, {min,max} or {min,} repetition
+        out << "[0-9]";
+        if (min_digits == max_digits && min_digits == 1) { // exactly one digit needs no repetition suffix
+            return;
+        }
+        out << "{";
+        out << min_digits;
+        if (max_digits != min_digits) {
+            out << ",";
+            if (max_digits != std::numeric_limits<int>::max()) { // INT_MAX leaves the repetition open-ended
+                out << max_digits;
+            }
+        }
+        out << "}";
+    };
+    std::function<void(const string_view &, const string_view &)> uniform_range = // std::function so that the lambda can call itself
+        [&](const string_view & from, const string_view & to) { // emits a pattern for the digit strings from .. to
+            size_t i = 0;
+            while (i < from.length() && i < to.length() && from[i] == to[i]) { // measure the common prefix
+                i++;
+            }
+            if (i > 0) {
+                out << "\"" << from.substr(0, i).str() << "\""; // common prefix is emitted as a literal
+            }
+            if (i < from.length() && i < to.length()) {
+                if (i > 0) {
+                    out << " ";
+                }
+                auto sub_len = from.length() - i - 1; // digits remaining after the first differing one
+                if (sub_len > 0) {
+                    auto from_sub = from.substr(i + 1);
+                    auto to_sub = to.substr(i + 1);
+                    auto sub_zeros = repeat("0", sub_len);
+                    auto sub_nines = repeat("9", sub_len);
+
+                    auto to_reached = false; // set once an emitted alternative already covers leading digit to[i]
+                    out << "(";
+                    if (from_sub == sub_zeros) { // lower bound ends in zeros: any tail is valid for leading digits below to[i]
+                        digit_range(from[i], to[i] - 1);
+                        out << " ";
+                        more_digits(sub_len, sub_len);
+                    } else {
+                        out << "[" << from[i] << "] ";
+                        out << "(";
+                        uniform_range(from_sub, sub_nines);
+                        out << ")";
+                        if (from[i] < to[i] - 1) {
+                            out << " | ";
+                            if (to_sub == sub_nines) {
+                                digit_range(from[i] + 1, to[i]);
+                                to_reached = true;
+                            } else {
+                                digit_range(from[i] + 1, to[i] - 1);
+                            }
+                            out << " ";
+                            more_digits(sub_len, sub_len);
+                        }
+                    }
+                    if (!to_reached) { // still need the alternative whose leading digit is to[i]
+                        out << " | ";
+                        digit_range(to[i], to[i]);
+                        out << " ";
+                        uniform_range(sub_zeros, to_sub);
+                    }
+                    out << ")";
+                } else {
+                    out << "[" << from[i] << "-" << to[i] << "]"; // last digit: a plain character range
+                }
+            }
+        };
+
+    if (has_min && has_max) {
+        if (min_value < 0 && max_value < 0) { // entirely negative: "-" followed by the mirrored positive range
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ")";
+            return;
+        }
+
+        if (min_value < 0) { // range straddles zero: emit the negative half, then continue with min_value = 0
+            out << "\"-\" (";
+            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ") | ";
+            min_value = 0;
+        }
+
+        auto min_s = std::to_string(min_value);
+        auto max_s = std::to_string(max_value);
+        auto min_digits = min_s.length();
+        auto max_digits = max_s.length();
+
+        for (auto digits = min_digits; digits < max_digits; digits++) { // one alternative per digit count below that of max_s
+            uniform_range(min_s, repeat("9", digits));
+            min_s = "1" + repeat("0", digits);
+            out << " | ";
+        }
+        uniform_range(min_s, max_s);
+        return;
+    }
+
+    auto less_decimals = std::max(decimals_left - 1, 1); // digit budget left after one leading digit, never below 1
+
+    if (has_min) {
+        if (min_value < 0) { // "-" with a magnitude of at most -min_value, or any non-negative integer
+            out << "\"-\" (";
+            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false); // top_level = false: the nested max-only call skips its own negative alternative
+            out << ") | [0] | [1-9] ";
+            more_digits(0, decimals_left - 1);
+        } else if (min_value == 0) {
+            if (top_level) { // at top level a multi-digit number may not start with 0
+                out << "[0] | [1-9] ";
+                more_digits(0, less_decimals);
+            } else {
+                more_digits(1, decimals_left);
+            }
+        } else if (min_value <= 9) { // single-digit minimum
+            char c = '0' + min_value;
+            auto range_start = top_level ? '1' : '0';
+            if (c > range_start) { // a smaller leading digit is fine as long as at least one more digit follows
+                digit_range(range_start, c - 1);
+                out << " ";
+                more_digits(1, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, '9');
+            out << " ";
+            more_digits(0, less_decimals);
+        } else { // multi-digit minimum: branch on the leading digit
+            auto min_s = std::to_string(min_value);
+            auto len = min_s.length();
+            auto c = min_s[0];
+
+            if (c > '1') {
+                digit_range(top_level ? '1' : '0', c - 1);
+                out << " ";
+                more_digits(len, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, c);
+            out << " (";
+            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false); // NOTE(review): stoi drops leading zeros of the remainder (min 100 -> 0), so the nested rule stops enforcing the digit count and "10" looks admitted - confirm
+            out << ")";
+            if (c < '9') {
+                out << " | ";
+                digit_range(c + 1, '9');
+                out << " ";
+                more_digits(len - 1, less_decimals);
+            }
+        }
+        return;
+    }
+
+    if (has_max) {
+        if (max_value >= 0) {
+            if (top_level) { // every negative integer lies below a non-negative maximum
+                out << "\"-\" [1-9] ";
+                more_digits(0, less_decimals);
+                out << " | ";
+            }
+            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+        } else { // negative maximum: "-" followed by a magnitude of at least -max_value
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            out << ")";
+        }
+        return;
+    }
+
+    throw std::runtime_error("At least one of min_value or max_value must be set"); // reached only when both bounds are the sentinels
+}
+
 const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
 
 struct BuiltinRule {
@@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
     return "\"" + escaped + "\"";
 }
 
-
 class SchemaConverter {
 private:
     std::function<json(const std::string &)> _fetch_json;
@@ -686,6 +912,24 @@ public:
             int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
             int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
             return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+            int min_value = std::numeric_limits<int>::min();
+            int max_value = std::numeric_limits<int>::max();
+            if (schema.contains("minimum")) {
+                min_value = schema["minimum"].get<int>();
+            } else if (schema.contains("exclusiveMinimum")) {
+                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+            }
+            if (schema.contains("maximum")) {
+                max_value = schema["maximum"].get<int>();
+            } else if (schema.contains("exclusiveMaximum")) {
+                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+            }
+            std::stringstream out;
+            out << "(";
+            _build_min_max_int(min_value, max_value, out);
+            out << ") space";
+            return _add_rule(rule_name, out.str());
         } else if (schema.empty() || schema_type == "object") {
             return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
         } else {
diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py
index cc64e572b..2240188cd 100644
--- a/examples/json-schema-pydantic-example.py
+++ b/examples/json-schema-pydantic-example.py
@@ -53,6 +53,7 @@ if __name__ == '__main__':
         question: str
         concise_answer: str
         justification: str
+        stars: Annotated[int, Field(ge=1, le=5)]
 
     class PyramidalSummary(BaseModel):
         title: str
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index b588497b9..86500a8c3 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -4,7 +4,7 @@ import itertools
 import json
 import re
 import sys
-from typing import Any, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 
 def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
@@ -23,6 +23,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
     result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
     return f'({result})?' if min_items == 0 else result
 
+def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):  # appends to out the pieces of a grammar expression intended to match the integers in [min_value, max_value]
+    has_min = min_value != None  # None means "no lower bound"
+    has_max = max_value != None  # None means "no upper bound"
+
+    def digit_range(from_char: str, to_char: str):  # emits a single-character class: [c] or [from-to]
+        out.append("[")
+        if from_char == to_char:
+            out.append(from_char)
+        else:
+            out.append(from_char)
+            out.append("-")
+            out.append(to_char)
+        out.append("]")
+
+    def more_digits(min_digits: int, max_digits: int):  # emits [0-9] followed by a {min}, {min,max} or {min,} repetition
+        out.append("[0-9]")
+        if min_digits == max_digits and min_digits == 1:  # exactly one digit needs no repetition suffix
+            return
+        out.append("{")
+        out.append(str(min_digits))
+        if max_digits != min_digits:
+            out.append(",")
+            if max_digits != sys.maxsize:  # sys.maxsize leaves the repetition open-ended
+                out.append(str(max_digits))
+        out.append("}")
+
+    def uniform_range(from_str: str, to_str: str):  # emits a pattern for the digit strings from_str .. to_str; indexes to_str by from_str's positions
+        i = 0
+        while i < len(from_str) and from_str[i] == to_str[i]:  # measure the common prefix
+            i += 1
+        if i > 0:
+            out.append("\"")
+            out.append(from_str[:i])
+            out.append("\"")
+        if i < len(from_str):
+            if i > 0:
+                out.append(" ")
+            sub_len = len(from_str) - i - 1  # digits remaining after the first differing one
+            if sub_len > 0:
+                from_sub = from_str[i+1:]
+                to_sub = to_str[i+1:]
+                sub_zeros = "0" * sub_len
+                sub_nines = "9" * sub_len
+
+                to_reached = False  # set once an emitted alternative already covers leading digit to_str[i]
+                out.append("(")
+                if from_sub == sub_zeros:  # lower bound ends in zeros: any tail is valid for leading digits below to_str[i]
+                    digit_range(from_str[i], chr(ord(to_str[i]) - 1))
+                    out.append(" ")
+                    more_digits(sub_len, sub_len)
+                else:
+                    out.append("[")
+                    out.append(from_str[i])
+                    out.append("] ")
+                    out.append("(")
+                    uniform_range(from_sub, sub_nines)
+                    out.append(")")
+                    if ord(from_str[i]) < ord(to_str[i]) - 1:
+                        out.append(" | ")
+                        if to_sub == sub_nines:
+                            digit_range(chr(ord(from_str[i]) + 1), to_str[i])
+                            to_reached = True
+                        else:
+                            digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
+                        out.append(" ")
+                        more_digits(sub_len, sub_len)
+                if not to_reached:  # still need the alternative whose leading digit is to_str[i]
+                    out.append(" | ")
+                    digit_range(to_str[i], to_str[i])
+                    out.append(" ")
+                    uniform_range(sub_zeros, to_sub)
+                out.append(")")
+            else:
+                out.append("[")  # last digit: a plain character range
+                out.append(from_str[i])
+                out.append("-")
+                out.append(to_str[i])
+                out.append("]")
+
+    if has_min and has_max:
+        if min_value < 0 and max_value < 0:  # entirely negative: "-" followed by the mirrored positive range
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
+            out.append(")")
+            return
+
+        if min_value < 0:  # range straddles zero: emit the negative half, then continue with min_value = 0
+            out.append("\"-\" (")
+            _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
+            out.append(") | ")
+            min_value = 0
+
+        min_s = str(min_value)
+        max_s = str(max_value)
+        min_digits = len(min_s)
+        max_digits = len(max_s)
+
+        for digits in range(min_digits, max_digits):  # one alternative per digit count below that of max_s
+            uniform_range(min_s, "9" * digits)
+            min_s = "1" + "0" * digits
+            out.append(" | ")
+        uniform_range(min_s, max_s)
+        return
+
+    less_decimals = max(decimals_left - 1, 1)  # digit budget left after one leading digit, never below 1
+
+    if has_min:
+        if min_value < 0:  # "-" with a magnitude of at most -min_value, or any non-negative integer
+            out.append("\"-\" (")
+            _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)  # top_level=False: the nested max-only call skips its own negative alternative
+            out.append(") | [0] | [1-9] ")
+            more_digits(0, decimals_left - 1)
+        elif min_value == 0:
+            if top_level:  # at top level a multi-digit number may not start with 0
+                out.append("[0] | [1-9] ")
+                more_digits(0, less_decimals)
+            else:
+                more_digits(1, decimals_left)
+        elif min_value <= 9:  # single-digit minimum
+            c = str(min_value)
+            range_start = '1' if top_level else '0'
+            if c > range_start:  # a smaller leading digit is fine as long as at least one more digit follows
+                digit_range(range_start, chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(1, less_decimals)
+                out.append(" | ")
+            digit_range(c, "9")
+            out.append(" ")
+            more_digits(0, less_decimals)
+        else:  # multi-digit minimum: branch on the leading digit
+            min_s = str(min_value)
+            length = len(min_s)
+            c = min_s[0]
+
+            if c > "1":
+                digit_range("1" if top_level else "0", chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(length, less_decimals)
+                out.append(" | ")
+            digit_range(c, c)
+            out.append(" (")
+            _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)  # NOTE(review): int() drops leading zeros of the remainder (min 100 -> 0), so the nested rule stops enforcing the digit count and "10" looks admitted - confirm
+            out.append(")")
+            if c < "9":
+                out.append(" | ")
+                digit_range(chr(ord(c) + 1), "9")
+                out.append(" ")
+                more_digits(length - 1, less_decimals)
+        return
+
+    if has_max:
+        if max_value >= 0:
+            if top_level:  # every negative integer lies below a non-negative maximum
+                out.append("\"-\" [1-9] ")
+                more_digits(0, less_decimals)
+                out.append(" | ")
+            _generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
+        else:  # negative maximum: "-" followed by a magnitude of at least -max_value
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
+            out.append(")")
+        return
+
+    raise RuntimeError("At least one of min_value or max_value must be set")  # reached only when both bounds are None
 
 class BuiltinRule:
     def __init__(self, content: str, deps: list = None):
@@ -432,6 +596,24 @@ class SchemaConverter:
 
             return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
 
+        elif schema_type in (None, 'integer') and \
+                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
+            min_value = None
+            max_value = None
+            if 'minimum' in schema:
+                min_value = schema['minimum']
+            elif 'exclusiveMinimum' in schema:
+                min_value = schema['exclusiveMinimum'] + 1
+            if 'maximum' in schema:
+                max_value = schema['maximum']
+            elif 'exclusiveMaximum' in schema:
+                max_value = schema['exclusiveMaximum'] - 1
+
+            out = ["("]
+            _generate_min_max_int(min_value, max_value, out)
+            out.append(") space")
+            return self._add_rule(rule_name, ''.join(out))
+
         elif (schema_type == 'object') or (len(schema) == 0):
             return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
 
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index faed6a32c..f340f94bd 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
   return minItems === 0 ? `(${result})?` : result;
 }
 
+function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) { // pushes onto out the pieces of a grammar expression intended to match the integers in [minValue, maxValue]
+  const hasMin = minValue !== null; // null means "no lower bound"
+  const hasMax = maxValue !== null; // null means "no upper bound"
+
+  function digitRange(fromChar, toChar) { // emits a single-character class: [c] or [from-to]
+      out.push("[");
+      if (fromChar === toChar) {
+          out.push(fromChar);
+      } else {
+          out.push(fromChar);
+          out.push("-");
+          out.push(toChar);
+      }
+      out.push("]");
+  }
+
+  function moreDigits(minDigits, maxDigits) { // emits [0-9] followed by a {min}, {min,max} or {min,} repetition
+      out.push("[0-9]");
+      if (minDigits === maxDigits && minDigits === 1) { // exactly one digit needs no repetition suffix
+          return;
+      }
+      out.push("{");
+      out.push(minDigits.toString());
+      if (maxDigits !== minDigits) {
+          out.push(",");
+          if (maxDigits !== Number.MAX_SAFE_INTEGER) { // MAX_SAFE_INTEGER leaves the repetition open-ended
+              out.push(maxDigits.toString());
+          }
+      }
+      out.push("}");
+  }
+
+  function uniformRange(fromStr, toStr) { // emits a pattern for the digit strings fromStr .. toStr
+      let i = 0;
+      while (i < fromStr.length && fromStr[i] === toStr[i]) { // measure the common prefix
+          i++;
+      }
+      if (i > 0) {
+          out.push("\"");
+          out.push(fromStr.slice(0, i));
+          out.push("\"");
+      }
+      if (i < fromStr.length) {
+          if (i > 0) {
+              out.push(" ");
+          }
+          const subLen = fromStr.length - i - 1; // digits remaining after the first differing one
+          if (subLen > 0) {
+              const fromSub = fromStr.slice(i + 1);
+              const toSub = toStr.slice(i + 1);
+              const subZeros = "0".repeat(subLen);
+              const subNines = "9".repeat(subLen);
+
+              let toReached = false; // set once an emitted alternative already covers leading digit toStr[i]
+              out.push("(");
+              if (fromSub === subZeros) { // lower bound ends in zeros: any tail is valid for leading digits below toStr[i]
+                  digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
+                  out.push(" ");
+                  moreDigits(subLen, subLen);
+              } else {
+                  out.push("[");
+                  out.push(fromStr[i]);
+                  out.push("] ");
+                  out.push("(");
+                  uniformRange(fromSub, subNines);
+                  out.push(")");
+                  if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
+                      out.push(" | ");
+                      if (toSub === subNines) {
+                          digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
+                          toReached = true;
+                      } else {
+                          digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
+                      }
+                      out.push(" ");
+                      moreDigits(subLen, subLen);
+                  }
+              }
+              if (!toReached) { // still need the alternative whose leading digit is toStr[i]
+                  out.push(" | ");
+                  digitRange(toStr[i], toStr[i]);
+                  out.push(" ");
+                  uniformRange(subZeros, toSub);
+              }
+              out.push(")");
+          } else {
+              out.push("["); // last digit: a plain character range
+              out.push(fromStr[i]);
+              out.push("-");
+              out.push(toStr[i]);
+              out.push("]");
+          }
+      }
+  }
+
+  if (hasMin && hasMax) {
+      if (minValue < 0 && maxValue < 0) { // entirely negative: "-" followed by the mirrored positive range
+          out.push("\"-\" (");
+          _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
+          out.push(")");
+          return;
+      }
+
+      if (minValue < 0) { // range straddles zero: emit the negative half, then continue with minValue = 0
+          out.push("\"-\" (");
+          _generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
+          out.push(") | ");
+          minValue = 0;
+      }
+
+      let minS = minValue.toString();
+      const maxS = maxValue.toString();
+      const minDigits = minS.length;
+      const maxDigits = maxS.length;
+
+      for (let digits = minDigits; digits < maxDigits; digits++) { // one alternative per digit count below that of maxS
+          uniformRange(minS, "9".repeat(digits));
+          minS = "1" + "0".repeat(digits);
+          out.push(" | ");
+      }
+      uniformRange(minS, maxS);
+      return;
+  }
+
+  const lessDecimals = Math.max(decimalsLeft - 1, 1); // digit budget left after one leading digit, never below 1
+
+  if (hasMin) {
+      if (minValue < 0) { // "-" with a magnitude of at most -minValue, or any non-negative integer
+          out.push("\"-\" (");
+          _generateMinMaxInt(null, -minValue, out, decimalsLeft, false); // topLevel = false: the nested max-only call skips its own negative alternative
+          out.push(") | [0] | [1-9] ");
+          moreDigits(0, decimalsLeft - 1);
+      } else if (minValue === 0) {
+          if (topLevel) { // at top level a multi-digit number may not start with 0
+              out.push("[0] | [1-9] ");
+              moreDigits(0, lessDecimals);
+          } else {
+              moreDigits(1, decimalsLeft);
+          }
+      } else if (minValue <= 9) { // single-digit minimum
+          const c = minValue.toString();
+          const range_start = topLevel ? '1' : '0';
+          if (c > range_start) { // a smaller leading digit is fine as long as at least one more digit follows
+              digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
+              out.push(" ");
+              moreDigits(1, lessDecimals);
+              out.push(" | ");
+          }
+          digitRange(c, "9");
+          out.push(" ");
+          moreDigits(0, lessDecimals);
+      } else { // multi-digit minimum: branch on the leading digit
+          const minS = minValue.toString();
+          const length = minS.length;
+          const c = minS[0];
+
+          if (c > "1") {
+              digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
+              out.push(" ");
+              moreDigits(length, lessDecimals);
+              out.push(" | ");
+          }
+          digitRange(c, c);
+          out.push(" (");
+          _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false); // NOTE(review): parseInt drops leading zeros of the remainder (min 100 -> 0), so the nested rule stops enforcing the digit count and "10" looks admitted - confirm
+          out.push(")");
+          if (c < "9") {
+              out.push(" | ");
+              digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
+              out.push(" ");
+              moreDigits(length - 1, lessDecimals);
+          }
+      }
+      return;
+  }
+
+  if (hasMax) {
+      if (maxValue >= 0) {
+          if (topLevel) { // every negative integer lies below a non-negative maximum
+              out.push("\"-\" [1-9] ");
+              moreDigits(0, lessDecimals);
+              out.push(" | ");
+          }
+          _generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
+      } else { // negative maximum: "-" followed by a magnitude of at least -maxValue
+          out.push("\"-\" (");
+          _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
+          out.push(")");
+      }
+      return;
+  }
+
+  throw new Error("At least one of minValue or maxValue must be set"); // reached only when both bounds are null
+}
+
 class BuiltinRule {
   constructor(content, deps) {
     this.content = content;
@@ -435,6 +630,24 @@ export class SchemaConverter {
       const minLen = schema.minLength || 0;
       const maxLen = schema.maxLength;
       return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
+    } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
+      let minValue = null;
+      let maxValue = null;
+      if ('minimum' in schema) {
+        minValue = schema.minimum;
+      } else if ('exclusiveMinimum' in schema) {
+        minValue = schema.exclusiveMinimum + 1;
+      }
+      if ('maximum' in schema) {
+        maxValue = schema.maximum;
+      } else if ('exclusiveMaximum' in schema) {
+        maxValue = schema.exclusiveMaximum - 1;
+      }
+
+      const out = ["("];
+      _generateMinMaxInt(minValue, maxValue, out);
+      out.push(") space");
+      return this._addRule(ruleName, out.join(''));
     } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
       return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
     } else {
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 96f90c01e..5b3992236 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -148,6 +148,250 @@ static void test_schema(const std::string & test_desc, const std::string & schem
 }
 
 static void test_simple_grammar() {
+    test_schema(
+        "min 0",
+        R"""({
+            "type": "integer",
+            "minimum": 0
+        })""",
+        // Passing strings
+        {
+            "0",
+            "10",
+            "12",
+            "10000",
+        },
+        // Failing strings
+        {
+            "-1",
+            "-10",
+            "-10000",
+            "-100000000000000000000000000000000",
+            "100000000000000000000000000000000",
+            "00",
+            "01",
+            "-0",
+        }
+    );
+    test_schema(
+        "min 2",
+        // Schema
+        R"""({
+            "type": "integer",
+            "minimum": 2
+        })""",
+        // Passing strings
+        {
+            "2",
+            "3",
+            "4",
+            "10",
+            "20",
+            "1234567890000000",
+        },
+        // Failing strings
+        {
+            "0",
+            "1",
+            "-1",
+            "-100",
+            "0",
+            "1",
+            "01",
+            "02",
+            "12345678900000000",
+        }
+    );
+    test_schema(
+        "min 456",
+        R"""({
+            "type": "integer",
+            "minimum": 456
+        })""",
+        // Passing strings
+        {
+            "456",
+            "4560",
+            "457",
+            "460",
+            "500",
+        },
+        // Failing strings
+        {
+            "455",
+            "356",
+            "50",
+            "050",
+            "-1",
+            "-456",
+        }
+    );
+    test_schema(
+        "min -123",
+        R"""({
+            "type": "integer",
+            "minimum": -123
+        })""",
+        // Passing strings
+        {
+            "-123",
+            "-122",
+            "-11",
+            "-1",
+            "0",
+            "1",
+            "123",
+            "1234",
+            "2345",
+        },
+        // Failing strings
+        {
+            "-1234",
+            "-124",
+        }
+    );
+
+    test_schema(
+        "max 9999",
+        // Schema
+        R"""({
+            "type": "integer",
+            "maximum": 9999
+        })""",
+        // Passing strings
+        {
+            "-99999",
+            "0",
+            "9999",
+        },
+        // Failing strings
+        {
+            "10000",
+            "99991",
+        }
+    );
+    test_schema(
+        "max -9999",
+        // Schema
+        R"""({
+            "type": "integer",
+            "maximum": -9999
+        })""",
+        // Passing strings
+        {
+            "-10000",
+            "-9999",
+        },
+        // Failing strings
+        {
+            "-9998",
+            "0",
+            "9999",
+        }
+    );
+    test_schema(
+        "min 5 max 30",
+        // Schema
+        R"""({
+            "type": "integer",
+            "minimum": 5,
+            "maximum": 30
+        })""",
+        // Passing strings
+        {
+            "5",
+            "10",
+            "30",
+        },
+        // Failing strings
+        {
+            "05",
+            "4",
+            "-1",
+            "31",
+            "123",
+            "0123",
+        }
+    );
+    test_schema(
+        "min -1 max 1",
+        R"""({
+            "type": "integer",
+            "minimum": -1,
+            "maximum": 1
+        })""",
+        // Passing strings
+        {
+            "-1",
+            "0",
+            "1",
+        },
+        // Failing strings
+        {
+            "-11",
+            "-10",
+            "-2",
+            "2",
+            "10",
+            "11",
+        }
+    );
+    test_schema(
+        "min -123 max 42",
+        R"""({
+            "type": "integer",
+            "minimum": -123,
+            "maximum": 42
+        })""",
+        // Passing strings
+        {
+            "-123",
+            "-122",
+            "-13",
+            "-11",
+            "-2",
+            "-1",
+            "0",
+            "1",
+            "5",
+            "10",
+            "39",
+            "40",
+            "42",
+        },
+        // Failing strings
+        {
+            "-0123",
+            "-124",
+            "-1123",
+            "-200",
+            "43",
+            "123",
+            "0123",
+        }
+    );
+    test_schema(
+        "exclusive min / max",
+        // Schema
+        R"""({
+            "type": "integer",
+            "exclusiveMinimum": 0,
+            "exclusiveMaximum": 10000
+        })""",
+        // Passing strings
+        {
+            "1",
+            "9999",
+        },
+        // Failing strings
+        {
+            "0",
+            "01",
+            "10000",
+            "99999",
+        }
+    );
+
     // Test case for a simple grammar
     test_grammar(
         "simple grammar",
@@ -773,7 +1017,6 @@ static void test_json_schema() {
         }
     );
 
-
     test_schema(
         "min+max items",
         // Schema
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 87bc66b69..2e591bd71 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -80,6 +80,232 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         runner(tc);
     };
 
+    test({
+        SUCCESS,
+        "min 0",
+        R"""({
+            "type": "integer",
+            "minimum": 0
+        })""",
+        R"""(
+            root ::= ([0] | [1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 1",
+        R"""({
+            "type": "integer",
+            "minimum": 1
+        })""",
+        R"""(
+            root ::= ([1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 3",
+        R"""({
+            "type": "integer",
+            "minimum": 3
+        })""",
+        R"""(
+            root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 9",
+        R"""({
+            "type": "integer",
+            "minimum": 9
+        })""",
+        R"""(
+            root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 10",
+        R"""({
+            "type": "integer",
+            "minimum": 10
+        })""",
+        R"""(
+            root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 25",
+        R"""({
+            "type": "integer",
+            "minimum": 25
+        })""",
+        R"""(
+            root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max 30",
+        R"""({
+            "type": "integer",
+            "maximum": 30
+        })""",
+        R"""(
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -5",
+        R"""({
+            "type": "integer",
+            "minimum": -5
+        })""",
+        R"""(
+            root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -123",
+        R"""({
+            "type": "integer",
+            "minimum": -123
+        })""",
+        R"""(
+            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max -5",
+        R"""({
+            "type": "integer",
+            "maximum": -5
+        })""",
+        R"""(
+            root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max 1",
+        R"""({
+            "type": "integer",
+            "maximum": 1
+        })""",
+        R"""(
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max 100",
+        R"""({
+            "type": "integer",
+            "maximum": 100
+        })""",
+        R"""(
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 0 max 23",
+        R"""({
+            "type": "integer",
+            "minimum": 0,
+            "maximum": 23
+        })""",
+        R"""(
+            root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 15 max 300",
+        R"""({
+            "type": "integer",
+            "minimum": 15,
+            "maximum": 300
+        })""",
+        R"""(
+            root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 5 max 30",
+        R"""({
+            "type": "integer",
+            "minimum": 5,
+            "maximum": 30
+        })""",
+        R"""(
+            root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -123 max 42",
+        R"""({
+            "type": "integer",
+            "minimum": -123,
+            "maximum": 42
+        })""",
+        R"""(
+            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -10 max 10",
+        R"""({
+            "type": "integer",
+            "minimum": -10,
+            "maximum": 10
+        })""",
+        R"""(
+            root ::= ("-" ([0-9] | "10") | [0-9] | "10") space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
     test({
         FAILURE,
         "unknown type",
@@ -390,6 +616,44 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         )"""
     });
 
+    test({
+        SUCCESS,
+        "min + max items with min + max values across zero",
+        R"""({
+            "items": {
+                "type": "integer",
+                "minimum": -12,
+                "maximum": 207
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        R"""(
+            item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
+            root ::= "[" space item ("," space item){2,4} "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min + max items with min + max values",
+        R"""({
+            "items": {
+                "type": "integer",
+                "minimum": 12,
+                "maximum": 207
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        R"""(
+            item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
+            root ::= "[" space item ("," space item){2,4} "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
     test({
         SUCCESS,
         "simple regexp",

From e6bf007744eb06336a231ef39cf08146dd16d2ce Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 25 Jun 2024 21:07:28 +0200
Subject: [PATCH 09/50] llama : return nullptr from llama_grammar_init (#8093)

* llama : return nullptr from llama_grammar_init

This commit updates llama_grammar_init to return nullptr instead of
throwing an exception.

The motivation for this is that this function is declared inside an
extern "C" block and may be called from C code, which cannot handle
C++ exceptions; letting one propagate results in undefined behavior.

On Windows and using MSVC the following warning is currently generated:
```console
C:\llama.cpp\llama.cpp(13998,1): warning C4297: 'llama_grammar_init':
function assumed not to throw an exception but does
C:\llama.cpp\llama.cpp(13998,1): message :
__declspec(nothrow), throw(), noexcept(true), or noexcept was specified
on the function
```

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! llama : return nullptr from llama_grammar_init

Add checks for nullptr when calling llama_grammar_init.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
Co-authored-by: Clint Herron <hanclinto@gmail.com>
---
 common/sampling.cpp                        | 12 ++++++++++--
 examples/gbnf-validator/gbnf-validator.cpp |  4 +++-
 llama.cpp                                  |  3 ++-
 llama.h                                    |  6 ++++++
 tests/test-grammar-integration.cpp         |  6 +++---
 tests/test-llama-grammar.cpp               |  4 ++++
 6 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index f1f803516..9f332fe57 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
         std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
 
-        result->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                 grammar_rules.data(),
                 grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        result->grammar = grammar;
     }
 
     result->prev.resize(params.n_prev);
@@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
     if (!ctx->parsed_grammar.rules.empty()) {
         std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
 
-        ctx->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                 grammar_rules.data(),
                 grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        ctx->grammar = grammar;
     }
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index 0406dc339..dd53ba9b1 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -101,7 +101,9 @@ int main(int argc, char** argv) {
     auto grammar = llama_grammar_init(
             grammar_rules.data(),
             grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-
+    if (grammar == nullptr) {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
     // Read the input file
     std::string input_str;
     {
diff --git a/llama.cpp b/llama.cpp
index 33e6cb722..dd2823e65 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -14500,7 +14500,8 @@ struct llama_grammar * llama_grammar_init(
             continue;
         }
         if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
-            throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+            return nullptr;
         }
     }
 
diff --git a/llama.h b/llama.h
index 53e06d9db..82d15747f 100644
--- a/llama.h
+++ b/llama.h
@@ -924,6 +924,12 @@ extern "C" {
     // Grammar
     //
 
+    /// Initialize a llama_grammar.
+    ///
+    /// @param rules The rule elements of the grammar to initialize.
+    /// @param n_rules The number of rules.
+    /// @param start_rule_index The index of the root rule (the starting point of the grammar).
+    /// @return The initialized llama_grammar or nullptr if initialization failed.
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 5b3992236..5750d362a 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -36,10 +36,10 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
 static bool test_build_grammar_fails(const std::string & grammar_str) {
     fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
     bool grammar_fails = false;
-    try {
-        build_grammar(grammar_str);
+    llama_grammar * grammar = build_grammar(grammar_str);
+    if (grammar != nullptr) {
         fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
-    } catch (const std::exception & err) {
+    } else {
         grammar_fails = true;
         fprintf(stdout, "  ✅︎\n");
     }
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index 27ca4d265..c8badb206 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -116,6 +116,10 @@ int main()
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
     grammar = llama_grammar_init(
         grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr)
+    {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
 
     std::vector<std::vector<llama_grammar_element>> expected_stacks = {
         {

From 6fcbf6823553efabe52ed83e3c2a3329aa3387d1 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Tue, 25 Jun 2024 21:14:35 +0200
Subject: [PATCH 10/50] llama : implement Unigram tokenizer needed by T5 and
 FLAN-T5 model families (#5763)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* llama : add T5 model architecture, tensors and model header parameters

* llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
---
 llama.cpp   | 619 ++++++++++++++++++++++++++++++++++++++++++++++++----
 llama.h     |   2 +
 unicode.cpp |   2 +-
 unicode.h   |   1 +
 4 files changed, 586 insertions(+), 38 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index dd2823e65..78a21008f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -226,6 +226,7 @@ enum llm_arch {
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_BITNET,
+    LLM_ARCH_T5,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -265,6 +266,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCTIC,          "arctic"       },
     { LLM_ARCH_DEEPSEEK2,       "deepseek2"    },
     { LLM_ARCH_BITNET,          "bitnet"       },
+    { LLM_ARCH_T5,              "t5"           },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
@@ -297,6 +299,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
+    LLM_KV_DECODER_START_TOKEN_ID,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -309,6 +312,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -346,6 +350,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_ADD_PREFIX,
+    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
+    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_PREFIX_ID,
@@ -383,18 +389,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
     { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                      },
     { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
+    { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
 
-    { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,      "%s.attention.max_alibi_bias"         },
-    { LLM_KV_ATTENTION_CLAMP_KQV,           "%s.attention.clamp_kqv"              },
-    { LLM_KV_ATTENTION_KEY_LENGTH,          "%s.attention.key_length"             },
-    { LLM_KV_ATTENTION_VALUE_LENGTH,        "%s.attention.value_length"           },
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,       "%s.attention.layer_norm_epsilon"     },
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },
-    { LLM_KV_ATTENTION_CAUSAL,              "%s.attention.causal"                 },
-    { LLM_KV_ATTENTION_Q_LORA_RANK,         "%s.attention.q_lora_rank"            },
-    { LLM_KV_ATTENTION_KV_LORA_RANK,        "%s.attention.kv_lora_rank"           },
+    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
+    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
+    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
+    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
+    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
+    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
+    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
@@ -415,29 +423,31 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_STATE_SIZE,                "%s.ssm.state_size"     },
     { LLM_KV_SSM_TIME_STEP_RANK,            "%s.ssm.time_step_rank" },
 
-    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"              },
-    { LLM_KV_TOKENIZER_PRE,                 "tokenizer.ggml.pre"                },
-    { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens"             },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE,          "tokenizer.ggml.token_type"         },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,    "tokenizer.ggml.token_type_count"   },
-    { LLM_KV_TOKENIZER_SCORES,              "tokenizer.ggml.scores"             },
-    { LLM_KV_TOKENIZER_MERGES,              "tokenizer.ggml.merges"             },
-    { LLM_KV_TOKENIZER_BOS_ID,              "tokenizer.ggml.bos_token_id"       },
-    { LLM_KV_TOKENIZER_EOS_ID,              "tokenizer.ggml.eos_token_id"       },
-    { LLM_KV_TOKENIZER_UNK_ID,              "tokenizer.ggml.unknown_token_id"   },
-    { LLM_KV_TOKENIZER_SEP_ID,              "tokenizer.ggml.seperator_token_id" },
-    { LLM_KV_TOKENIZER_PAD_ID,              "tokenizer.ggml.padding_token_id"   },
-    { LLM_KV_TOKENIZER_CLS_ID,              "tokenizer.ggml.cls_token_id"       },
-    { LLM_KV_TOKENIZER_MASK_ID,             "tokenizer.ggml.mask_token_id"      },
-    { LLM_KV_TOKENIZER_ADD_BOS,             "tokenizer.ggml.add_bos_token"      },
-    { LLM_KV_TOKENIZER_ADD_EOS,             "tokenizer.ggml.add_eos_token"      },
-    { LLM_KV_TOKENIZER_ADD_PREFIX,          "tokenizer.ggml.add_space_prefix"   },
-    { LLM_KV_TOKENIZER_HF_JSON,             "tokenizer.huggingface.json"        },
-    { LLM_KV_TOKENIZER_RWKV,                "tokenizer.rwkv.world"              },
-    { LLM_KV_TOKENIZER_PREFIX_ID,           "tokenizer.ggml.prefix_token_id"    },
-    { LLM_KV_TOKENIZER_SUFFIX_ID,           "tokenizer.ggml.suffix_token_id"    },
-    { LLM_KV_TOKENIZER_MIDDLE_ID,           "tokenizer.ggml.middle_token_id"    },
-    { LLM_KV_TOKENIZER_EOT_ID,              "tokenizer.ggml.eot_token_id"       },
+    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
+    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
+    { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE,           "tokenizer.ggml.token_type"               },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,     "tokenizer.ggml.token_type_count"         },
+    { LLM_KV_TOKENIZER_SCORES,               "tokenizer.ggml.scores"                   },
+    { LLM_KV_TOKENIZER_MERGES,               "tokenizer.ggml.merges"                   },
+    { LLM_KV_TOKENIZER_BOS_ID,               "tokenizer.ggml.bos_token_id"             },
+    { LLM_KV_TOKENIZER_EOS_ID,               "tokenizer.ggml.eos_token_id"             },
+    { LLM_KV_TOKENIZER_UNK_ID,               "tokenizer.ggml.unknown_token_id"         },
+    { LLM_KV_TOKENIZER_SEP_ID,               "tokenizer.ggml.seperator_token_id"       },
+    { LLM_KV_TOKENIZER_PAD_ID,               "tokenizer.ggml.padding_token_id"         },
+    { LLM_KV_TOKENIZER_CLS_ID,               "tokenizer.ggml.cls_token_id"             },
+    { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"            },
+    { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"            },
+    { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"            },
+    { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"         },
+    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
+    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap"     },
+    { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
+    { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
+    { LLM_KV_TOKENIZER_PREFIX_ID,            "tokenizer.ggml.prefix_token_id"          },
+    { LLM_KV_TOKENIZER_SUFFIX_ID,            "tokenizer.ggml.suffix_token_id"          },
+    { LLM_KV_TOKENIZER_MIDDLE_ID,            "tokenizer.ggml.middle_token_id"          },
+    { LLM_KV_TOKENIZER_EOT_ID,               "tokenizer.ggml.eot_token_id"             },
 };
 
 struct LLM_KV {
@@ -504,6 +514,34 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
     LLM_TENSOR_FFN_SUB_NORM,
+    LLM_TENSOR_DEC_ATTN_NORM,
+    LLM_TENSOR_DEC_ATTN_Q,
+    LLM_TENSOR_DEC_ATTN_K,
+    LLM_TENSOR_DEC_ATTN_V,
+    LLM_TENSOR_DEC_ATTN_OUT,
+    LLM_TENSOR_DEC_ATTN_REL_B,
+    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+    LLM_TENSOR_DEC_CROSS_ATTN_Q,
+    LLM_TENSOR_DEC_CROSS_ATTN_K,
+    LLM_TENSOR_DEC_CROSS_ATTN_V,
+    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+    LLM_TENSOR_DEC_FFN_NORM,
+    LLM_TENSOR_DEC_FFN_GATE,
+    LLM_TENSOR_DEC_FFN_DOWN,
+    LLM_TENSOR_DEC_FFN_UP,
+    LLM_TENSOR_DEC_OUTPUT_NORM,
+    LLM_TENSOR_ENC_ATTN_NORM,
+    LLM_TENSOR_ENC_ATTN_Q,
+    LLM_TENSOR_ENC_ATTN_K,
+    LLM_TENSOR_ENC_ATTN_V,
+    LLM_TENSOR_ENC_ATTN_OUT,
+    LLM_TENSOR_ENC_ATTN_REL_B,
+    LLM_TENSOR_ENC_FFN_NORM,
+    LLM_TENSOR_ENC_FFN_GATE,
+    LLM_TENSOR_ENC_FFN_DOWN,
+    LLM_TENSOR_ENC_FFN_UP,
+    LLM_TENSOR_ENC_OUTPUT_NORM,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1135,6 +1173,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_SUB_NORM,       "blk.%d.ffn_sub_norm" },
         },
     },
+    {
+        LLM_ARCH_T5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+            { LLM_TENSOR_OUTPUT,               "output" },
+            { LLM_TENSOR_DEC_OUTPUT_NORM,      "dec.output_norm" },
+            { LLM_TENSOR_DEC_ATTN_NORM,        "dec.blk.%d.attn_norm" },
+            { LLM_TENSOR_DEC_ATTN_Q,           "dec.blk.%d.attn_q" },
+            { LLM_TENSOR_DEC_ATTN_K,           "dec.blk.%d.attn_k" },
+            { LLM_TENSOR_DEC_ATTN_V,           "dec.blk.%d.attn_v" },
+            { LLM_TENSOR_DEC_ATTN_OUT,         "dec.blk.%d.attn_o" },
+            { LLM_TENSOR_DEC_ATTN_REL_B,       "dec.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "dec.blk.%d.cross_attn_norm" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_Q,     "dec.blk.%d.cross_attn_q" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_K,     "dec.blk.%d.cross_attn_k" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_V,     "dec.blk.%d.cross_attn_v" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_OUT,   "dec.blk.%d.cross_attn_o" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
+            { LLM_TENSOR_DEC_FFN_NORM,         "dec.blk.%d.ffn_norm" },
+            { LLM_TENSOR_DEC_FFN_GATE,         "dec.blk.%d.ffn_gate" },
+            { LLM_TENSOR_DEC_FFN_DOWN,         "dec.blk.%d.ffn_down" },
+            { LLM_TENSOR_DEC_FFN_UP,           "dec.blk.%d.ffn_up" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM,      "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM,        "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q,           "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K,           "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V,           "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT,         "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B,       "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM,         "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE,         "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN,         "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP,           "enc.blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2356,6 +2429,11 @@ struct llama_vocab {
     bool tokenizer_add_bos          = false;
     bool tokenizer_add_eos          = false;
     bool tokenizer_ignore_merges    = false;
+    bool tokenizer_remove_extra_whitespaces   = false;
+    bool tokenizer_escape_whitespaces         = true;
+    bool tokenizer_treat_whitespace_as_suffix = false;
+
+    std::vector<char> precompiled_charsmap;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -4191,6 +4269,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
         case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
         case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
         case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
         default:                    return "unknown";
     }
 }
@@ -4870,6 +4949,45 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
+        } else if (tokenizer_model == "t5") {
+            vocab.type = LLAMA_VOCAB_TYPE_UGM;
+
+            // default special tokens
+            vocab.special_bos_id  = -1;
+            vocab.special_eos_id  = 1;
+            vocab.special_unk_id  = 2;
+            vocab.special_sep_id  = -1;
+            vocab.special_pad_id  = 0;
+            vocab.special_cls_id  = -1;
+            vocab.special_mask_id = -1;
+
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            } // The default value of add_space_prefix is true.
+
+            const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
+            if (remove_extra_whitespaces_keyidx != -1) {
+                vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
+            } // The default value of remove_extra_whitespaces is false.
+
+            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+            if (precompiled_charsmap_keyidx != -1) {
+                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+                const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+                vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
+#ifdef IS_BIG_ENDIAN
+                // correct endiannes of data in precompiled_charsmap binary blob
+                uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
+                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+                uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
+                for (size_t i = 0; i < xcda_array_size; ++i) {
+                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
+                }
+#endif
+            }
         } else {
             throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
@@ -4952,6 +5070,10 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             vocab.tokenizer_add_bos = true;
             vocab.tokenizer_add_eos = false;
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_bos = false;
+            vocab.tokenizer_add_eos = true;
         } else {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
@@ -13213,12 +13335,18 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
     return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }
 
+static bool llama_is_unused_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
+}
+
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto & token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM: {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
             auto buf = token_data.text.substr(3, 2);
             return strtol(buf.c_str(), NULL, 16);
         }
@@ -13238,7 +13366,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM: {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
             const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
             auto token = vocab.token_to_id.find(buf);
             if (token != vocab.token_to_id.end()) {
@@ -13826,6 +13955,383 @@ struct llm_tokenizer_wpm {
     const llama_vocab & vocab;
 };
 
+struct naive_trie { // char-keyed trie; every node may optionally carry a value (has_value / value)
+    naive_trie() : has_value(false), value(0) {
+    }
+    void insert(const char * key, size_t len, int32_t value = 0) { // stores value under key[0 .. len)
+        if (len == 0) { // the whole key has been consumed: this node terminates it
+            this->has_value = true;
+            this->value = value;
+            return;
+        }
+        char c = key[0];
+        auto res = children.find(c);
+        if (res != children.end()) {
+            res->second.insert(key + 1, len - 1, value); // child exists: recurse with the rest of the key
+        } else {
+            auto res = children.insert(std::make_pair(c, naive_trie())); // create the missing child (shadows the outer res)
+            res.first->second.insert(key + 1, len - 1, value);
+        }
+    }
+    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) { // returns (key, matched length)
+        if (len == 0 || offset == len) { // input exhausted: everything up to offset matched
+            return std::make_pair(key, offset);
+        }
+        char c = key[offset];
+        auto res = children.find(c);
+        if (res != children.end()) {
+            return res->second.get_longest_prefix(key, len, offset + 1);
+        } else {
+            return std::make_pair(key, offset); // no child for c; note that has_value is not consulted anywhere here
+        }
+    }
+    struct naive_trie * traverse(const char c) { // single step: child node reached via c, or NULL when there is none
+        auto res = children.find(c);
+        if (res != children.end()) {
+            return &res->second;
+        } else {
+            return NULL;
+        }
+    }
+    std::map<char, struct naive_trie> children; // outgoing edges keyed by the next char
+    bool has_value; // set once a key ends exactly at this node
+    llama_token value; // payload stored by insert(); defaults to 0
+};
+
+struct llm_tokenizer_ugm {
+    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
+        if (vocab.precompiled_charsmap.size() > 0) {
+            size_t charsmap_offset = 0;
+
+            // First four bytes of precompiled_charsmap contains length of binary
+            // blob containing XOR-compressed compact double array (XCDA) entries
+            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
+            charsmap_offset += sizeof(xcda_blob_size);
+            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
+                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+            }
+
+            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
+            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
+            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
+            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
+            charsmap_offset += xcda_blob_size;
+
+            // Remaining bytes of precompiled charsmap contain null-terminated
+            // replacement strings for prefixes matched by the XCDA.
+            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
+            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
+        }
+
+        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
+            const auto &token_data = vocab.id_to_token[id];
+
+            if (llama_is_normal_token(vocab, id)) {
+                min_score = std::min<float>(min_score, token_data.score);
+                max_score = std::max<float>(max_score, token_data.score);
+            }
+
+            if (llama_is_normal_token(vocab, id) ||
+                llama_is_user_defined_token(vocab, id) ||
+                llama_is_unused_token(vocab, id)) {
+                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
+            }
+
+            if (llama_is_user_defined_token(vocab, id)) {
+                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
+            }
+        }
+
+        unknown_token_score = min_score - unknown_token_score_penalty;
+    }
+
+    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
+     * unigram language models. The general idea is to:
+     * - move along the input sequence in steps of one UTF code point,
+     * - at each step find all possible tokenizations of the prefix by
+     *   traversing the tokens trie,
+     * - for each tokenization store the best one so far (by higher score)
+     * - use the position in sequence after given token as an index to store
+     *   results
+     * - if there was no valid tokenization of the current UTF code point
+     *   then use unknown token with additional score penalty
+     * After processing the whole sequence we backtrack from the end to get
+     * the best tokenization.
+    */
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        const size_t output_size = output.size(); // output may already hold BOS / earlier fragments; they must stay in place
+        std::string normalized;
+        normalize(text, &normalized); // normalize the input first
+        size_t input_len = normalized.size();
+        if (input_len == 0) { return; } // nothing to tokenize; the backtracking below would otherwise emit a bogus token 0
+        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
+        // at the beginning tokenization score is zero
+        tokenization_results[0] = { 0, 0, 0 };
+
+        for (size_t input_offset = 0; input_offset < input_len;) {
+            size_t prefix_offset = input_offset;
+            // calculate how many code units are in the currently processed UTF code point
+            size_t n_utf8_code_units = std::min<size_t>(utf8_len(normalized[input_offset]), input_len - input_offset);
+
+            // traverse the token matcher trie to find a matching token
+            bool single_codepoint_token_found = false;
+            const struct best_tokenization & current_best = tokenization_results[input_offset];
+            struct naive_trie * node  = token_matcher.traverse(normalized[prefix_offset++]);
+
+            while (prefix_offset <= input_len && node != NULL) {
+                // check if we found valid token in prefix
+                if (node->has_value) {
+                    // check if it corresponds to the whole UTF code point
+                    if (prefix_offset - input_offset == n_utf8_code_units) {
+                        single_codepoint_token_found = true;
+                    }
+                    llama_token token_id = node->value;
+                    const auto &token_data = vocab.id_to_token[token_id];
+
+                    // we set the user-defined token scores to 0 to make them more likely to be selected
+                    // (normal token scores are log probabilities, so they are negative)
+                    // score type is double here to make tokenization results exactly
+                    // the same as in the HF tokenizer using SentencePiece
+                    const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
+                    const double challenger_score = current_best.score_sum + token_score;
+                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+                    if (challenger_score > current_champ.score_sum) {
+                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        current_champ = challenger;
+                    }
+                }
+                node = node->traverse(normalized[prefix_offset++]);
+            }
+
+            // if we didn't find a valid token corresponding to the whole UTF code point
+            // then use unknown token as the tokenization of this UTF code point
+            if (!single_codepoint_token_found) {
+                const double challenger_score = current_best.score_sum + unknown_token_score;
+                prefix_offset = input_offset + n_utf8_code_units;
+                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+                if (challenger_score > current_champ.score_sum) {
+                    struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
+                    current_champ = challenger;
+                }
+            }
+
+            // move to the next UTF code point
+            input_offset += n_utf8_code_units;
+        }
+
+        // now backtrack from the end to gather token ids of the best tokenization (the cursor is a copy, the table is not modified)
+        // merge sequences of consecutive unknown tokens into single unknown tokens
+        bool is_prev_unknown = false;
+        for (struct best_tokenization tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
+            bool is_unknown = tokenization.token_id == vocab.special_unk_id;
+            if (!(is_prev_unknown && is_unknown)) {
+                output.push_back(tokenization.token_id);
+            }
+            if (tokenization.input_offset == 0) {
+                break;
+            }
+            is_prev_unknown = is_unknown;
+        }
+
+        // reverse only the tokens appended by this call since we added them starting from the end of the input
+        std::reverse(output.begin() + output_size, output.end());
+    }
+
+private:
+    const llama_vocab & vocab;
+
+    // helper structure for returning normalization results
+    struct normalization_result {
+        const char * normalized;
+        size_t normalized_len;
+        size_t consumed_input;
+    };
+
+    void normalize(const std::string& input, std::string * normalized) { // rewrites *normalized with the normalized form of input
+        normalized->clear();
+        normalized->reserve(input.size() * 3);
+
+        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
+
+        bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
+        bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
+        bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
+
+        bool is_space_prepended = false;
+        bool processing_non_ws = false;
+
+        size_t input_len = input.size();
+
+        for (size_t input_offset = 0; input_offset < input_len; ) {
+            auto norm_res = normalize_prefix(input, input_offset);
+            for (size_t i = 0; i < norm_res.normalized_len; i++) {
+                char c = norm_res.normalized[i];
+                if (c != ' ') {
+                    if (!processing_non_ws) {
+                        processing_non_ws = true;
+                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
+                            normalized->append(space);
+                            is_space_prepended = true;
+                        }
+                    }
+                    normalized->push_back(c);
+                } else {
+                    if (processing_non_ws) {
+                        processing_non_ws = false;
+                    }
+                    if (!shall_merge_spaces) {
+                        normalized->append(space);
+                    }
+                }
+            }
+
+            input_offset += norm_res.consumed_input;
+        }
+
+        if (shall_append_space) {
+            normalized->append(space);
+        }
+    }
+
+    /*
+     * This structure is a view wrapper for XOR-compressed double array (XCDA)
+     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
+     * Each bit-packed entry contains:
+     * - BASE array value in bits 10-30 (shifted left by 8 when bit 9 is set)
+     * - LCHECK array value in bits 0-7
+     * - LEAF array value in bit 8
+     * Entries containing indexes of replacement sequences have set bit 31
+     */
+    struct xcda_array_view {
+    public:
+        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
+        }
+        uint32_t get_base(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
+        }
+        uint32_t get_lcheck(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return packed_node & ((1U << 31) | 0xff);
+        }
+        bool get_leaf(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return (packed_node >> 8) & 1;
+        }
+        uint32_t get_value(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return packed_node & ((1U << 31) - 1);
+        }
+    private:
+        uint32_t get_node(size_t index) {
+            if (index >= xcda_array_size) { // valid indexes are 0 .. xcda_array_size - 1
+                throw std::runtime_error("Index out of array bounds in XCDA array!");
+            }
+            return xcda_array[index];
+        }
+        const uint32_t * xcda_array;
+        size_t xcda_array_size;
+    };
+
+    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
+        if (input_offset == input.size()) {
+            return { &input[input_offset], 0, 0 };
+        }
+
+        // if input prefix matches some user-defined token return this token as normalization result
+        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        if (user_defined_token_match.second > 0) {
+            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
+        }
+
+        size_t longest_prefix_length = 0;
+        size_t longest_prefix_offset = 0;
+
+        if (xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
+
+            // Find the longest normalized sequence matching the input prefix by walking
+            // the XOR-compressed compact double array (XCDA) starting from the root node
+            // We find the index of the next node by calculating BASE[s] ^ c where s is
+            // the index of the previous node and c is a numerical character value
+            uint32_t node_index = 0;
+            // get BASE of the root node
+            node_index = xcda_view.get_base(node_index);
+            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
+                unsigned char c = input[prefix_offset];
+                if (c == 0) {
+                    break;
+                }
+                node_index ^= c;
+                // if value of LCHECK is not c it means that this is not a child of
+                // the previous node, so we stop matching
+                if (xcda_view.get_lcheck(node_index) != c) {
+                    break;
+                }
+                bool is_leaf = xcda_view.get_leaf(node_index);
+                // get BASE of the current node
+                node_index ^= xcda_view.get_base(node_index);
+                // if LEAF of the current node is true, it means that its BASE points to the node
+                // containing index of replacement sequence for currently matched input prefix
+                if (is_leaf)
+                {
+                    longest_prefix_length = prefix_offset - input_offset + 1;
+                    // get index of replacement sequence for currently matched input prefix
+                    longest_prefix_offset = xcda_view.get_value(node_index);
+                }
+            }
+        }
+
+        if (longest_prefix_length > 0) {
+            // we have a match, so return the replacement sequence
+            if (longest_prefix_offset >= prefix_replacements_size) {
+                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+            }
+            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
+            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+        } else {
+            // check if the input prefix contains a valid sequence of UTF-8 code units
+            try {
+                // if yes, return this sequence unmodified
+                size_t prefix_offset = input_offset;
+                unicode_cpt_from_utf8(input, prefix_offset);
+                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+            } catch (const std::invalid_argument &) {
+                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+                return { "\xEF\xBF\xBD", 3, 1 };
+            }
+        }
+    }
+
+    // escaped space symbol - U+2581 (Lower One Eighth Block)
+    const std::string escaped_space = "\xE2\x96\x81";
+
+    const char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;
+
+    const uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;
+
+    struct naive_trie user_defined_token_matcher;
+
+    // this structure stores the best tokenization so far at input_offset
+    struct best_tokenization {
+        llama_token token_id;
+        size_t input_offset;
+        float score_sum;
+    };
+
+    float min_score = FLT_MAX;
+    float max_score = -FLT_MAX;
+
+    float unknown_token_score_penalty = 10.0;
+    float unknown_token_score;
+
+    struct naive_trie token_matcher;
+};
+
+
 typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
     FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
     FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
@@ -14086,6 +14592,39 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_sep_id);
                 }
             } break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            {
+                llm_tokenizer_ugm tokenizer(vocab);
+
+                if (add_special && vocab.tokenizer_add_bos != 0) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && vocab.tokenizer_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
+            } break;
         case LLAMA_VOCAB_TYPE_NONE:
             GGML_ASSERT(false);
     }
@@ -16964,6 +17503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_T5:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18659,6 +19199,10 @@ llama_token llama_token_eot(const struct llama_model * model) {
     return model->vocab.special_eot_id;
 }
 
+llama_token llama_token_pad(const struct llama_model * model) { // accessor for the padding token id of the model's vocab
+    return model->vocab.special_pad_id; // returned as stored; model is dereferenced without a null check
+}
+
 int32_t llama_tokenize(
     const struct llama_model * model,
                   const char * text,
@@ -18725,7 +19269,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_WPM:
-            case LLAMA_VOCAB_TYPE_SPM: {
+            case LLAMA_VOCAB_TYPE_SPM:
+            case LLAMA_VOCAB_TYPE_UGM: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
                 if (llama_is_normal_token(model->vocab, token)) {
diff --git a/llama.h b/llama.h
index 82d15747f..88eecb0ed 100644
--- a/llama.h
+++ b/llama.h
@@ -67,6 +67,7 @@ extern "C" {
         LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
         LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
     };
 
     // pre-tokenization types
@@ -857,6 +858,7 @@ extern "C" {
     LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
diff --git a/unicode.cpp b/unicode.cpp
index c0b76bf20..8692924b9 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     return result;
 }
 
-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     assert(offset < utf8.size());
     if (!(utf8[offset + 0] & 0x80)) {
         auto result = utf8[offset + 0];
diff --git a/unicode.h b/unicode.h
index 6c488970a..30b07ba7f 100644
--- a/unicode.h
+++ b/unicode.h
@@ -48,6 +48,7 @@ struct codepoint_flags {
 
 
 std::string unicode_cpt_to_utf8(uint32_t cp);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

From 163d50adaf8897d8b734d701ff332de6be63d484 Mon Sep 17 00:00:00 2001
From: jukofyork <69222624+jukofyork@users.noreply.github.com>
Date: Tue, 25 Jun 2024 21:47:40 +0100
Subject: [PATCH 11/50] fixes #7999 (adds control vectors to all `build_XXX()`
 functions in `llama.cpp` [needs testing] (#8060)

* fixes #7999

The `build_command_r` forgot to add the control vector.

* Fixes qwen2 too

* Fixed all models' control vectors

* Removed double calls to `cb(cur, "l_out", il)`

* Moved control vector logic to llama_control_vector::apply_to()
---
 llama.cpp | 112 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 73 insertions(+), 39 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 78a21008f..989c73149 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2368,13 +2368,21 @@ struct llama_control_vector {
     int32_t layer_start = -1;
     int32_t layer_end   = -1;
 
-    ggml_tensor * tensor_for(int il) const {
+    struct ggml_tensor * tensor_for(int il) const {
         if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
             return nullptr;
         }
         return tensors[il];
     }
 
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const { // adds layer il's control vector to cur, if there is one
+        ggml_tensor * layer_dir = tensor_for(il); // nullptr when il is negative, outside [layer_start, layer_end] or beyond tensors.size()
+        if (layer_dir != nullptr) {
+            cur = ggml_add(ctx, cur, layer_dir); // builds the sum via ggml_add in ctx
+        }
+        return cur; // the input tensor itself when no control vector applies to this layer
+    }
+
     ~llama_control_vector() {
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
@@ -8023,10 +8031,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
-            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
-            if (layer_dir != nullptr) {
-                cur = ggml_add(ctx0, cur, layer_dir);
-            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -8141,6 +8146,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -8245,6 +8251,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -8360,9 +8367,8 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
             cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -8514,10 +8520,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
-            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
-            if (layer_dir != nullptr) {
-                cur = ggml_add(ctx0, cur, layer_dir);
-            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -8648,10 +8651,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
-            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
-            if (layer_dir != nullptr) {
-                cur = ggml_add(ctx0, cur, layer_dir);
-            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -8757,8 +8757,12 @@ struct llm_build_context {
                 cb(cur, "ffn_out", il);
             }
 
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
         }
 
         cur = llm_build_norm(ctx0, inpL, hparams,
@@ -8846,6 +8850,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -9141,8 +9146,12 @@ struct llm_build_context {
                 cb(cur, "ffn_out", il);
             }
 
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
         }
 
         cur = llm_build_norm(ctx0, inpL, hparams,
@@ -9276,6 +9285,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -9424,6 +9434,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -9536,6 +9547,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -9647,6 +9659,7 @@ struct llm_build_context {
             cb(cur, "ffn_out", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -9792,6 +9805,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -9912,11 +9926,11 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_output);
-            cb(cur, "l_out", il);
-
             cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
+            // input for next layer
             inpL = cur;
         }
 
@@ -10048,8 +10062,10 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, residual, cur);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
+            // input for next layer
             inpL = cur;
         }
 
@@ -10148,9 +10164,8 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, sa_out);
-            cb(cur, "l_out", il);
-
             cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -10256,8 +10271,12 @@ struct llm_build_context {
                 cb(cur, "ffn_out", il);
             }
 
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
         }
 
         cur = llm_build_norm(ctx0, inpL, hparams,
@@ -10363,8 +10382,12 @@ struct llm_build_context {
                 cb(cur, "ffn_out", il);
             }
 
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
         }
 
         cur = llm_build_norm(ctx0, inpL, hparams,
@@ -10476,6 +10499,7 @@ struct llm_build_context {
             cb(cur, "ffn_out", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -10593,6 +10617,7 @@ struct llm_build_context {
             cb(cur, "ffn_out", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -10734,6 +10759,7 @@ struct llm_build_context {
             cb(cur, "hidden_scaled_ffn", -1);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -10846,6 +10872,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -10962,7 +10989,9 @@ struct llm_build_context {
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
             cb(cur, "ffn_out", il);
+
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -11111,6 +11140,7 @@ struct llm_build_context {
 
             // residual
             cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -11252,6 +11282,7 @@ struct llm_build_context {
             // add together residual + FFN + self-attention
             cur = ggml_add(ctx0, cur, inpL);
             cur = ggml_add(ctx0, cur, attn_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -11387,10 +11418,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
-            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
-            if (layer_dir != nullptr) {
-                cur = ggml_add(ctx0, cur, layer_dir);
-            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -11504,8 +11532,12 @@ struct llm_build_context {
                 cur = ggml_add(ctx0, cur, inpL);
                 cb(cur, "ffn_out", il);
 
-                inpL = ggml_add(ctx0, cur, attn_out);
-                cb(inpL, "l_out", il);
+                cur = ggml_add(ctx0, cur, attn_out);
+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
             } else {
                 // attention and ffn are computed sequentially
                 // x = x + attn(ln1(x))
@@ -11528,8 +11560,12 @@ struct llm_build_context {
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
 
-                inpL = ggml_add(ctx0, cur, ffn_inp);
-                cb(inpL, "l_out", il);
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
             }
         }
 
@@ -11656,10 +11692,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_out);
             cb(cur, "ffn_out", il);
 
-            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
-            if (layer_dir != nullptr) {
-                cur = ggml_add(ctx0, cur, layer_dir);
-            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -11892,6 +11925,7 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer

From 6777c544bdd8c5d9de3220d6e2557957bbbf2a4f Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@users.noreply.github.com>
Date: Wed, 26 Jun 2024 01:45:58 +0100
Subject: [PATCH 12/50] `json`: fix additionalProperties, allow space after
 enum/const (#7840)

* json: default additionalProperties to true

* json: don't force additional props after normal properties!

* json: allow space after enum/const

* json: update pydantic example to set additionalProperties: false

* json: prevent additional props from redefining a typed prop

* port not_strings to python, add trailing space

* fix not_strings & port to js+py

* Update json-schema-to-grammar.cpp

* fix _not_strings for substring overlaps

* json: fix additionalProperties default, uncomment tests

* json: add integ. test case for additionalProperties

* json: nit: simplify condition

* reformat grammar integ tests w/ R"""()""" strings where there's escapes

* update # tokens in server test: consts can now have trailing space
---
 common/json-schema-to-grammar.cpp             |  99 +++++-
 examples/json-schema-pydantic-example.py      |   6 +-
 examples/json_schema_to_grammar.py            |  76 ++++-
 .../server/public/json-schema-to-grammar.mjs  |  89 ++++-
 examples/server/tests/features/server.feature |   2 +-
 tests/test-grammar-integration.cpp            | 320 ++++++++----------
 tests/test-json-schema-to-grammar.cpp         | 150 ++++++--
 7 files changed, 497 insertions(+), 245 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 07d0e952d..b40821dad 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -614,6 +614,75 @@ private:
         return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
     }
 
+    /*
+        Returns a rule that matches a JSON string that is none of the provided strings
+        (the strings go into a trie; each trie level emits one branch per listed next char plus a [^"...] catch-all)
+        not_strings({"a"})
+            -> ["] ( [a] char+ | [^"a] char* )? ["] space
+        not_strings({"and", "also"})
+            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+    */
+    std::string _not_strings(const std::vector<std::string> & strings) {
+        // Prefix tree of the forbidden strings; std::map iterates children in ascending char order.
+        struct TrieNode {
+            std::map<char, TrieNode> children;
+            bool is_end_of_string; // true when one of the forbidden strings ends exactly at this node
+
+            TrieNode() : is_end_of_string(false) {}
+
+            void insert(const std::string & string) {
+                auto node = this;
+                for (char c : string) { // walks the std::string one char (byte) at a time
+                    node = &node->children[c]; // operator[] creates the child on first use
+                }
+                node->is_end_of_string = true;
+            }
+        };
+
+        TrieNode trie;
+        for (const auto & s : strings) {
+            trie.insert(s);
+        }
+        // char_rule is whatever _add_primitive returns for "char"; it is spliced into the output where the examples above show `char`.
+        std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+        std::ostringstream out;
+        out << "[\"] ( ";
+        std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) { // appends the alternatives for one trie level to `out`
+            std::ostringstream rejects; // every char listed at this level, reused for the catch-all class below
+            auto first = true;
+            for (const auto & kv : node.children) {
+                rejects << kv.first;
+                if (first) {
+                    first = false;
+                } else {
+                    out << " | ";
+                }
+                out << "[" << kv.first << "]"; // NOTE(review): the char is written verbatim; presumably a key containing ']' or a backslash would need escaping - confirm
+                if (!kv.second.children.empty()) {
+                    out << " (";
+                    visit(kv.second);
+                    out << ")"; // NOTE(review): the group is not optional, so a proper prefix of a listed string (e.g. "an" for "and") appears to be rejected too - confirm intended
+                } else if (kv.second.is_end_of_string) {
+                    out << " " << char_rule << "+"; // leaf: the exact string is forbidden, so at least one more char must follow
+                }
+            }
+            if (!node.children.empty()) {
+                if (!first) {
+                    out << " | ";
+                }
+                out << "[^\"" << rejects.str() << "] " << char_rule << "*"; // catch-all: any char that is neither a quote nor listed at this level
+            }
+        };
+        visit(trie);
+        // The whole group is optional (so the empty string matches) unless "" itself was one of the forbidden strings.
+        out << " )";
+        if (!trie.is_end_of_string) {
+            out << "?";
+        }
+        out << " [\"] space";
+        return out.str();
+    }
+
     std::string _resolve_ref(const std::string & ref) {
         std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
         if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
@@ -634,6 +703,7 @@ private:
         std::vector<std::string> required_props;
         std::vector<std::string> optional_props;
         std::unordered_map<std::string, std::string> prop_kv_rule_names;
+        std::vector<std::string> prop_names;
         for (const auto & kv : properties) {
             const auto &prop_name = kv.first;
             const auto &prop_schema = kv.second;
@@ -648,11 +718,18 @@ private:
             } else {
                 optional_props.push_back(prop_name);
             }
+            prop_names.push_back(prop_name);
         }
-        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
+        if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
             std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
-            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
-            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+            std::string value_rule =
+                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
+                : _add_primitive("value", PRIMITIVE_RULES.at("value"));
+
+            auto key_rule =
+                prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
+                : _add_rule(sub_name + "-k", _not_strings(prop_names));
+            std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
             prop_kv_rule_names["*"] = kv_rule;
             optional_props.push_back("*");
         }
@@ -678,15 +755,11 @@ private:
                 }
                 std::string k = ks[0];
                 std::string kv_rule_name = prop_kv_rule_names[k];
-                if (k == "*") {
-                    res = _add_rule(
-                        name + (name.empty() ? "" : "-") + "additional-kvs",
-                        kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
-                    );
-                } else if (first_is_optional) {
-                    res = "( \",\" space " + kv_rule_name + " )?";
+                std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
+                if (first_is_optional) {
+                    res = comma_ref + (k == "*" ? "*" : "?");
                 } else {
-                    res = kv_rule_name;
+                    res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
                 }
                 if (ks.size() > 1) {
                     res += " " + _add_rule(
@@ -824,13 +897,13 @@ public:
             }
             return _add_rule(rule_name, _generate_union_rule(name, schema_types));
         } else if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
         } else if (schema.contains("enum")) {
             std::vector<std::string> enum_values;
             for (const auto & v : schema["enum"]) {
                 enum_values.push_back(_generate_constant_rule(v));
             }
-            return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
+            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
         } else if ((schema_type.is_null() || schema_type == "object")
                 && (schema.contains("properties") ||
                     (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py
index 2240188cd..2a24f8118 100644
--- a/examples/json-schema-pydantic-example.py
+++ b/examples/json-schema-pydantic-example.py
@@ -3,7 +3,7 @@
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
 
-from pydantic import BaseModel, TypeAdapter
+from pydantic import BaseModel, Extra, TypeAdapter
 from annotated_types import MinLen
 from typing import Annotated, List, Optional
 import json, requests
@@ -50,12 +50,16 @@ else:
 if __name__ == '__main__':
 
     class QAPair(BaseModel):
+        class Config:
+            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
         question: str
         concise_answer: str
         justification: str
         stars: Annotated[int, Field(ge=1, le=5)]
 
     class PyramidalSummary(BaseModel):
+        class Config:
+            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
         title: str
         summary: str
         question_answers: Annotated[List[QAPair], MinLen(2)]
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 86500a8c3..3f3132f88 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -4,8 +4,7 @@ import itertools
 import json
 import re
 import sys
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-
+from typing import Any, List, Optional, Set, Tuple, Union
 
 def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
 
@@ -276,6 +275,51 @@ class SchemaConverter:
 
         return ''.join(('(', *recurse(0), ')'))
 
+    def _not_strings(self, strings):  # returns rule text for a quoted string that is none of `strings`
+        class TrieNode:
+            def __init__(self):
+                self.children = {}
+                self.is_end_of_string = False  # True when one of `strings` ends exactly at this node
+
+            def insert(self, string):
+                node = self
+                for c in string:
+                    node = node.children.setdefault(c, TrieNode())
+                node.is_end_of_string = True
+        # Build a prefix tree holding every string that must be excluded.
+        trie = TrieNode()
+        for s in strings:
+            trie.insert(s)
+
+        char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+        out = ['["] ( ']
+        # visit(node) appends to `out` one branch per child char (in sorted order), then a [^"...] catch-all for any other char.
+        def visit(node):
+            rejects = []  # chars listed at this level, reused for the catch-all class
+            first = True
+            for c in sorted(node.children.keys()):
+                child = node.children[c]
+                rejects.append(c)
+                if first:
+                    first = False
+                else:
+                    out.append(' | ')
+                out.append(f'[{c}]')  # NOTE(review): c is written verbatim; presumably ']' or a backslash in a key would need escaping - confirm
+                if child.children:
+                    out.append(f' (')
+                    visit(child)
+                    out.append(')')  # NOTE(review): group is not optional, so a proper prefix of a listed string appears to be rejected too - confirm intended
+                elif child.is_end_of_string:
+                    out.append(f' {char_rule}+')  # leaf: the exact string is excluded, so at least one more char must follow
+            if node.children:
+                if not first:
+                    out.append(' | ')
+                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
+        visit(trie)
+        # The whole group is optional (empty string allowed) unless "" itself is in `strings`.
+        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
+        return ''.join(out)
+
     def _add_rule(self, name, rule):
         esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
         if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -524,10 +568,10 @@ class SchemaConverter:
             return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
 
         elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
 
         elif 'enum' in schema:
-            rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
             return self._add_rule(rule_name, rule)
 
         elif schema_type in (None, 'object') and \
@@ -632,7 +676,7 @@ class SchemaConverter:
                 self._add_primitive(dep, dep_rule)
         return n
 
-    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
+    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
         prop_order = self._prop_order
         # sort by position in prop_order (if specified) then by original order
         sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
@@ -647,12 +691,16 @@ class SchemaConverter:
         required_props = [k for k in sorted_props if k in required]
         optional_props = [k for k in sorted_props if k not in required]
 
-        if additional_properties == True or isinstance(additional_properties, dict):
+        if additional_properties != False:
             sub_name = f'{name}{"-" if name else ""}additional'
-            value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
+            value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
+                self._add_primitive('value', PRIMITIVE_RULES['value'])
+            key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
+                else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
+
             prop_kv_rule_names["*"] = self._add_rule(
                 f'{sub_name}-kv',
-                self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+                f'{key_rule} ":" space {value_rule}'
             )
             optional_props.append("*")
 
@@ -667,15 +715,11 @@ class SchemaConverter:
             def get_recursive_refs(ks, first_is_optional):
                 [k, *rest] = ks
                 kv_rule_name = prop_kv_rule_names[k]
-                if k == '*':
-                    res = self._add_rule(
-                        f'{name}{"-" if name else ""}additional-kvs',
-                        f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
-                    )
-                elif first_is_optional:
-                    res = f'( "," space {kv_rule_name} )?'
+                comma_ref = f'( "," space {kv_rule_name} )'
+                if first_is_optional:
+                    res = comma_ref + ('*' if k == '*' else '?')
                 else:
-                    res = kv_rule_name
+                    res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
                 if len(rest) > 0:
                     res += ' ' + self._add_rule(
                         f'{name}{"-" if name else ""}{k}-rest',
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index f340f94bd..02015bbd4 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -532,6 +532,64 @@ export class SchemaConverter {
     return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
   }
 
+  _notStrings(strings) { // returns rule text for a quoted string that is none of `strings`
+    class TrieNode {
+      constructor() {
+        this.children = {};
+        this.isEndOfString = false; // true when one of `strings` ends exactly at this node
+      }
+
+      insert(str) {
+        let node = this;
+        for (const c of str) {
+          node = node.children[c] = node.children[c] || new TrieNode();
+        }
+        node.isEndOfString = true;
+      }
+    }
+    // Build a prefix tree holding every string that must be excluded.
+    const trie = new TrieNode();
+    for (const s of strings) {
+      trie.insert(s);
+    }
+
+    const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
+    const out = ['["] ( '];
+    // visit(node) pushes one branch per child char (keys sorted), then a [^"...] catch-all for any other char.
+    const visit = (node) => {
+      const rejects = []; // chars listed at this level, reused for the catch-all class
+      let first = true;
+      for (const c of Object.keys(node.children).sort()) {
+        const child = node.children[c];
+        rejects.push(c);
+        if (first) {
+          first = false;
+        } else {
+          out.push(' | ');
+        }
+        out.push(`[${c}]`); // NOTE(review): c is written verbatim; presumably ']' or a backslash in a key would need escaping - confirm
+        if (Object.keys(child.children).length > 0) {
+          out.push(' (');
+          visit(child);
+          out.push(')'); // NOTE(review): group is not optional, so a proper prefix of a listed string appears to be rejected too - confirm intended
+        } else if (child.isEndOfString) {
+          out.push(` ${charRuleName}+`); // leaf: the exact string is excluded, so at least one more char must follow
+        }
+      }
+      if (Object.keys(node.children).length > 0) {
+        if (!first) {
+          out.push(' | ');
+        }
+        out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
+      }
+    };
+
+    visit(trie);
+    // The whole group is optional (empty string allowed) unless "" itself is in `strings`.
+    out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
+    return out.join('');
+  }
+
   _resolveRef(ref) {
     let refName = ref.split('/').pop();
     if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
@@ -560,9 +618,9 @@ export class SchemaConverter {
     } else if (Array.isArray(schemaType)) {
       return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
     } else if ('const' in schema) {
-      return this._addRule(ruleName, this._generateConstantRule(schema.const));
+      return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
     } else if ('enum' in schema) {
-      const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
+      const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
       return this._addRule(ruleName, rule);
     } else if ((schemaType === undefined || schemaType === 'object') &&
                ('properties' in schema ||
@@ -599,7 +657,7 @@ export class SchemaConverter {
         }
       }
 
-      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
     } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
       const items = schema.items ?? schema.prefixItems;
       if (Array.isArray(items)) {
@@ -693,12 +751,19 @@ export class SchemaConverter {
     const requiredProps = sortedProps.filter(k => required.has(k));
     const optionalProps = sortedProps.filter(k => !required.has(k));
 
-    if (typeof additionalProperties === 'object' || additionalProperties === true) {
+    if (additionalProperties !== false) {
       const subName = `${name ?? ''}${name ? '-' : ''}additional`;
-      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
+      const valueRule =
+        additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
+        : this._addPrimitive('value', PRIMITIVE_RULES['value']);
+
+      const key_rule =
+        sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
+        : this._addRule(`${subName}-k`, this._notStrings(sortedProps));
+
       propKvRuleNames['*'] = this._addRule(
         `${subName}-kv`,
-        `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+        `${key_rule} ":" space ${valueRule}`);
       optionalProps.push('*');
     }
 
@@ -715,15 +780,11 @@ export class SchemaConverter {
         const [k, ...rest] = ks;
         const kvRuleName = propKvRuleNames[k];
         let res;
-        if (k === '*') {
-            res = this._addRule(
-                `${name ?? ''}${name ? '-' : ''}additional-kvs`,
-                `${kvRuleName} ( "," space ` + kvRuleName + ` )*`
-            )
-        } else if (firstIsOptional) {
-          res = `( "," space ${kvRuleName} )?`;
+        const commaRef = `( "," space ${kvRuleName} )`;
+        if (firstIsOptional) {
+          res = commaRef + (k === '*' ? '*' : '?');
         } else {
-          res = kvRuleName;
+          res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
         }
         if (rest.length > 0) {
           res += ' ' + this._addRule(
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index d21c09135..b55971454 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -82,7 +82,7 @@ Feature: llama.cpp server
 
     Examples: Prompts
       | response_format                                                     | n_predicted | re_content             |
-      | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"                   |
+      | {"type": "json_object", "schema": {"const": "42"}}                  | 6           | "42"                   |
       | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
       | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
 
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 5750d362a..23ef8324c 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -15,8 +15,6 @@
 
 using json = nlohmann::ordered_json;
 
-//#define INCLUDE_FAILING_TESTS 1
-
 static llama_grammar* build_grammar(const std::string & grammar_str) {
     auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
 
@@ -754,7 +752,7 @@ static void test_json_schema() {
         )""",
         // Passing strings
         {
-            "{}",
+            R"""({})""",
             R"""({"foo": "bar"})""",
         },
         // Failing strings
@@ -762,7 +760,7 @@ static void test_json_schema() {
             "",
             "[]",
             "null",
-            "\"\"",
+            R"""("")""",
             "true",
         }
     );
@@ -770,16 +768,14 @@ static void test_json_schema() {
     test_schema(
         "exotic formats (list)",
         // Schema
-        R"""(
-            {
+        R"""({
             "items": [
                 { "format": "date" },
                 { "format": "uuid" },
                 { "format": "time" },
                 { "format": "date-time" }
             ]
-            }
-        )""",
+        })""",
         // Passing strings
         {
             // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
@@ -798,125 +794,113 @@ static void test_json_schema() {
     test_schema(
         "string",
         // Schema
-        R"""(
-            {
-                "type": "string"
-            }
-        )""",
+        R"""({
+            "type": "string"
+        })""",
         // Passing strings
         {
-            "\"foo\"",
-            "\"bar\"",
-            "\"\"",
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("")""",
         },
         // Failing strings
         {
-            "{}",
-            "\"foo\": \"bar\"",
+            R"""({})""",
+            R"""("foo": "bar")""",
         }
     );
 
     test_schema(
         "string w/ min length 1",
         // Schema
-        R"""(
-            {
-                "type": "string",
-                "minLength": 1
-            }
-        )""",
+        R"""({
+            "type": "string",
+            "minLength": 1
+        })""",
         // Passing strings
         {
-            "\"foo\"",
-            "\"bar\"",
+            R"""("foo")""",
+            R"""("bar")""",
         },
         // Failing strings
         {
-            "\"\"",
-            "{}",
-            "\"foo\": \"bar\"",
+            R"""("")""",
+            R"""({})""",
+            R"""("foo": "bar")""",
         }
     );
 
     test_schema(
         "string w/ min length 3",
         // Schema
-        R"""(
-            {
+        R"""({
                 "type": "string",
                 "minLength": 3
-            }
-        )""",
+        })""",
         // Passing strings
         {
-            "\"foo\"",
-            "\"bar\"",
-            "\"foobar\"",
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("foobar")""",
         },
         // Failing strings
         {
-            "\"\"",
-            "\"f\"",
-            "\"fo\"",
+            R"""("")""",
+            R"""("f")""",
+            R"""("fo")""",
         }
     );
 
     test_schema(
         "string w/ max length",
         // Schema
-        R"""(
-            {
-                "type": "string",
-                "maxLength": 3
-            }
-        )""",
+        R"""({
+            "type": "string",
+            "maxLength": 3
+        })""",
         // Passing strings
         {
-            "\"foo\"",
-            "\"bar\"",
-            "\"\"",
-            "\"f\"",
-            "\"fo\"",
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("")""",
+            R"""("f")""",
+            R"""("fo")""",
         },
         // Failing strings
         {
-            "\"foobar\"",
+            R"""("foobar")""",
         }
     );
 
     test_schema(
         "string w/ min & max length",
         // Schema
-        R"""(
-            {
-                "type": "string",
-                "minLength": 1,
-                "maxLength": 4
-            }
-        )""",
+        R"""({
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 4
+        })""",
         // Passing strings
         {
-            "\"foo\"",
-            "\"bar\"",
-            "\"f\"",
-            "\"barf\"",
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("f")""",
+            R"""("barf")""",
         },
         // Failing strings
         {
-            "\"\"",
-            "\"barfo\"",
-            "\"foobar\"",
+            R"""("")""",
+            R"""("barfo")""",
+            R"""("foobar")""",
         }
     );
 
     test_schema(
         "boolean",
         // Schema
-        R"""(
-            {
-                "type": "boolean"
-            }
-        )""",
+        R"""({
+            "type": "boolean"
+        })""",
         // Passing strings
         {
             "true",
@@ -924,122 +908,112 @@ static void test_json_schema() {
         },
         // Failing strings
         {
-            "\"\"",
-            "\"true\"",
-            "True",
-            "FALSE",
+            R"""("")""",
+            R"""("true")""",
+            R"""(True)""",
+            R"""(FALSE)""",
         }
     );
 
     test_schema(
         "integer",
         // Schema
-        R"""(
-            {
-                "type": "integer"
-            }
-        )""",
+        R"""({
+            "type": "integer"
+        })""",
         // Passing strings
         {
-            "0",
-            "12345",
-            "1234567890123456"
+            R"""(0)""",
+            R"""(12345)""",
+            R"""(1234567890123456)""",
         },
         // Failing strings
         {
-            "",
-            "01",
-            "007",
-            "12345678901234567"
+            R"""()""",
+            R"""(01)""",
+            R"""(007)""",
+            R"""(12345678901234567  )""",
         }
     );
 
     test_schema(
         "string const",
         // Schema
-        R"""(
-            {
-                "const": "foo"
-            }
-        )""",
+        R"""({
+            "const": "foo"
+        })""",
         // Passing strings
         {
-            "\"foo\"",
+            R"""("foo")""",
         },
         // Failing strings
         {
-            "foo",
-            "\"bar\"",
+            R"""(foo)""",
+            R"""("bar")""",
         }
     );
 
     test_schema(
         "non-string const",
         // Schema
-        R"""(
-            {
-                "const": true
-            }
-        )""",
+        R"""({
+            "const": true
+        })""",
         // Passing strings
         {
-            "true",
+            R"""(true)""",
         },
         // Failing strings
         {
-            "",
-            "foo",
-            "\"true\"",
+            R"""()""",
+            R"""(foo)""",
+            R"""("true")""",
         }
     );
 
     test_schema(
         "non-string const",
         // Schema
-        R"""(
-            {
-                "enum": ["red", "amber", "green", null, 42, ["foo"]]
-            }
-        )""",
+        R"""({
+            "enum": ["red", "amber", "green", null, 42, ["foo"]]
+        })""",
         // Passing strings
         {
-            "\"red\"",
-            "null",
-            "42",
-            "[\"foo\"]",
+            R"""("red")""",
+            R"""(null)""",
+            R"""(42)""",
+            R"""(["foo"])""",
         },
         // Failing strings
         {
-            "",
-            "420",
-            "true",
-            "foo",
+            R"""()""",
+            R"""(420)""",
+            R"""(true)""",
+            R"""(foo)""",
         }
     );
 
     test_schema(
         "min+max items",
         // Schema
-        R"""(
-            {
-                "items": {
-                    "type": ["number", "integer"]
-                },
-                "minItems": 3,
-                "maxItems": 5
-            }
-        )""",
+        R"""({
+            "items": {
+                "type": ["number", "integer"]
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
         // Passing strings
         {
-            "[1, 2, 3]",
-            "[1, 2, 3, 4]",
-            "[1, 2, 3, 4, 5]",
+            R"""([1, 2, 3])""",
+            R"""([1, 2, 3, 4])""",
+            R"""([1, 2, 3, 4, 5])""",
         },
         // Failing strings
         {
-            "[1, 2]",
-            "[1, 2, 3, 4, 5, 6]",
-            "1"
+            R"""([1, 2])""",
+            R"""([1, 2, 3, 4, 5, 6])""",
+            R"""(1)""",
         }
     );
 
@@ -1047,16 +1021,14 @@ static void test_json_schema() {
     test_schema(
         "object properties",
         // Schema
-        R"""(
-            {
+        R"""({
             "type": "object",
             "properties": {
                 "number": { "type": "number" },
                 "street_name": { "type": "string" },
                 "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
             }
-            }
-        )""",
+        })""",
         // Passing strings
         {
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
@@ -1066,12 +1038,8 @@ static void test_json_schema() {
             // "By extension, even an empty object is valid"
             R"""({})""",
             // "By default, providing additional properties is valid"
-#ifdef INCLUDE_FAILING_TESTS
-            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
-            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
-#endif
         },
         // Failing strings
         {
@@ -1084,13 +1052,35 @@ static void test_json_schema() {
         }
     );
 
+    test_schema(
+        "additional properties can't override other properties",
+        R"""({
+            "properties": {
+                "a": {"type": "integer"},
+                "b": {"type": "integer"}
+            },
+            "additionalProperties": true
+        })""",
+        // Passing strings
+        {
+            R"""({"a": 42})""",
+            R"""({"c": ""})""",
+            R"""({"a": 42, "c": ""})""",
+            R"""({"a_": ""})""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""({"a": ""})""",
+            R"""({"a": "", "b": ""})""",
+        }
+    );
 
     // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
     test_schema(
         "object properties, additionalProperties: true",
         // Schema
-        R"""(
-            {
+        R"""({
             "type": "object",
             "properties": {
                 "number": { "type": "number" },
@@ -1098,26 +1088,18 @@ static void test_json_schema() {
                 "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
             },
             "additionalProperties": true
-            }
-        )""",
+        })""",
         // Passing strings
         {
             // "By extension, even an empty object is valid"
             R"""({})""",
-#ifdef INCLUDE_FAILING_TESTS
-            // TODO: Following line should pass and doesn't
             R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
             // "By default, leaving out properties is valid"
-            // TODO: Following line should pass and doesn't
             R"""({ "street_name": "Pennsylvania" })""",
-            // TODO: Following line should pass and doesn't
             R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
             // "By default, providing additional properties is valid"
-            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
-            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
-#endif
         },
         // Failing strings
         {
@@ -1132,8 +1114,7 @@ static void test_json_schema() {
     test_schema(
         "required + optional props each in original order",
         // Schema
-        R"""(
-            {
+        R"""({
             "type": "object",
             "properties": {
                 "number": { "type": "number" },
@@ -1141,18 +1122,15 @@ static void test_json_schema() {
                 "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
             },
             "additionalProperties": false
-            }
-        )""",
+        })""",
         // Passing strings
         {
             R"""({ "street_name": "Pennsylvania" })""",
             R"""({ "number": 1600, "street_type":"Avenue"})""",
             R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
-#ifdef INCLUDE_FAILING_TESTS
-            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            // Spaces are permitted around enum values
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
-#endif
         },
         // Failing strings
         {
@@ -1166,18 +1144,16 @@ static void test_json_schema() {
     test_schema(
         "required + optional props each in original order",
         // Schema
-        R"""(
-            {
-                "properties": {
-                    "b": {"type": "string"},
-                    "a": {"type": "string"},
-                    "d": {"type": "string"},
-                    "c": {"type": "string"}
-                },
-                "required": ["a", "b"],
-                "additionalProperties": false
-            }
-        )""",
+        R"""({
+            "properties": {
+                "b": {"type": "string"},
+                "a": {"type": "string"},
+                "d": {"type": "string"},
+                "c": {"type": "string"}
+            },
+            "required": ["a", "b"],
+            "additionalProperties": false
+        })""",
         // Passing strings
         {
             R"""({"b": "foo", "a": "bar"})""",
@@ -1197,8 +1173,7 @@ static void test_json_schema() {
     test_schema(
         "required props",
         // Schema
-        R"""(
-            {
+        R"""({
             "$schema": "https://json-schema.org/draft/2020-12/schema",
             "$id": "https://example.com/product.schema.json",
             "title": "Product",
@@ -1244,8 +1219,7 @@ static void test_json_schema() {
                 }
             },
             "required": [ "productId", "productName", "price" ]
-            }
-        )""",
+        })""",
         // Passing strings
         {
             R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 2e591bd71..1e69cb6ef 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -473,7 +473,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
             "const": "foo"
         })""",
         R"""(
-            root ::= "\"foo\""
+            root ::= "\"foo\"" space
             space ::= | " " | "\n" [ \t]{0,20}
         )"""
     });
@@ -485,7 +485,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
             "const": 123
         })""",
         R"""(
-            root ::= "123"
+            root ::= "123" space
             space ::= | " " | "\n" [ \t]{0,20}
         )"""
     });
@@ -497,7 +497,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
             "enum": ["red", "amber", "green", null, 42, ["foo"]]
         })""",
         R"""(
-            root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
+            root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]") space
             space ::= | " " | "\n" [ \t]{0,20}
         )"""
     });
@@ -816,13 +816,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         })""",
         R"""(
             additional-kv ::= string ":" space additional-value
-            additional-kvs ::= additional-kv ( "," space additional-kv )*
             additional-value ::= "[" space (number ("," space number)*)? "]" space
             char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             decimal-part ::= [0-9]{1,16}
             integral-part ::= [0] | [1-9] [0-9]{0,15}
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space  (additional-kvs )? "}" space
+            root ::= "{" space  (additional-kv ( "," space additional-kv )* )? "}" space
             space ::= | " " | "\n" [ \t]{0,20}
             string ::= "\"" char* "\"" space
         )"""
@@ -899,13 +898,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         })""",
         R"""(
             a-kv ::= "\"a\"" space ":" space number
-            additional-kv ::= string ":" space string
-            additional-kvs ::= additional-kv ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space string
             char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             decimal-part ::= [0-9]{1,16}
             integral-part ::= [0] | [1-9] [0-9]{0,15}
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
+            root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
             space ::= | " " | "\n" [ \t]{0,20}
             string ::= "\"" char* "\"" space
         )"""
@@ -923,16 +922,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         })""",
         R"""(
             a-kv ::= "\"a\"" space ":" space number
-            a-rest ::= additional-kvs
-            additional-kv ::= string ":" space number
-            additional-kvs ::= additional-kv ( "," space additional-kv )*
+            a-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space number
             char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             decimal-part ::= [0-9]{1,16}
             integral-part ::= [0] | [1-9] [0-9]{0,15}
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space  (a-kv a-rest | additional-kvs )? "}" space
+            root ::= "{" space  (a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
             space ::= | " " | "\n" [ \t]{0,20}
-            string ::= "\"" char* "\"" space
         )"""
     });
 
@@ -942,25 +940,100 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         R"""({
             "type": "object",
             "properties": {
-                "a": {"type": "number"},
-                "b": {"type": "number"}
+                "and": {"type": "number"},
+                "also": {"type": "number"}
             },
-            "required": ["a"],
+            "required": ["and"],
             "additionalProperties": {"type": "number"}
         })""",
         R"""(
-            a-kv ::= "\"a\"" space ":" space number
-            additional-kv ::= string ":" space number
-            additional-kvs ::= additional-kv ( "," space additional-kv )*
-            b-kv ::= "\"b\"" space ":" space number
-            b-rest ::= additional-kvs
+            additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space number
+            also-kv ::= "\"also\"" space ":" space number
+            also-rest ::= ( "," space additional-kv )*
+            and-kv ::= "\"and\"" space ":" space number
             char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             decimal-part ::= [0-9]{1,16}
             integral-part ::= [0] | [1-9] [0-9]{0,15}
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
+            root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional props with empty name",
+        R"""({
+            "properties": {
+                "": {"type": "integer"},
+                "a": {"type": "integer"}
+            },
+            "additionalProperties": {"type": "integer"}
+        })""",
+        R"""(
+            -kv ::= "\"\"" space ":" space root
+            -rest ::= ( "," space a-kv )? a-rest
+            a-kv ::= "\"a\"" space ":" space integer
+            a-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["] space
+            additional-kv ::= additional-k ":" space integer
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= ("-"? integral-part) space
+            root0 ::= "{" space  (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional props with nested names",
+        R"""({
+            "properties": {
+                "a": {"type": "integer"},
+                "aa": {"type": "integer"}
+            },
+            "additionalProperties": {"type": "integer"}
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space integer
+            a-rest ::= ( "," space aa-kv )? aa-rest
+            aa-kv ::= "\"aa\"" space ":" space integer
+            aa-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space integer
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= "{" space  (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional props with common prefix",
+        R"""({
+            "properties": {
+                "ab": {"type": "integer"},
+                "ac": {"type": "integer"}
+            },
+            "additionalProperties": {"type": "integer"}
+        })""",
+        R"""(
+            ab-kv ::= "\"ab\"" space ":" space integer
+            ab-rest ::= ( "," space ac-kv )? ac-rest
+            ac-kv ::= "\"ac\"" space ":" space integer
+            ac-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space integer
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= "{" space  (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? "}" space
             space ::= | " " | "\n" [ \t]{0,20}
-            string ::= "\"" char* "\"" space
         )"""
     });
 
@@ -1015,15 +1088,28 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         R"""(
             alternative-0 ::= foo
             alternative-1 ::= bar
-            bar ::= "{" space  (bar-b-kv )? "}" space
+            array ::= "[" space ( value ("," space value)* )? "]" space
+            bar ::= "{" space  (bar-b-kv bar-b-rest | bar-additional-kv ( "," space bar-additional-kv )* )? "}" space
+            bar-additional-k ::= ["] ( [b] char+ | [^"b] char* )? ["] space
+            bar-additional-kv ::= bar-additional-k ":" space value
             bar-b-kv ::= "\"b\"" space ":" space number
+            bar-b-rest ::= ( "," space bar-additional-kv )*
+            boolean ::= ("true" | "false") space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             decimal-part ::= [0-9]{1,16}
-            foo ::= "{" space  (foo-a-kv )? "}" space
+            foo ::= "{" space  (foo-a-kv foo-a-rest | foo-additional-kv ( "," space foo-additional-kv )* )? "}" space
             foo-a-kv ::= "\"a\"" space ":" space number
+            foo-a-rest ::= ( "," space foo-additional-kv )*
+            foo-additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            foo-additional-kv ::= foo-additional-k ":" space value
             integral-part ::= [0] | [1-9] [0-9]{0,15}
+            null ::= "null" space
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
             root ::= alternative-0 | alternative-1
             space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+            value ::= object | array | string | number | boolean | null
         )"""
     });
 
@@ -1059,15 +1145,25 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         })""",
         R"""(
             a-kv ::= "\"a\"" space ":" space number
+            additional-k ::= ["] ( [a] char+ | [b] char+ | [c] char+ | [d] char+ | [^"abcd] char* )? ["] space
+            additional-kv ::= additional-k ":" space value
+            array ::= "[" space ( value ("," space value)* )? "]" space
             b-kv ::= "\"b\"" space ":" space number
+            boolean ::= ("true" | "false") space
             c-kv ::= "\"c\"" space ":" space number
+            c-rest ::= ( "," space additional-kv )*
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             d-kv ::= "\"d\"" space ":" space number
-            d-rest ::= ( "," space c-kv )?
+            d-rest ::= ( "," space c-kv )? c-rest
             decimal-part ::= [0-9]{1,16}
             integral-part ::= [0] | [1-9] [0-9]{0,15}
+            null ::= "null" space
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv c-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
             space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+            value ::= object | array | string | number | boolean | null
         )"""
     });
 

From 9b2f16f8055265c67e074025350736adc1ea0666 Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@users.noreply.github.com>
Date: Wed, 26 Jun 2024 01:46:35 +0100
Subject: [PATCH 13/50] `json`: better support for "type" unions (e.g. nullable
 arrays w/ typed items) (#7863)

* json: better support for "type" arrays (e.g. `{"type": ["array", "null"], "items": {"type": "string"}}`)

* json: add test for type: [array, null] fix

* update tests
---
 common/json-schema-to-grammar.cpp             |  4 ++-
 examples/json_schema_to_grammar.py            |  2 +-
 .../server/public/json-schema-to-grammar.mjs  |  2 +-
 tests/test-grammar-integration.cpp            | 25 +++++++++++++++
 tests/test-json-schema-to-grammar.cpp         | 32 +++++++++++++++++++
 5 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index b40821dad..2f233e2e7 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -893,7 +893,9 @@ public:
         } else if (schema_type.is_array()) {
             std::vector<json> schema_types;
             for (const auto & t : schema_type) {
-                schema_types.push_back({{"type", t}});
+                json schema_copy(schema);
+                schema_copy["type"] = t;
+                schema_types.push_back(schema_copy);
             }
             return _add_rule(rule_name, _generate_union_rule(name, schema_types));
         } else if (schema.contains("const")) {
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 3f3132f88..92f6e3d47 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -565,7 +565,7 @@ class SchemaConverter:
             return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
 
         elif isinstance(schema_type, list):
-            return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
+            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
 
         elif 'const' in schema:
             return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index 02015bbd4..06d76edde 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -616,7 +616,7 @@ export class SchemaConverter {
     } else if (schema.oneOf || schema.anyOf) {
       return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
     } else if (Array.isArray(schemaType)) {
-      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
+      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
     } else if ('const' in schema) {
       return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
     } else if ('enum' in schema) {
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 23ef8324c..0e21dc795 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -993,6 +993,31 @@ static void test_json_schema() {
         }
     );
 
+    test_schema(
+        "",
+        // Schema
+        R"""(
+            {
+                "type": ["array", "null"],
+                "items": { "type": "string" }
+            }
+        )""",
+        // Passing strings
+        {
+            "null",
+            "[]",
+            "[\"123\"]",
+            "[\"foo\", \"bar\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "[123]",
+            "\"foo\"",
+            "[\"foo\", 42]",
+        }
+    );
+
     test_schema(
         "min+max items",
         // Schema
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 1e69cb6ef..3aaa11833 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -502,6 +502,38 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         )"""
     });
 
+    test({
+        SUCCESS,
+        "string array",
+        R"""({
+            "type": "array",
+            "prefixItems": { "type": "string" }
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "[" space (string ("," space string)*)? "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "nullable string array",
+        R"""({
+            "type": ["array", "null"],
+            "prefixItems": { "type": "string" }
+        })""",
+        R"""(
+            alternative-0 ::= "[" space (string ("," space string)*)? "]" space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            null ::= "null" space
+            root ::= alternative-0 | null
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
     test({
         SUCCESS,
         "tuple1",

From 494165f3b6c4cbcd793123cb57fb3e1f5477f1db Mon Sep 17 00:00:00 2001
From: Eddie-Wang <wangjinheng1120@163.com>
Date: Wed, 26 Jun 2024 14:27:46 +0800
Subject: [PATCH 14/50] llama : extend llm_build_ffn() to support _scale
 tensors (#8103)

---
 llama.cpp | 255 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 135 insertions(+), 120 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 989c73149..f78594a6f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7212,10 +7212,13 @@ static struct ggml_tensor * llm_build_ffn(
          struct ggml_tensor * cur,
          struct ggml_tensor * up,
          struct ggml_tensor * up_b,
+         struct ggml_tensor * up_s,
          struct ggml_tensor * gate,
          struct ggml_tensor * gate_b,
+         struct ggml_tensor * gate_s,
          struct ggml_tensor * down,
          struct ggml_tensor * down_b,
+         struct ggml_tensor * down_s,
          struct ggml_tensor * act_scales,
             llm_ffn_op_type   type_op,
           llm_ffn_gate_type   type_gate,
@@ -7229,6 +7232,11 @@ static struct ggml_tensor * llm_build_ffn(
         cb(tmp, "ffn_up_b", il);
     }
 
+    if (up_s) {
+        tmp = ggml_mul(ctx, tmp, up_s);
+        cb(tmp, "ffn_up_s", il);
+    }
+
     if (gate) {
         switch (type_gate) {
             case LLM_FFN_SEQ:
@@ -7247,6 +7255,12 @@ static struct ggml_tensor * llm_build_ffn(
             cur = ggml_add(ctx, cur, gate_b);
             cb(cur, "ffn_gate_b", il);
         }
+
+        if (gate_s) {
+            cur = ggml_mul(ctx, cur, gate_s);
+            cb(cur, "ffn_gate_s", il);
+        }
+
     } else {
         cur = tmp;
     }
@@ -7286,7 +7300,10 @@ static struct ggml_tensor * llm_build_ffn(
         cb(cur, "ffn_gate_par", il);
     }
 
-    cur = ggml_mul_mat(ctx, down, cur);
+    if (down) {
+        cur = ggml_mul_mat(ctx, down, cur);
+    }
+
     if (down_b) {
         cb(cur, "ffn_down", il);
     }
@@ -7295,6 +7312,11 @@ static struct ggml_tensor * llm_build_ffn(
         cur = ggml_add(ctx, cur, down_b);
     }
 
+    if (down_s) {
+        cur = ggml_mul(ctx, cur, down_s);
+        cb(cur, "ffn_down_s", il);
+    }
+
     return cur;
 }
 
@@ -8003,9 +8025,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -8137,9 +8159,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -8242,9 +8264,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -8358,9 +8380,9 @@ struct llm_build_context {
             // feed forward
             {
                 cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
-                        model.layers[il].ffn_up,   NULL,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        NULL,                      NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -8749,9 +8771,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -8841,9 +8863,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -9026,23 +9048,23 @@ struct llm_build_context {
             // feed-forward network
             if (model.arch == LLM_ARCH_BERT) {
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
             } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   NULL,                        NULL,
+                        model.layers[il].ffn_gate, NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
             } else {
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             }
@@ -9138,9 +9160,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -9276,9 +9298,9 @@ struct llm_build_context {
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -9425,9 +9447,9 @@ struct llm_build_context {
                     cur = inpSA;
                 }
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -9538,9 +9560,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -9651,9 +9673,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
@@ -9788,9 +9810,9 @@ struct llm_build_context {
                 cb(cur_gate, "ffn_shexp_gate", il);
 
                 ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up_shexp,   NULL,
-                        model.layers[il].ffn_gate_shexp, NULL,
-                        model.layers[il].ffn_down_shexp, NULL,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur_ffn, "ffn_shexp", il);
@@ -9917,9 +9939,9 @@ struct llm_build_context {
             // FF
             {
                 ffn_output = llm_build_ffn(ctx0, attn_norm_output,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(ffn_output, "ffn_out", il);
@@ -10155,9 +10177,9 @@ struct llm_build_context {
             // feed-forward network
             {
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -10263,9 +10285,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -10374,9 +10396,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -10491,9 +10513,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
@@ -10609,9 +10631,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
@@ -10746,9 +10768,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -10863,9 +10885,9 @@ struct llm_build_context {
             // feed-forward network
             {
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -10983,9 +11005,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
             cb(cur, "ffn_out", il);
@@ -11271,9 +11293,9 @@ struct llm_build_context {
             // feed-forward network
             {
                 cur = llm_build_ffn(ctx0, ffn_inp,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -11408,9 +11430,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
@@ -11522,9 +11544,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -11553,9 +11575,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -11662,9 +11684,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
@@ -11884,9 +11906,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -11912,9 +11934,9 @@ struct llm_build_context {
                 // FFN shared expert
                 {
                     ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
-                            model.layers[il].ffn_up_shexp,   NULL,
-                            model.layers[il].ffn_gate_shexp, NULL,
-                            model.layers[il].ffn_down_shexp, NULL,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
                             NULL,
                             LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                     cb(ffn_shexp, "ffn_shexp", il);
@@ -12017,7 +12039,7 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        nullptr, nullptr,
+                        NULL, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 
                 cur = llm_build_norm(ctx0, cur, hparams,
@@ -12044,35 +12066,28 @@ struct llm_build_context {
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward forward
-            if (model.layers[il].ffn_gate_inp == nullptr) {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
 
-                struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
-                tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
-                cb(tmp, "ffn_up", il);
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL, model.layers[il].ffn_up_scale,
+                    model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+                    NULL,                      NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_sub_out", il);
 
-                cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
-                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
-                cb(cur, "ffn_gate", il);
+            cur = llm_build_norm(ctx0, cur, hparams,
+                            model.layers[il].ffn_sub_norm, NULL,
+                            LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_sub_norm", il);
 
-                cur = ggml_silu(ctx0, cur);
-                cb(cur, "ffn_silu", il);
+            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            cb(cur, "ffn_down", il);
 
-                cur = ggml_mul(ctx0, cur, tmp);
-                cb(cur, "ffn_gate_par", il);
-
-                cur = llm_build_norm(ctx0, cur, hparams,
-                                model.layers[il].ffn_sub_norm, NULL,
-                                LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_sub_norm", il);
-
-                cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
-                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
-                cb(cur, "ffn_down", il);
-            }
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "l_out", il);
 

From c8771ab5f89387cdd7d9a8a69280dac46b45e02f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Wed, 26 Jun 2024 08:28:02 +0200
Subject: [PATCH 15/50] CUDA: fix misaligned shared memory read (#8123)

---
 ggml-cuda/mma.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-cuda/mma.cuh b/ggml-cuda/mma.cuh
index 0301a52f9..5d87dd8e6 100644
--- a/ggml-cuda/mma.cuh
+++ b/ggml-cuda/mma.cuh
@@ -23,7 +23,7 @@ struct mma_int_A_I16K4 {
 
     __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
 #if defined(INT8_MMA_AVAILABLE)
-        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        const int * xs = xs0 + (threadIdx.x%I)*stride;
         asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
             : "+r"(x[0]), "+r"(x[1])
             : "l"(xs));

From 88540445615e77a0177fcca43aaa8e9d8eea6864 Mon Sep 17 00:00:00 2001
From: Isaac McFadyen <isaac@imcf.me>
Date: Wed, 26 Jun 2024 02:29:28 -0400
Subject: [PATCH 16/50] Clarify default MMQ for CUDA and LLAMA_CUDA_FORCE_MMQ
 flag (#8115)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add message about int8 support

* Add suggestions from review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a54ee3951..95d970d83 100644
--- a/README.md
+++ b/README.md
@@ -511,7 +511,7 @@ Building the program with BLAS support may lead to some performance improvements
   | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
   | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
   | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). Speed for large batch sizes will be worse but VRAM consumption will be lower.                    |
+  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
   | LLAMA_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
   | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
   | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |

From f3f65429c44bb195a9195bfdc19a30a79709db7b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Jun 2024 18:33:02 +0300
Subject: [PATCH 17/50] llama : reorganize source code + improve CMake (#8006)

* scripts : update sync [no ci]

* files : relocate [no ci]

* ci : disable kompute build [no ci]

* cmake : fixes [no ci]

* server : fix mingw build

ggml-ci

* cmake : minor [no ci]

* cmake : link math library [no ci]

* cmake : build normal ggml library (not object library) [no ci]

* cmake : fix kompute build

ggml-ci

* make,cmake : fix LLAMA_CUDA + replace GGML_CDEF_PRIVATE

ggml-ci

* move public backend headers to the public include directory (#8122)

* move public backend headers to the public include directory

* nix test

* spm : fix metal header

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* scripts : fix sync paths [no ci]

* scripts : sync ggml-blas.h [no ci]

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 .devops/nix/package.nix                       |   24 +-
 .github/labeler.yml                           |   28 +-
 .github/workflows/bench.yml                   |    2 +-
 .github/workflows/build.yml                   |   74 +-
 .github/workflows/server.yml                  |    6 +-
 .gitignore                                    |    1 +
 .gitmodules                                   |    2 +-
 CMakeLists.txt                                | 1366 +----------------
 CMakePresets.json                             |    6 +-
 Makefile                                      | 1068 ++++++++-----
 Package.swift                                 |   21 +-
 README-sycl.md                                |   24 +-
 README.md                                     |   62 +-
 ci/run.sh                                     |   10 +-
 {scripts => cmake}/build-info.cmake           |    0
 cmake/git-vars.cmake                          |   22 +
 .../llama-config.cmake.in                     |   32 +-
 common/CMakeLists.txt                         |    7 +-
 .../cmake/build-info-gen-cpp.cmake            |    4 +-
 docs/BLIS.md                                  |    6 +-
 examples/CMakeLists.txt                       |    4 +-
 examples/imatrix/README.md                    |    2 +-
 examples/llava/MobileVLM-README.md            |    2 +-
 examples/rpc/README.md                        |    8 +-
 examples/server/CMakeLists.txt                |   15 +-
 examples/sycl/build.sh                        |    4 +-
 examples/sycl/win-build-sycl.bat              |    4 +-
 ggml/CMakeLists.txt                           |  238 +++
 {cmake => ggml/cmake}/FindSIMD.cmake          |   12 +-
 .../ggml_vk_generate_shaders.py               |    0
 ggml-alloc.h => ggml/include/ggml-alloc.h     |    0
 ggml-backend.h => ggml/include/ggml-backend.h |    0
 ggml-blas.h => ggml/include/ggml-blas.h       |    0
 ggml-cuda.h => ggml/include/ggml-cuda.h       |    0
 ggml-kompute.h => ggml/include/ggml-kompute.h |    0
 ggml-metal.h => ggml/include/ggml-metal.h     |    0
 ggml-rpc.h => ggml/include/ggml-rpc.h         |    0
 ggml-sycl.h => ggml/include/ggml-sycl.h       |    4 +-
 ggml-vulkan.h => ggml/include/ggml-vulkan.h   |    0
 ggml.h => ggml/include/ggml.h                 |    0
 ggml/src/CMakeLists.txt                       | 1171 ++++++++++++++
 ggml-alloc.c => ggml/src/ggml-alloc.c         |    0
 .../src/ggml-backend-impl.h                   |    0
 ggml-backend.c => ggml/src/ggml-backend.c     |    0
 ggml-blas.cpp => ggml/src/ggml-blas.cpp       |    0
 ggml-common.h => ggml/src/ggml-common.h       |    0
 ggml-cuda.cu => ggml/src/ggml-cuda.cu         |    0
 {ggml-cuda => ggml/src/ggml-cuda}/acc.cu      |    0
 {ggml-cuda => ggml/src/ggml-cuda}/acc.cuh     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/arange.cu   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/arange.cuh  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/argsort.cu  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/argsort.cuh |    0
 {ggml-cuda => ggml/src/ggml-cuda}/binbcast.cu |    0
 .../src/ggml-cuda}/binbcast.cuh               |    0
 {ggml-cuda => ggml/src/ggml-cuda}/clamp.cu    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/clamp.cuh   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/common.cuh  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/concat.cu   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/concat.cuh  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/convert.cu  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/convert.cuh |    0
 {ggml-cuda => ggml/src/ggml-cuda}/cpy.cu      |    0
 {ggml-cuda => ggml/src/ggml-cuda}/cpy.cuh     |    0
 .../src/ggml-cuda}/dequantize.cuh             |    0
 {ggml-cuda => ggml/src/ggml-cuda}/diagmask.cu |    0
 .../src/ggml-cuda}/diagmask.cuh               |    0
 {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cu     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cuh    |    0
 .../src/ggml-cuda}/fattn-common.cuh           |    4 +-
 .../src/ggml-cuda}/fattn-tile-f16.cu          |    0
 .../src/ggml-cuda}/fattn-tile-f16.cuh         |    0
 .../src/ggml-cuda}/fattn-tile-f32.cu          |    0
 .../src/ggml-cuda}/fattn-tile-f32.cuh         |    0
 .../src/ggml-cuda}/fattn-vec-f16.cuh          |    0
 .../src/ggml-cuda}/fattn-vec-f32.cuh          |    0
 .../src/ggml-cuda}/fattn-wmma-f16.cuh         |    0
 {ggml-cuda => ggml/src/ggml-cuda}/fattn.cu    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/fattn.cuh   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/getrows.cu  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/getrows.cuh |    0
 {ggml-cuda => ggml/src/ggml-cuda}/im2col.cu   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/im2col.cuh  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/mma.cuh     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/mmq.cu      |    0
 {ggml-cuda => ggml/src/ggml-cuda}/mmq.cuh     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cu     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cuh    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/norm.cu     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/norm.cuh    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/pad.cu      |    0
 {ggml-cuda => ggml/src/ggml-cuda}/pad.cuh     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cu   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cuh  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/quantize.cu |    0
 .../src/ggml-cuda}/quantize.cuh               |    0
 {ggml-cuda => ggml/src/ggml-cuda}/rope.cu     |    0
 {ggml-cuda => ggml/src/ggml-cuda}/rope.cuh    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/scale.cu    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/scale.cuh   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/softmax.cu  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/softmax.cuh |    0
 {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cu  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cuh |    0
 .../fattn-vec-f16-instance-hs128-f16-f16.cu   |    0
 .../fattn-vec-f16-instance-hs128-f16-q4_0.cu  |    0
 .../fattn-vec-f16-instance-hs128-f16-q4_1.cu  |    0
 .../fattn-vec-f16-instance-hs128-f16-q5_0.cu  |    0
 .../fattn-vec-f16-instance-hs128-f16-q5_1.cu  |    0
 .../fattn-vec-f16-instance-hs128-f16-q8_0.cu  |    0
 .../fattn-vec-f16-instance-hs128-q4_0-f16.cu  |    0
 .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_1-f16.cu  |    0
 .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_0-f16.cu  |    0
 .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_1-f16.cu  |    0
 .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q8_0-f16.cu  |    0
 .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu |    0
 .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu |    0
 .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu |    0
 .../fattn-vec-f16-instance-hs256-f16-f16.cu   |    0
 .../fattn-vec-f16-instance-hs64-f16-f16.cu    |    0
 .../fattn-vec-f16-instance-hs64-f16-q4_0.cu   |    0
 .../fattn-vec-f16-instance-hs64-f16-q4_1.cu   |    0
 .../fattn-vec-f16-instance-hs64-f16-q5_0.cu   |    0
 .../fattn-vec-f16-instance-hs64-f16-q5_1.cu   |    0
 .../fattn-vec-f16-instance-hs64-f16-q8_0.cu   |    0
 .../fattn-vec-f32-instance-hs128-f16-f16.cu   |    0
 .../fattn-vec-f32-instance-hs128-f16-q4_0.cu  |    0
 .../fattn-vec-f32-instance-hs128-f16-q4_1.cu  |    0
 .../fattn-vec-f32-instance-hs128-f16-q5_0.cu  |    0
 .../fattn-vec-f32-instance-hs128-f16-q5_1.cu  |    0
 .../fattn-vec-f32-instance-hs128-f16-q8_0.cu  |    0
 .../fattn-vec-f32-instance-hs128-q4_0-f16.cu  |    0
 .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_1-f16.cu  |    0
 .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_0-f16.cu  |    0
 .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_1-f16.cu  |    0
 .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q8_0-f16.cu  |    0
 .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu |    0
 .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu |    0
 .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu |    0
 .../fattn-vec-f32-instance-hs256-f16-f16.cu   |    0
 .../fattn-vec-f32-instance-hs64-f16-f16.cu    |    0
 .../fattn-vec-f32-instance-hs64-f16-q4_0.cu   |    0
 .../fattn-vec-f32-instance-hs64-f16-q4_1.cu   |    0
 .../fattn-vec-f32-instance-hs64-f16-q5_0.cu   |    0
 .../fattn-vec-f32-instance-hs64-f16-q5_1.cu   |    0
 .../fattn-vec-f32-instance-hs64-f16-q8_0.cu   |    0
 .../fattn-wmma-f16-instance-kqfloat-cpb16.cu  |    0
 .../fattn-wmma-f16-instance-kqfloat-cpb32.cu  |    0
 .../fattn-wmma-f16-instance-kqhalf-cpb16.cu   |    0
 .../fattn-wmma-f16-instance-kqhalf-cpb32.cu   |    0
 .../fattn-wmma-f16-instance-kqhalf-cpb8.cu    |    0
 .../template-instances/generate_cu_files.py   |    0
 .../template-instances/mmq-instance-q2_k.cu   |    0
 .../template-instances/mmq-instance-q3_k.cu   |    0
 .../template-instances/mmq-instance-q4_0.cu   |    0
 .../template-instances/mmq-instance-q4_1.cu   |    0
 .../template-instances/mmq-instance-q4_k.cu   |    0
 .../template-instances/mmq-instance-q5_0.cu   |    0
 .../template-instances/mmq-instance-q5_1.cu   |    0
 .../template-instances/mmq-instance-q5_k.cu   |    0
 .../template-instances/mmq-instance-q6_k.cu   |    0
 .../template-instances/mmq-instance-q8_0.cu   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cu   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cuh  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/unary.cu    |    0
 {ggml-cuda => ggml/src/ggml-cuda}/unary.cuh   |    0
 {ggml-cuda => ggml/src/ggml-cuda}/upscale.cu  |    0
 {ggml-cuda => ggml/src/ggml-cuda}/upscale.cuh |    0
 {ggml-cuda => ggml/src/ggml-cuda}/vecdotq.cuh |    0
 ggml-impl.h => ggml/src/ggml-impl.h           |    0
 ggml-kompute.cpp => ggml/src/ggml-kompute.cpp |    0
 ggml-metal.m => ggml/src/ggml-metal.m         |    0
 ggml-metal.metal => ggml/src/ggml-metal.metal |    0
 ggml-quants.c => ggml/src/ggml-quants.c       |    0
 ggml-quants.h => ggml/src/ggml-quants.h       |    0
 ggml-rpc.cpp => ggml/src/ggml-rpc.cpp         |    0
 ggml-sycl.cpp => ggml/src/ggml-sycl.cpp       |    1 +
 {ggml-sycl => ggml/src/ggml-sycl}/backend.hpp |    0
 {ggml-sycl => ggml/src/ggml-sycl}/common.cpp  |    0
 {ggml-sycl => ggml/src/ggml-sycl}/common.hpp  |    1 +
 {ggml-sycl => ggml/src/ggml-sycl}/convert.cpp |    0
 {ggml-sycl => ggml/src/ggml-sycl}/convert.hpp |    0
 .../src/ggml-sycl}/dequantize.hpp             |    0
 {ggml-sycl => ggml/src/ggml-sycl}/dmmv.cpp    |    0
 {ggml-sycl => ggml/src/ggml-sycl}/dmmv.hpp    |    0
 .../src/ggml-sycl}/dpct/helper.hpp            |    0
 {ggml-sycl => ggml/src/ggml-sycl}/mmq.cpp     |    0
 {ggml-sycl => ggml/src/ggml-sycl}/mmq.hpp     |    0
 {ggml-sycl => ggml/src/ggml-sycl}/mmvq.cpp    |    0
 {ggml-sycl => ggml/src/ggml-sycl}/mmvq.hpp    |    0
 {ggml-sycl => ggml/src/ggml-sycl}/presets.hpp |    2 -
 {ggml-sycl => ggml/src/ggml-sycl}/vecdotq.hpp |    0
 .../src/ggml-vulkan-shaders.hpp               |    0
 ggml-vulkan.cpp => ggml/src/ggml-vulkan.cpp   |    0
 ggml.c => ggml/src/ggml.c                     |    0
 kompute => ggml/src/kompute                   |    0
 .../src/kompute-shaders}/common.comp          |    0
 .../src/kompute-shaders}/op_add.comp          |    0
 .../src/kompute-shaders}/op_addrow.comp       |    0
 .../src/kompute-shaders}/op_cpy_f16_f16.comp  |    0
 .../src/kompute-shaders}/op_cpy_f16_f32.comp  |    0
 .../src/kompute-shaders}/op_cpy_f32_f16.comp  |    0
 .../src/kompute-shaders}/op_cpy_f32_f32.comp  |    0
 .../src/kompute-shaders}/op_diagmask.comp     |    0
 .../src/kompute-shaders}/op_gelu.comp         |    0
 .../src/kompute-shaders}/op_getrows.comp      |    0
 .../src/kompute-shaders}/op_getrows_f16.comp  |    0
 .../src/kompute-shaders}/op_getrows_f32.comp  |    0
 .../src/kompute-shaders}/op_getrows_q4_0.comp |    0
 .../src/kompute-shaders}/op_getrows_q4_1.comp |    0
 .../src/kompute-shaders}/op_getrows_q6_k.comp |    0
 .../src/kompute-shaders}/op_mul.comp          |    0
 .../src/kompute-shaders}/op_mul_mat_f16.comp  |    0
 .../kompute-shaders}/op_mul_mat_mat_f32.comp  |    0
 .../src/kompute-shaders}/op_mul_mat_q4_0.comp |    0
 .../src/kompute-shaders}/op_mul_mat_q4_1.comp |    0
 .../src/kompute-shaders}/op_mul_mat_q6_k.comp |    0
 .../src/kompute-shaders}/op_mul_mat_q8_0.comp |    0
 .../src/kompute-shaders}/op_mul_mv_q_n.comp   |    0
 .../kompute-shaders}/op_mul_mv_q_n_pre.comp   |    0
 .../src/kompute-shaders}/op_norm.comp         |    0
 .../src/kompute-shaders}/op_relu.comp         |    0
 .../src/kompute-shaders}/op_rmsnorm.comp      |    0
 .../src/kompute-shaders}/op_rope_f16.comp     |    0
 .../src/kompute-shaders}/op_rope_f32.comp     |    0
 .../src/kompute-shaders}/op_scale.comp        |    0
 .../src/kompute-shaders}/op_scale_8.comp      |    0
 .../src/kompute-shaders}/op_silu.comp         |    0
 .../src/kompute-shaders}/op_softmax.comp      |    0
 .../src/kompute-shaders}/rope_common.comp     |    0
 sgemm.cpp => ggml/src/sgemm.cpp               |    0
 sgemm.h => ggml/src/sgemm.h                   |    0
 .../src/vulkan-shaders}/add.comp              |    0
 .../src/vulkan-shaders}/argsort.comp          |    0
 .../src/vulkan-shaders}/clamp.comp            |    0
 .../src/vulkan-shaders}/copy.comp             |    0
 .../src/vulkan-shaders}/dequant_f32.comp      |    0
 .../src/vulkan-shaders}/dequant_funcs.comp    |    0
 .../src/vulkan-shaders}/dequant_head.comp     |    0
 .../src/vulkan-shaders}/dequant_q2_k.comp     |    0
 .../src/vulkan-shaders}/dequant_q3_k.comp     |    0
 .../src/vulkan-shaders}/dequant_q4_0.comp     |    0
 .../src/vulkan-shaders}/dequant_q4_1.comp     |    0
 .../src/vulkan-shaders}/dequant_q4_k.comp     |    0
 .../src/vulkan-shaders}/dequant_q5_0.comp     |    0
 .../src/vulkan-shaders}/dequant_q5_1.comp     |    0
 .../src/vulkan-shaders}/dequant_q5_k.comp     |    0
 .../src/vulkan-shaders}/dequant_q6_k.comp     |    0
 .../src/vulkan-shaders}/dequant_q8_0.comp     |    0
 .../src/vulkan-shaders}/diag_mask_inf.comp    |    0
 .../src/vulkan-shaders}/div.comp              |    0
 .../src/vulkan-shaders}/gelu.comp             |    0
 .../vulkan-shaders}/generic_binary_head.comp  |    0
 .../src/vulkan-shaders}/generic_head.comp     |    0
 .../vulkan-shaders}/generic_unary_head.comp   |    0
 .../src/vulkan-shaders}/get_rows.comp         |    0
 .../src/vulkan-shaders}/get_rows_quant.comp   |    0
 .../src/vulkan-shaders}/mul.comp              |    0
 .../mul_mat_split_k_reduce.comp               |    0
 .../src/vulkan-shaders}/mul_mat_vec.comp      |    0
 .../src/vulkan-shaders}/mul_mat_vec_base.comp |    0
 .../src/vulkan-shaders}/mul_mat_vec_nc.comp   |    0
 .../src/vulkan-shaders}/mul_mat_vec_p021.comp |    0
 .../src/vulkan-shaders}/mul_mat_vec_q2_k.comp |    0
 .../src/vulkan-shaders}/mul_mat_vec_q3_k.comp |    0
 .../src/vulkan-shaders}/mul_mat_vec_q4_k.comp |    0
 .../src/vulkan-shaders}/mul_mat_vec_q5_k.comp |    0
 .../src/vulkan-shaders}/mul_mat_vec_q6_k.comp |    0
 .../src/vulkan-shaders}/mul_mm.comp           |    0
 .../src/vulkan-shaders}/norm.comp             |    0
 .../src/vulkan-shaders}/relu.comp             |    0
 .../src/vulkan-shaders}/rms_norm.comp         |    0
 .../src/vulkan-shaders}/rope_head.comp        |    0
 .../src/vulkan-shaders}/rope_neox.comp        |    0
 .../src/vulkan-shaders}/rope_norm.comp        |    0
 .../src/vulkan-shaders}/scale.comp            |    0
 .../src/vulkan-shaders}/silu.comp             |    0
 .../src/vulkan-shaders}/soft_max.comp         |    0
 .../src/vulkan-shaders}/square.comp           |    0
 .../src/vulkan-shaders}/sum_rows.comp         |    0
 .../src/vulkan-shaders}/types.comp            |    0
 llama.h => include/llama.h                    |    0
 scripts/build-info.sh                         |   10 +-
 scripts/compare-commits.sh                    |    2 +-
 scripts/debug-test.sh                         |    2 +-
 scripts/pod-llama.sh                          |   16 +-
 scripts/server-llm.sh                         |    2 +-
 scripts/sync-ggml-am.sh                       |  113 +-
 scripts/sync-ggml.sh                          |   68 +-
 spm-headers/ggml-alloc.h                      |    2 +-
 spm-headers/ggml-backend.h                    |    2 +-
 spm-headers/ggml-metal.h                      |    1 +
 spm-headers/ggml.h                            |    2 +-
 spm-headers/llama.h                           |    2 +-
 src/CMakeLists.txt                            |   32 +
 llama.cpp => src/llama.cpp                    |    0
 unicode-data.cpp => src/unicode-data.cpp      |    0
 unicode-data.h => src/unicode-data.h          |    0
 unicode.cpp => src/unicode.cpp                |    0
 unicode.h => src/unicode.h                    |    0
 tests/test-backend-ops.cpp                    |    1 -
 345 files changed, 2555 insertions(+), 1937 deletions(-)
 rename {scripts => cmake}/build-info.cmake (100%)
 create mode 100644 cmake/git-vars.cmake
 rename scripts/LlamaConfig.cmake.in => cmake/llama-config.cmake.in (73%)
 rename scripts/gen-build-info-cpp.cmake => common/cmake/build-info-gen-cpp.cmake (86%)
 create mode 100644 ggml/CMakeLists.txt
 rename {cmake => ggml/cmake}/FindSIMD.cmake (94%)
 rename ggml_vk_generate_shaders.py => ggml/ggml_vk_generate_shaders.py (100%)
 rename ggml-alloc.h => ggml/include/ggml-alloc.h (100%)
 rename ggml-backend.h => ggml/include/ggml-backend.h (100%)
 rename ggml-blas.h => ggml/include/ggml-blas.h (100%)
 rename ggml-cuda.h => ggml/include/ggml-cuda.h (100%)
 rename ggml-kompute.h => ggml/include/ggml-kompute.h (100%)
 rename ggml-metal.h => ggml/include/ggml-metal.h (100%)
 rename ggml-rpc.h => ggml/include/ggml-rpc.h (100%)
 rename ggml-sycl.h => ggml/include/ggml-sycl.h (95%)
 rename ggml-vulkan.h => ggml/include/ggml-vulkan.h (100%)
 rename ggml.h => ggml/include/ggml.h (100%)
 create mode 100644 ggml/src/CMakeLists.txt
 rename ggml-alloc.c => ggml/src/ggml-alloc.c (100%)
 rename ggml-backend-impl.h => ggml/src/ggml-backend-impl.h (100%)
 rename ggml-backend.c => ggml/src/ggml-backend.c (100%)
 rename ggml-blas.cpp => ggml/src/ggml-blas.cpp (100%)
 rename ggml-common.h => ggml/src/ggml-common.h (100%)
 rename ggml-cuda.cu => ggml/src/ggml-cuda.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/acc.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/acc.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/arange.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/arange.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/argsort.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/argsort.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/binbcast.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/binbcast.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/clamp.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/clamp.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/common.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/concat.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/concat.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/convert.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/convert.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/cpy.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/cpy.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/dequantize.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/diagmask.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/diagmask.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/dmmv.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-common.cuh (99%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f16.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f32.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-tile-f32.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-vec-f16.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-vec-f32.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn-wmma-f16.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/fattn.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/getrows.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/getrows.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/im2col.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/im2col.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/mma.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/mmq.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/mmq.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/mmvq.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/norm.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/norm.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/pad.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/pad.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/pool2d.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/quantize.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/quantize.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/rope.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/rope.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/scale.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/scale.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/softmax.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/softmax.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/sumrows.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/generate_cu_files.py (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q2_k.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q3_k.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q4_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q4_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q4_k.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q5_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q5_1.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q5_k.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q6_k.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/template-instances/mmq-instance-q8_0.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/tsembd.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/unary.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/unary.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/upscale.cu (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/upscale.cuh (100%)
 rename {ggml-cuda => ggml/src/ggml-cuda}/vecdotq.cuh (100%)
 rename ggml-impl.h => ggml/src/ggml-impl.h (100%)
 rename ggml-kompute.cpp => ggml/src/ggml-kompute.cpp (100%)
 rename ggml-metal.m => ggml/src/ggml-metal.m (100%)
 rename ggml-metal.metal => ggml/src/ggml-metal.metal (100%)
 rename ggml-quants.c => ggml/src/ggml-quants.c (100%)
 rename ggml-quants.h => ggml/src/ggml-quants.h (100%)
 rename ggml-rpc.cpp => ggml/src/ggml-rpc.cpp (100%)
 rename ggml-sycl.cpp => ggml/src/ggml-sycl.cpp (99%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/backend.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/common.cpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/common.hpp (99%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/convert.cpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/convert.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/dequantize.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/dmmv.cpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/dmmv.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/dpct/helper.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/mmq.cpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/mmq.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/mmvq.cpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/mmvq.hpp (100%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/presets.hpp (96%)
 rename {ggml-sycl => ggml/src/ggml-sycl}/vecdotq.hpp (100%)
 rename ggml-vulkan-shaders.hpp => ggml/src/ggml-vulkan-shaders.hpp (100%)
 rename ggml-vulkan.cpp => ggml/src/ggml-vulkan.cpp (100%)
 rename ggml.c => ggml/src/ggml.c (100%)
 rename kompute => ggml/src/kompute (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/common.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_add.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_addrow.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f16_f16.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f16_f32.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f32_f16.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_cpy_f32_f32.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_diagmask.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_gelu.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_f16.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_f32.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_q4_0.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_q4_1.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_getrows_q6_k.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_f16.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_mat_f32.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q4_0.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q4_1.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q6_k.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mat_q8_0.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mv_q_n.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_mul_mv_q_n_pre.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_norm.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_relu.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_rmsnorm.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_rope_f16.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_rope_f32.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_scale.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_scale_8.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_silu.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/op_softmax.comp (100%)
 rename {kompute-shaders => ggml/src/kompute-shaders}/rope_common.comp (100%)
 rename sgemm.cpp => ggml/src/sgemm.cpp (100%)
 rename sgemm.h => ggml/src/sgemm.h (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/add.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/argsort.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/clamp.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/copy.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_f32.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_funcs.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_head.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q2_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q3_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q4_0.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q4_1.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q4_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q5_0.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q5_1.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q5_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q6_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/dequant_q8_0.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/diag_mask_inf.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/div.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/gelu.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/generic_binary_head.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/generic_head.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/generic_unary_head.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/get_rows.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/get_rows_quant.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_split_k_reduce.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_base.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_nc.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_p021.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q2_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q3_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q4_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q5_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mat_vec_q6_k.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/mul_mm.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/norm.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/relu.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/rms_norm.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/rope_head.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/rope_neox.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/rope_norm.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/scale.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/silu.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/soft_max.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/square.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/sum_rows.comp (100%)
 rename {vulkan-shaders => ggml/src/vulkan-shaders}/types.comp (100%)
 rename llama.h => include/llama.h (100%)
 create mode 120000 spm-headers/ggml-metal.h
 create mode 100644 src/CMakeLists.txt
 rename llama.cpp => src/llama.cpp (100%)
 rename unicode-data.cpp => src/unicode-data.cpp (100%)
 rename unicode-data.h => src/unicode-data.h (100%)
 rename unicode.cpp => src/unicode.cpp (100%)
 rename unicode.h => src/unicode.h (100%)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 87bb3a20f..4ee0d62cb 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -160,9 +160,9 @@ effectiveStdenv.mkDerivation (
     };
 
     postPatch = ''
-      substituteInPlace ./ggml-metal.m \
+      substituteInPlace ./ggml/src/ggml-metal.m \
         --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml-metal.m \
+      substituteInPlace ./ggml/src/ggml-metal.m \
         --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
     '';
 
@@ -205,17 +205,17 @@ effectiveStdenv.mkDerivation (
 
     cmakeFlags =
       [
-        (cmakeBool "LLAMA_NATIVE" false)
         (cmakeBool "LLAMA_BUILD_SERVER" true)
         (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
         (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUDA" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_VULKAN" useVulkan)
-        (cmakeBool "LLAMA_STATIC" enableStatic)
+        (cmakeBool "GGML_NATIVE" false)
+        (cmakeBool "GGML_BLAS" useBlas)
+        (cmakeBool "GGML_CLBLAST" useOpenCL)
+        (cmakeBool "GGML_CUDA" useCuda)
+        (cmakeBool "GGML_HIPBLAS" useRocm)
+        (cmakeBool "GGML_METAL" useMetalKit)
+        (cmakeBool "GGML_VULKAN" useVulkan)
+        (cmakeBool "GGML_STATIC" enableStatic)
       ]
       ++ optionals useCuda [
         (
@@ -231,7 +231,7 @@ effectiveStdenv.mkDerivation (
       ]
       ++ optionals useMetalKit [
         (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
       ];
 
     # Environment variables needed for ROCm
@@ -244,7 +244,7 @@ effectiveStdenv.mkDerivation (
     # if they haven't been added yet.
     postInstall = ''
       mkdir -p $out/include
-      cp $src/llama.h $out/include/
+      cp $src/include/llama.h $out/include/
     '';
 
     # Define the shells here, but don't add in the inputsFrom to avoid recursion.
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 5c12bab73..9c0397d16 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -2,31 +2,31 @@
 Kompute:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml-kompute.h
-            - ggml-kompute.cpp
+            - ggml/include/ggml-kompute.h
+            - ggml/src/ggml-kompute.cpp
             - README-kompute.md
 Apple Metal:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml-metal.h
-            - ggml-metal.cpp
+            - ggml/include/ggml-metal.h
+            - ggml/src/ggml-metal.cpp
             - README-metal.md
 SYCL:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml-sycl.h
-            - ggml-sycl.cpp
+            - ggml/include/ggml-sycl.h
+            - ggml/src/ggml-sycl.cpp
             - README-sycl.md
 Nvidia GPU:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml-cuda.h
-            - ggml-cuda/**
+            - ggml/include/ggml-cuda.h
+            - ggml/src/ggml-cuda/**
 Vulkan:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml_vk_generate_shaders.py
-            - ggml-vulkan*
+            - ggml/ggml_vk_generate_shaders.py
+            - ggml/src/ggml-vulkan*
 documentation:
     - changed-files:
         - any-glob-to-any-file:
@@ -73,10 +73,10 @@ server:
 ggml:
     - changed-files:
         - any-glob-to-any-file:
-            - ggml.c
-            - ggml.h
-            - ggml-*.c
-            - ggml-*.h
+            - ggml/include/ggml*.h
+            - ggml/src/ggml*.c
+            - ggml/src/ggml*.cpp
+            - ggml/src/ggml*.h
             - ggml-cuda/**
 nix:
     - changed-files:
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 88ab4844e..eb69b82c4 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -109,7 +109,7 @@ jobs:
         run: |
           set -eux
           cmake -B build \
-              -DLLAMA_NATIVE=OFF \
+              -DGGML_NATIVE=OFF \
               -DLLAMA_BUILD_SERVER=ON \
               -DLLAMA_CURL=ON \
               -DLLAMA_CUBLAS=ON \
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a8fcae043..0d91fc4e4 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -305,7 +305,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
       - name: Test
@@ -335,7 +335,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake -DLLAMA_RPC=ON ..
+          cmake -DGGML_RPC=ON ..
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
@@ -363,7 +363,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake -DLLAMA_VULKAN=ON ..
+          cmake -DGGML_VULKAN=ON ..
           cmake --build . --config Release -j $(nproc)
 
   ubuntu-22-cmake-hip:
@@ -384,13 +384,13 @@ jobs:
       - name: Build with native CMake HIP support
         id: cmake_build
         run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
           cmake --build build --config Release -j $(nproc)
 
       - name: Build with legacy HIP support
         id: cmake_build_legacy_hip
         run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
           cmake --build build2 --config Release -j $(nproc)
 
   ubuntu-22-cmake-sycl:
@@ -431,7 +431,7 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           mkdir build
           cd build
-          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
           cmake --build . --config Release -j $(nproc)
 
   ubuntu-22-cmake-sycl-fp16:
@@ -472,10 +472,10 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           mkdir build
           cd build
-          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
           cmake --build . --config Release -j $(nproc)
 
-  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   #       how to debug it.
   #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
   macOS-latest-make:
@@ -497,15 +497,15 @@ jobs:
         env:
             LLAMA_FATAL_WARNINGS: 1
         run: |
-          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
         id: make_test
         run: |
-          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
+          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+          GGML_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
 
-  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   #       how to debug it.
   #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
   #       would be great if we fix these
@@ -529,7 +529,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -559,13 +559,14 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
-            -DLLAMA_METAL_EMBED_LIBRARY=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
             -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
   macOS-latest-cmake-tvos:
     runs-on: macos-latest
@@ -588,13 +589,14 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
-            -DLLAMA_METAL_EMBED_LIBRARY=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
             -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
   macOS-latest-swift:
     runs-on: macos-latest
@@ -662,7 +664,7 @@ jobs:
       - name: Build using make w/ OpenBLAS
         shell: msys2 {0}
         run: |
-            make LLAMA_OPENBLAS=1 -j $(nproc)
+            make GGML_OPENBLAS=1 -j $(nproc)
 
       - name: Build using CMake
         shell: msys2 {0}
@@ -678,7 +680,7 @@ jobs:
       - name: Build using CMake w/ OpenBLAS
         shell: msys2 {0}
         run: |
-            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
             cmake --build build --config ${{ matrix.build }} -j $(nproc)
 
   windows-latest-cmake:
@@ -693,25 +695,25 @@ jobs:
       matrix:
         include:
           - build: 'rpc-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'noavx-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'openblas-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'kompute-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'vulkan-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 
     steps:
       - name: Clone
@@ -724,7 +726,7 @@ jobs:
         id: clone_kompute
         if: ${{ matrix.build == 'kompute-x64' }}
         run: |
-          git submodule update --init kompute
+          git submodule update --init ggml/src/kompute
 
       - name: Download OpenBLAS
         id: get_openblas
@@ -854,7 +856,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
           cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
       - name: Determine tag name
@@ -987,7 +989,7 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
           cmake --build build --config Release
 
   ios-xcode-build:
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 311abf02a..99feb28f2 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -92,12 +92,12 @@ jobs:
         if: ${{ matrix.sanitizer == 'THREAD' }}
         run: |
           cmake -B build \
-              -DLLAMA_NATIVE=OFF \
+              -DGGML_NATIVE=OFF \
               -DLLAMA_BUILD_SERVER=ON \
               -DLLAMA_CURL=ON \
               -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
               -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DLLAMA_OPENMP=OFF ;
+              -DGGML_OPENMP=OFF ;
           cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
       - name: Build
@@ -105,7 +105,7 @@ jobs:
         if: ${{ matrix.sanitizer != 'THREAD' }}
         run: |
           cmake -B build \
-              -DLLAMA_NATIVE=OFF \
+              -DGGML_NATIVE=OFF \
               -DLLAMA_BUILD_SERVER=ON \
               -DLLAMA_CURL=ON \
               -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
diff --git a/.gitignore b/.gitignore
index a0c16e880..177e6a8db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,6 +56,7 @@ CMakeSettings.json
 compile_commands.json
 ggml-metal-embed.metal
 llama-batched-swift
+/rpc-server
 out/
 tmp/
 
diff --git a/.gitmodules b/.gitmodules
index b7e8b8ff2..5861d59cb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = kompute
+	path = ggml/src/kompute
 	url = https://github.com/nomic-ai/kompute.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1acf4bb08..18297834e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)
 
+#set(CMAKE_WARN_DEPRECATED YES)
+set(CMAKE_WARN_UNUSED_CLI YES)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -9,11 +12,16 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+# Add path to modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     set(LLAMA_STANDALONE ON)
 
+    include(git-vars)
+
     # configure project version
     # TODO
 else()
@@ -32,1289 +40,72 @@ else()
     endif()
 endif()
 
+option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 
 #
-# Option list
+# option list
 #
 
-if (APPLE)
-    set(LLAMA_METAL_DEFAULT ON)
-    set(LLAMA_BLAS_DEFAULT ON)
-    set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
-else()
-    set(LLAMA_METAL_DEFAULT OFF)
-    set(LLAMA_BLAS_DEFAULT OFF)
-    set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
-endif()
-
-set(LLAMA_LLAMAFILE_DEFAULT ON)
-
 # general
-option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
-option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
-option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
-option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)
+option(LLAMA_CCACHE "llama: use ccache if available" ON)
 
 # debug
-option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY     "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF                      "llama: enable gprof"                                   OFF)
+option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
 
 # build
-option(LLAMA_FATAL_WARNINGS             "llama: enable -Werror flag"                            OFF)
+option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
 
 # sanitizers
-option(LLAMA_SANITIZE_THREAD            "llama: enable thread sanitizer"                        OFF)
-option(LLAMA_SANITIZE_ADDRESS           "llama: enable address sanitizer"                       OFF)
-option(LLAMA_SANITIZE_UNDEFINED         "llama: enable undefined sanitizer"                     OFF)
+option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
+option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
-# instruction set specific
-if (LLAMA_NATIVE)
-    set(INS_ENB OFF)
-else()
-    set(INS_ENB ON)
-endif()
-
-option(LLAMA_SVE                             "llama: enable SVE"                                OFF)
-option(LLAMA_AVX                             "llama: enable AVX"                                ${INS_ENB})
-option(LLAMA_AVX2                            "llama: enable AVX2"                               ${INS_ENB})
-option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
-option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
-option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
-option(LLAMA_AVX512_BF16                     "llama: enable AVX512-BF16"                        OFF)
-option(LLAMA_FMA                             "llama: enable FMA"                                ${INS_ENB})
-# in MSVC F16C is implied with AVX2/AVX512
-if (NOT MSVC)
-    option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
-endif()
-
-if (WIN32)
-    set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
-endif()
+# extra artifacts
+option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
 
 # 3rd party libs
-option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
-option(LLAMA_BLAS                            "llama: use BLAS"                                  ${LLAMA_BLAS_DEFAULT})
-set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
-                                             "llama: BLAS library vendor")
-option(LLAMA_LLAMAFILE                       "llama: use llamafile SGEMM"                       ${LLAMA_LLAMAFILE_DEFAULT})
-option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
-option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
-option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
-option(LLAMA_CUDA_FORCE_MMQ                  "llama: always use mmq kernels instead of cuBLAS"  OFF)
-option(LLAMA_CUDA_FORCE_CUBLAS               "llama: always use cuBLAS instead of mmq kernels"  OFF)
-set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
-set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
-set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                             "llama: max. batch size for using peer access")
-option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
-option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
-option(LLAMA_CUDA_FA_ALL_QUANTS              "llama: compile all quants for FlashAttention"     OFF)
-
-option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
-option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
-option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
-option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
-option(LLAMA_VULKAN_CHECK_RESULTS            "llama: run Vulkan op checks"                      OFF)
-option(LLAMA_VULKAN_DEBUG                    "llama: enable Vulkan debug output"                OFF)
-option(LLAMA_VULKAN_MEMORY_DEBUG             "llama: enable Vulkan memory debug output"         OFF)
-option(LLAMA_VULKAN_VALIDATE                 "llama: enable Vulkan validation"                  OFF)
-option(LLAMA_VULKAN_RUN_TESTS                "llama: run Vulkan tests"                          OFF)
-option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
-option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
-option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
-option(LLAMA_METAL_EMBED_LIBRARY             "llama: embed Metal library"                       OFF)
-set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-                                             "llama: metal minimum macOS version")
-set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
-option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
-option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
-option(LLAMA_OPENMP                          "llama: use OpenMP"                                ON)
-option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
-option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
-set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
-option(LLAMA_CPU_HBM                         "llama: use memkind for CPU HBM"                   OFF)
-set(LLAMA_SCHED_MAX_COPIES  "4" CACHE STRING "llama: max input copies for pipeline parallelism")
-
-option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
-option(LLAMA_LASX                            "llama: enable lasx"                               ON)
-option(LLAMA_LSX                             "llama: enable lsx"                                ON)
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 
 # Required for relocatable CMake package
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
-#
-# Compile flags
-#
+# override ggml options
+set(GGML_CCACHE             ${LLAMA_CCACHE})
+set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
+set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
+set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
+set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
+set(GGML_LLAMAFILE          ON)
 
-if (LLAMA_SYCL)
-    set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
-endif()
-
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(CMAKE_C_STANDARD_REQUIRED true)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-
-find_package(Threads REQUIRED)
-include(CheckCXXCompilerFlag)
-
-add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
-
-# enable libstdc++ assertions for debug builds
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
-endif()
-
-if (NOT MSVC)
-    if (LLAMA_SANITIZE_THREAD)
-        add_compile_options(-fsanitize=thread)
-        link_libraries     (-fsanitize=thread)
+# transition helpers
+function (llama_option_depr TYPE OLD NEW)
+    if (${OLD})
+        message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
+        set(${NEW} ON)
     endif()
-
-    if (LLAMA_SANITIZE_ADDRESS)
-        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries     (-fsanitize=address)
-    endif()
-
-    if (LLAMA_SANITIZE_UNDEFINED)
-        add_compile_options(-fsanitize=undefined)
-        link_libraries     (-fsanitize=undefined)
-    endif()
-endif()
-
-if (APPLE AND LLAMA_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-
-    message(STATUS "Metal framework found")
-    set(GGML_HEADERS_METAL ggml-metal.h)
-    set(GGML_SOURCES_METAL ggml-metal.m)
-
-    add_compile_definitions(GGML_USE_METAL)
-    if (LLAMA_METAL_NDEBUG)
-        add_compile_definitions(GGML_METAL_NDEBUG)
-    endif()
-
-    # copy ggml-common.h and ggml-metal.metal to bin directory
-    configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-    if (LLAMA_METAL_EMBED_LIBRARY)
-        enable_language(ASM)
-        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
-
-        set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
-        set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-
-        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-
-        # merge ggml-common.h and ggml-metal.metal into a single file
-        set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
-        set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
-
-        add_custom_command(
-            OUTPUT ${METALLIB_EMBED_ASM}
-            COMMAND echo "Embedding Metal library"
-            COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
-            COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
-            COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
-            COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
-            COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
-            COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
-            DEPENDS ggml-metal.metal ggml-common.h
-            COMMENT "Generate assembly for embedded Metal library"
-        )
-
-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
-    else()
-        if (LLAMA_METAL_SHADER_DEBUG)
-            # custom command to do the following:
-            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-            #
-            # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-            #       disabling fast math is needed in order to pass tests/test-backend-ops
-            # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-            # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-            #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-            set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        else()
-            set(XC_FLAGS -O3)
-        endif()
-
-        # Append macOS metal versioning flags
-        if (LLAMA_METAL_MACOSX_VERSION_MIN)
-            message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
-            list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
-        endif()
-        if (LLAMA_METAL_STD)
-            message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
-            list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
-        endif()
-
-        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
-            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
-            DEPENDS ggml-metal.metal ggml-common.h
-            COMMENT "Compiling Metal kernels"
-            )
-
-        add_custom_target(
-            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            )
-    endif() # LLAMA_METAL_EMBED_LIBRARY
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        )
-endif()
-
-if (LLAMA_OPENMP)
-    find_package(OpenMP)
-    if (OpenMP_FOUND)
-        message(STATUS "OpenMP found")
-        add_compile_definitions(GGML_USE_OPENMP)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    else()
-        message(WARNING "OpenMP not found")
-    endif()
-endif()
-
-if (LLAMA_BLAS)
-    if (LLAMA_STATIC)
-        set(BLA_STATIC ON)
-    endif()
-    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-    #    set(BLA_SIZEOF_INTEGER 8)
-    #endif()
-
-    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
-    find_package(BLAS)
-
-    if (BLAS_FOUND)
-        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
-
-        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
-            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
-            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
-            find_package(PkgConfig REQUIRED)
-            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
-                pkg_check_modules(DepBLAS REQUIRED blas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
-                pkg_check_modules(DepBLAS REQUIRED blis)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
-                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
-                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
-                # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
-                # this doesn't provide pkg-config
-                # suggest to assign BLAS_INCLUDE_DIRS on your own
-                if ("${NVHPC_VERSION}" STREQUAL "")
-                    message(WARNING "Better to set NVHPC_VERSION")
-                else()
-                    set(DepBLAS_FOUND ON)
-                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
-                endif()
-            endif()
-            if (DepBLAS_FOUND)
-                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
-            else()
-                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
-                " detected by pkgconfig, trying to find cblas.h from possible paths...")
-                find_path(BLAS_INCLUDE_DIRS
-                    NAMES cblas.h
-                    HINTS
-                        /usr/include
-                        /usr/local/include
-                        /usr/include/openblas
-                        /opt/homebrew/opt/openblas/include
-                        /usr/local/opt/openblas/include
-                        /usr/include/x86_64-linux-gnu/openblas/include
-                )
-            endif()
-        endif()
-
-        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
-
-        add_compile_options(${BLAS_LINKER_FLAGS})
-
-        add_compile_definitions(GGML_USE_BLAS)
-
-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
-            add_compile_definitions(GGML_BLAS_USE_MKL)
-        endif()
-
-        set(GGML_HEADERS_BLAS ggml-blas.h)
-        set(GGML_SOURCES_BLAS ggml-blas.cpp)
-
-        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
-    else()
-        message(WARNING "BLAS not found, please refer to "
-        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-        " to set correct LLAMA_BLAS_VENDOR")
-    endif()
-endif()
-
-if (LLAMA_LLAMAFILE)
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    set(GGML_HEADERS_LLAMAFILE sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
-endif()
-
-if (LLAMA_CUBLAS)
-    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
-    set(LLAMA_CUDA ON)
-endif()
-
-if (LLAMA_CUDA)
-    cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
-
-    find_package(CUDAToolkit)
-    if (CUDAToolkit_FOUND)
-        message(STATUS "CUDA found")
-
-        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-            # 52 == lowest CUDA 12 standard
-            # 60 == FP16 CUDA intrinsics
-            # 61 == integer CUDA intrinsics
-            # 70 == FP16 tensor cores
-            # 75 == int8 tensor cores
-            if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-                set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
-            else()
-                set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
-                #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
-            endif()
-        endif()
-        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-        enable_language(CUDA)
-
-        set(GGML_HEADERS_CUDA ggml-cuda.h)
-
-        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-
-        add_compile_definitions(GGML_USE_CUDA)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
-        if (LLAMA_CUDA_FORCE_DMMV)
-            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-        endif()
-        if (LLAMA_CUDA_FORCE_CUBLAS)
-            add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
-        endif()
-        if (LLAMA_CUDA_NO_VMM)
-            add_compile_definitions(GGML_CUDA_NO_VMM)
-        endif()
-        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        if (DEFINED LLAMA_CUDA_DMMV_Y)
-            add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
-        endif()
-        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-            add_compile_definitions(GGML_CUDA_F16)
-        endif()
-        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
-        if (LLAMA_CUDA_NO_PEER_COPY)
-            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-        endif()
-        if (LLAMA_CUDA_FA_ALL_QUANTS)
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-        else()
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        endif()
-
-        if (LLAMA_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
-        else()
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
-        endif()
-
-        if (LLAMA_CUDA_NO_VMM)
-            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
-        else()
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
-        endif()
-    else()
-        message(WARNING "CUDA not found")
-    endif()
-endif()
-
-if (LLAMA_RPC)
-    add_compile_definitions(GGML_USE_RPC)
-
-    if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
-    endif()
-
-    set(GGML_HEADERS_RPC ggml-rpc.h)
-    set(GGML_SOURCES_RPC ggml-rpc.cpp)
-endif()
-
-if (LLAMA_VULKAN)
-    find_package(Vulkan)
-    if (Vulkan_FOUND)
-        message(STATUS "Vulkan found")
-
-        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
-        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
-
-        add_compile_definitions(GGML_USE_VULKAN)
-
-        # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
-        # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
-        if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-            add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
-        endif()
-
-        if (LLAMA_VULKAN_CHECK_RESULTS)
-            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
-        endif()
-
-        if (LLAMA_VULKAN_DEBUG)
-            add_compile_definitions(GGML_VULKAN_DEBUG)
-        endif()
-
-        if (LLAMA_VULKAN_MEMORY_DEBUG)
-            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
-        endif()
-
-        if (LLAMA_VULKAN_VALIDATE)
-            add_compile_definitions(GGML_VULKAN_VALIDATE)
-        endif()
-
-        if (LLAMA_VULKAN_RUN_TESTS)
-            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
-    else()
-        message(WARNING "Vulkan not found")
-    endif()
-endif()
-
-if (LLAMA_HIPBLAS)
-    if (NOT EXISTS $ENV{ROCM_PATH})
-        if (NOT EXISTS /opt/rocm)
-            set(ROCM_PATH /usr)
-        else()
-            set(ROCM_PATH /opt/rocm)
-        endif()
-    else()
-        set(ROCM_PATH $ENV{ROCM_PATH})
-    endif()
-    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
-    list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
-
-    # CMake on Windows doesn't support the HIP language yet
-    if(WIN32)
-        set(CXX_IS_HIPCC TRUE)
-    else()
-        string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
-    endif()
-
-    if(CXX_IS_HIPCC)
-        if(LINUX)
-            if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-                message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-            endif()
-
-            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
-                    " Prefer setting the HIP compiler directly. See README for details.")
-        endif()
-    else()
-        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
-        if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
-        endif()
-        cmake_minimum_required(VERSION 3.21)
-        enable_language(HIP)
-    endif()
-    find_package(hip     REQUIRED)
-    find_package(hipblas REQUIRED)
-    find_package(rocblas REQUIRED)
-
-    message(STATUS "HIP and hipBLAS found")
-
-    set(GGML_HEADERS_ROCM ggml-cuda.h)
-
-    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
-    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-    file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})
-
-    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
-
-    if (LLAMA_HIP_UMA)
-        add_compile_definitions(GGML_HIP_UMA)
-    endif()
-
-    if (LLAMA_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
-    if (LLAMA_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
-
-    if (LLAMA_CUDA_NO_PEER_COPY)
-        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-    endif()
-
-    if (LLAMA_CUDA_FA_ALL_QUANTS)
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    endif()
-
-    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-
-    if (CXX_IS_HIPCC)
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
-    else()
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
-    endif()
-
-    if (LLAMA_STATIC)
-        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-    endif()
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
-endif()
-
-if (LLAMA_SYCL)
-    if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
-        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
-    endif()
-
-    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
-    endif()
-    #todo: AOT
-
-    find_package(IntelSYCL REQUIRED)
-    find_package(MKL REQUIRED)
-
-    message(STATUS "SYCL found")
-
-    add_compile_definitions(GGML_USE_SYCL)
-
-    if (LLAMA_SYCL_F16)
-        add_compile_definitions(GGML_SYCL_F16)
-    endif()
-
-    if (LLAMA_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
-    endif()
-
-    add_compile_options(-I./) #include DPCT
-
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-    endif()
-
-    set(GGML_HEADERS_SYCL ggml-sycl.h)
-    file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
-    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
-
-    if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
-    else()
-        add_compile_options(-I/${SYCL_INCLUDE_DIR})
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
-        if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-        elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl)
-        endif()
-    endif()
-endif()
-
-if (LLAMA_KOMPUTE)
-    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
-    find_package(Vulkan COMPONENTS glslc REQUIRED)
-    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
-    if (NOT glslc_executable)
-        message(FATAL_ERROR "glslc not found")
-    endif()
-
-    function(compile_shader)
-        set(options)
-        set(oneValueArgs)
-        set(multiValueArgs SOURCES)
-        cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        foreach(source ${compile_shader_SOURCES})
-            get_filename_component(filename ${source} NAME)
-            set(spv_file ${filename}.spv)
-            add_custom_command(
-                OUTPUT ${spv_file}
-                DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
-                COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-                COMMENT "Compiling ${source} to ${spv_file}"
-                )
-
-            get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
-            set(FILE_NAME "shader${RAW_FILE_NAME}")
-            string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
-            string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
-            string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
-            set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
-            message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-            if(CMAKE_GENERATOR MATCHES "Visual Studio")
-                add_custom_command(
-                    OUTPUT ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    DEPENDS ${spv_file} xxd
-                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
-                    )
-            else()
-                add_custom_command(
-                    OUTPUT ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    DEPENDS ${spv_file} xxd
-                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
-                    )
-            endif()
-        endforeach()
-    endfunction()
-
-    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
-        message(STATUS "Kompute found")
-        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
-        add_subdirectory(kompute)
-
-        # Compile our shaders
-        compile_shader(SOURCES
-            kompute-shaders/op_scale.comp
-            kompute-shaders/op_scale_8.comp
-            kompute-shaders/op_add.comp
-            kompute-shaders/op_addrow.comp
-            kompute-shaders/op_mul.comp
-            kompute-shaders/op_silu.comp
-            kompute-shaders/op_relu.comp
-            kompute-shaders/op_gelu.comp
-            kompute-shaders/op_softmax.comp
-            kompute-shaders/op_norm.comp
-            kompute-shaders/op_rmsnorm.comp
-            kompute-shaders/op_diagmask.comp
-            kompute-shaders/op_mul_mat_mat_f32.comp
-            kompute-shaders/op_mul_mat_f16.comp
-            kompute-shaders/op_mul_mat_q8_0.comp
-            kompute-shaders/op_mul_mat_q4_0.comp
-            kompute-shaders/op_mul_mat_q4_1.comp
-            kompute-shaders/op_mul_mat_q6_k.comp
-            kompute-shaders/op_getrows_f32.comp
-            kompute-shaders/op_getrows_f16.comp
-            kompute-shaders/op_getrows_q4_0.comp
-            kompute-shaders/op_getrows_q4_1.comp
-            kompute-shaders/op_getrows_q6_k.comp
-            kompute-shaders/op_rope_f16.comp
-            kompute-shaders/op_rope_f32.comp
-            kompute-shaders/op_cpy_f16_f16.comp
-            kompute-shaders/op_cpy_f16_f32.comp
-            kompute-shaders/op_cpy_f32_f16.comp
-            kompute-shaders/op_cpy_f32_f32.comp
-        )
-
-        # Create a custom target for our generated shaders
-        add_custom_target(generated_shaders DEPENDS
-            shaderop_scale.h
-            shaderop_scale_8.h
-            shaderop_add.h
-            shaderop_addrow.h
-            shaderop_mul.h
-            shaderop_silu.h
-            shaderop_relu.h
-            shaderop_gelu.h
-            shaderop_softmax.h
-            shaderop_norm.h
-            shaderop_rmsnorm.h
-            shaderop_diagmask.h
-            shaderop_mul_mat_mat_f32.h
-            shaderop_mul_mat_f16.h
-            shaderop_mul_mat_q8_0.h
-            shaderop_mul_mat_q4_0.h
-            shaderop_mul_mat_q4_1.h
-            shaderop_mul_mat_q6_k.h
-            shaderop_getrows_f32.h
-            shaderop_getrows_f16.h
-            shaderop_getrows_q4_0.h
-            shaderop_getrows_q4_1.h
-            shaderop_getrows_q6_k.h
-            shaderop_rope_f16.h
-            shaderop_rope_f32.h
-            shaderop_cpy_f16_f16.h
-            shaderop_cpy_f16_f32.h
-            shaderop_cpy_f32_f16.h
-            shaderop_cpy_f32_f32.h
-        )
-
-        # Create a custom command that depends on the generated_shaders
-        add_custom_command(
-            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
-            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
-            DEPENDS generated_shaders
-            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
-        )
-
-        # Add the stamp to the main sources to ensure dependency tracking
-        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-        set(GGML_HEADERS_KOMPUTE ggml-kompute.h   ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-
-        add_compile_definitions(GGML_USE_KOMPUTE)
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
-    else()
-        message(WARNING "Kompute not found")
-    endif()
-endif()
-
-if (LLAMA_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-
-    add_compile_definitions(GGML_USE_CPU_HBM)
-
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
-
-function(get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            list(APPEND CXX_FLAGS -Wno-format-truncation)
-        endif()
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
 endfunction()
 
-if (LLAMA_FATAL_WARNINGS)
-    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        list(APPEND C_FLAGS   -Werror)
-        list(APPEND CXX_FLAGS -Werror)
-    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-        add_compile_options(/WX)
-    endif()
-endif()
-
-if (LLAMA_ALL_WARNINGS)
-    if (NOT MSVC)
-        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                                  -Werror=implicit-int -Werror=implicit-function-declaration)
-        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
-
-        list(APPEND C_FLAGS   ${WARNING_FLAGS})
-        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-    else()
-        # todo : msvc
-        set(C_FLAGS   "")
-        set(CXX_FLAGS "")
-    endif()
-endif()
-
-set(CUDA_CXX_FLAGS "")
-
-if (LLAMA_CUDA)
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (LLAMA_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
-    endif()
-
-    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    endif()
-endif()
-
-if (WIN32)
-    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-
-    if (BUILD_SHARED_LIBS)
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
-
-if (LLAMA_LTO)
-    include(CheckIPOSupported)
-    check_ipo_supported(RESULT result OUTPUT output)
-    if (result)
-        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
-    else()
-        message(WARNING "IPO is not supported: ${output}")
-    endif()
-endif()
-
-if (LLAMA_CCACHE)
-    find_program(LLAMA_CCACHE_FOUND ccache)
-    if (LLAMA_CCACHE_FOUND)
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
-        set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.")
-    else()
-        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF")
-    endif ()
-endif()
-
-# this version of Apple ld64 is buggy
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-    OUTPUT_QUIET
-)
-
-if (output MATCHES "dyld-1015\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
-# Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-#       feel free to update the Makefile for your architecture and send a pull request or issue
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-if (MSVC)
-    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
-else ()
-    set(CMAKE_GENERATOR_PLATFORM_LWR "")
-endif ()
-
-if (NOT MSVC)
-    if (LLAMA_STATIC)
-        add_link_options(-static)
-        if (MINGW)
-            add_link_options(-static-libgcc -static-libstdc++)
-        endif()
-    endif()
-    if (LLAMA_GPROF)
-        add_compile_options(-pg)
-    endif()
-endif()
-
-set(ARCH_FLAGS "")
-
-if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-    message(STATUS "ARM detected")
-    if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
-
-        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-        if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        endif ()
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
-        endif ()
-
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-        endif ()
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
-    else()
-        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
-            # Raspberry Pi 1, Zero
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-                # Android armeabi-v7a
-                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
-            else()
-                # Raspberry Pi 2
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
-            endif()
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-            # Android arm64-v8a
-            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            list(APPEND ARCH_FLAGS -mno-unaligned-access)
-        endif()
-        if (LLAMA_SVE)
-            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
-        endif()
-    endif()
-elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
-    message(STATUS "x86 detected")
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (LLAMA_NATIVE)
-            include(cmake/FindSIMD.cmake)
-        endif ()
-        if (LLAMA_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (LLAMA_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (LLAMA_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-            if (LLAMA_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
-            endif()
-        elseif (LLAMA_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (LLAMA_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
-        endif()
-    else()
-        if (LLAMA_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (LLAMA_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (LLAMA_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (LLAMA_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (LLAMA_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (LLAMA_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (LLAMA_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (LLAMA_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-        if (LLAMA_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
-    else()
-        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-    message(STATUS "loongarch64 detected")
-
-    list(APPEND ARCH_FLAGS -march=loongarch64)
-    if (LLAMA_LASX)
-        list(APPEND ARCH_FLAGS -mlasx)
-    endif()
-    if (LLAMA_LSX)
-        list(APPEND ARCH_FLAGS -mlsx)
-    endif()
-
-else()
-    message(STATUS "Unknown architecture")
-endif()
-
-add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-
-if (LLAMA_CUDA)
-    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
-    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-endif()
-
-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
-endif()
+llama_option_depr(FATAL_ERROR LLAMA_CUBLAS              GGML_CUDA)
+llama_option_depr(WARNING     LLAMA_CUDA                GGML_CUDA)
+llama_option_depr(WARNING     LLAMA_KOMPUTE             GGML_KOMPUTE)
+llama_option_depr(WARNING     LLAMA_METAL               GGML_METAL)
+llama_option_depr(WARNING     LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
+llama_option_depr(WARNING     LLAMA_NATIVE              GGML_NATIVE)
+llama_option_depr(WARNING     LLAMA_OPENMP              GGML_OPENMP)
+llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
+llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
+llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 
 #
-# POSIX conformance
+# build the library
 #
 
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
-    add_compile_definitions(_XOPEN_SOURCE=700)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions(_GNU_SOURCE)
-endif()
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (
-    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
-    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
-)
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
-
-#
-# libraries
-#
-
-# ggml
-
-add_library(ggml OBJECT
-            ggml.c
-            ggml.h
-            ggml-alloc.c
-            ggml-alloc.h
-            ggml-backend.c
-            ggml-backend.h
-            ggml-quants.c
-            ggml-quants.h
-            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
-            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
-            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
-            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
-            )
-
-target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
-target_compile_features   (ggml PUBLIC c_std_11) # don't bump
-
-target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-
-add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
-    target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-    install(TARGETS ggml_shared LIBRARY)
-endif()
-
-# llama
-
-add_library(llama
-            llama.cpp
-            llama.h
-            unicode.h
-            unicode.cpp
-            unicode-data.cpp
-            )
-
-target_include_directories(llama PUBLIC .)
-target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
-
-target_link_libraries(llama PRIVATE
-    ggml
-    ${LLAMA_EXTRA_LIBS}
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    if (LLAMA_METAL)
-        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-    endif()
-endif()
-
+add_subdirectory(ggml)
+add_subdirectory(src)
 
 #
 # install
@@ -1323,44 +114,35 @@ endif()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
-set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
-    CACHE PATH "Location of header files")
-set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
-    CACHE PATH "Location of library files")
-set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
-    CACHE PATH "Location of binary files")
-set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
-set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
+set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
+
 get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
 
+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
 configure_package_config_file(
-        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
-        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
-    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
     PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
               LLAMA_LIB_INSTALL_DIR
               LLAMA_BIN_INSTALL_DIR )
 
 write_basic_package_version_file(
-        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+        ${CMAKE_CURRENT_BINARY_DIR}/llama-config-version.cmake
     VERSION ${LLAMA_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)
 
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
-              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
-
-set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-        "${GGML_HEADERS_CUDA}"
-        "${GGML_HEADERS_METAL}"
-        "${GGML_HEADERS_EXTRA}")
-
-set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
-install(TARGETS ggml PUBLIC_HEADER)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
-install(TARGETS llama LIBRARY PUBLIC_HEADER)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/llama-config-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
 
 install(
     FILES convert-hf-to-gguf.py
@@ -1373,22 +155,6 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-if (LLAMA_METAL)
-    install(
-        FILES ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
-    if (NOT LLAMA_METAL_EMBED_LIBRARY)
-        install(
-            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-        )
-    endif()
-endif()
 
 configure_file(cmake/llama.pc.in
         "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
diff --git a/CMakePresets.json b/CMakePresets.json
index fba22af9a..d69bc0344 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -19,14 +19,14 @@
         "cacheVariables": {
             "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
             "CMAKE_CXX_COMPILER": "icx",
-            "LLAMA_SYCL": "ON",
+            "GGML_SYCL": "ON",
             "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
         }
     },
     { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
     { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
 
     {
         "name": "arm64-windows-msvc", "hidden": true,
diff --git a/Makefile b/Makefile
index f6e8eb73e..64a6e6ff0 100644
--- a/Makefile
+++ b/Makefile
@@ -61,8 +61,80 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-bpe \
 	tests/test-tokenizer-1-spm
 
-# Code coverage output files
-COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
+# Deprecation aliases
+ifdef LLAMA_CUBLAS
+$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
+endif
+
+ifdef LLAMA_CUDA
+GGML_CUDA := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_KOMPUTE
+GGML_KOMPUTE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_METAL
+GGML_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENMP
+GGML_OPENMP := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_RPC
+GGML_RPC := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL
+GGML_SYCL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL_F16
+GGML_SYCL_F16 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS
+GGML_OPENBLAS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS64
+GGML_OPENBLAS64 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_BLIS
+GGML_BLIS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_LLAMAFILE
+GGML_NO_LLAMAFILE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_ACCELERATE
+GGML_NO_ACCELERATE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_OPENMP
+GGML_NO_OPENMP := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_METAL
+GGML_NO_METAL := 1
+DEPRECATE_WARNING := 1
+endif
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -76,6 +148,12 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+MK_CFLAGS     += -O3
+MK_CXXFLAGS   += -O3
+ifndef LLAMA_DEBUG
+MK_NVCCFLAGS  += -O3
+endif # LLAMA_DEBUG
+
 # In GNU make default CXX is g++ instead of c++.  Let's fix that so that users
 # of non-gcc compilers don't have to provide g++ alias or wrapper.
 DEFCC  := cc
@@ -90,11 +168,11 @@ endif
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
-	ifndef LLAMA_NO_METAL
-		LLAMA_METAL := 1
+	ifndef GGML_NO_METAL
+		GGML_METAL := 1
 	endif
 
-	LLAMA_NO_OPENMP := 1
+	GGML_NO_OPENMP := 1
 
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
@@ -106,7 +184,11 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 
-ifdef LLAMA_RPC
+ifdef GGML_METAL
+	GGML_METAL_EMBED_LIBRARY := 1
+endif
+
+ifdef GGML_RPC
 	BUILD_TARGETS += rpc-server
 endif
 
@@ -147,18 +229,6 @@ test: $(TEST_TARGETS)
 
 all: $(BUILD_TARGETS) $(TEST_TARGETS)
 
-coverage: ## Run code coverage
-	gcov -pb tests/*.cpp
-
-lcov-report: coverage ## Generate lcov report
-	mkdir -p lcov-report
-	lcov --capture --directory . --output-file lcov-report/coverage.info
-	genhtml lcov-report/coverage.info --output-directory lcov-report
-
-gcovr-report: coverage ## Generate gcovr report
-	mkdir -p gcovr-report
-	gcovr --root . --html --html-details --output gcovr-report/coverage.html
-
 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
 CXX	:= riscv64-unknown-linux-gnu-g++
@@ -169,26 +239,11 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS  = -I. -Icommon
+MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
 MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
 
-# -Ofast tends to produce faster code, but may not be available for some compilers.
-ifdef LLAMA_FAST
-MK_CFLAGS     += -Ofast
-HOST_CXXFLAGS += -Ofast
-ifndef LLAMA_DEBUG
-MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
-else
-MK_CFLAGS     += -O3
-MK_CXXFLAGS   += -O3
-ifndef LLAMA_DEBUG
-MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
-endif # LLAMA_FAST
-
 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
@@ -243,8 +298,8 @@ ifeq ($(UNAME_S),OpenBSD)
 	MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
-ifdef LLAMA_SCHED_MAX_COPIES
-	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+ifdef GGML_SCHED_MAX_COPIES
+	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
 endif
 
 ifdef LLAMA_DEBUG
@@ -287,19 +342,31 @@ ifdef LLAMA_SERVER_SSL
 	MK_LDFLAGS += -lssl -lcrypto
 endif
 
-ifdef LLAMA_CODE_COVERAGE
-	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
-endif
-
 ifdef LLAMA_DISABLE_LOGS
 	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 
 # warnings
-WARN_FLAGS    = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
-				-Werror=implicit-function-declaration
-MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
+WARN_FLAGS = \
+	-Wall \
+	-Wextra \
+	-Wpedantic \
+	-Wcast-qual \
+	-Wno-unused-function
+
+MK_CFLAGS += \
+	$(WARN_FLAGS) \
+	-Wshadow \
+	-Wstrict-prototypes \
+	-Wpointer-arith \
+	-Wmissing-prototypes \
+	-Werror=implicit-int \
+	-Werror=implicit-function-declaration
+
+MK_CXXFLAGS += \
+	$(WARN_FLAGS) \
+	-Wmissing-declarations \
+	-Wmissing-noreturn
 
 ifeq ($(LLAMA_FATAL_WARNINGS),1)
 	MK_CFLAGS   += -Werror
@@ -434,7 +501,7 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifndef LLAMA_NO_ACCELERATE
+ifndef GGML_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
@@ -442,141 +509,157 @@ ifndef LLAMA_NO_ACCELERATE
 		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
 		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 		MK_LDFLAGS  += -framework Accelerate
-		OBJS        += ggml-blas.o
+		OBJ_GGML    += ggml/src/ggml-blas.o
 	endif
-endif # LLAMA_NO_ACCELERATE
+endif # GGML_NO_ACCELERATE
 
-ifndef LLAMA_NO_OPENMP
+ifndef GGML_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
-endif # LLAMA_NO_OPENMP
+endif # GGML_NO_OPENMP
 
-ifdef LLAMA_OPENBLAS
+ifdef GGML_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-	OBJS        += ggml-blas.o
-endif # LLAMA_OPENBLAS
+	OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_OPENBLAS
 
-ifdef LLAMA_OPENBLAS64
+ifdef GGML_OPENBLAS64
 	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
-	OBJS        += ggml-blas.o
-endif # LLAMA_OPENBLAS64
+	OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_OPENBLAS64
 
-ifdef LLAMA_BLIS
+ifdef GGML_BLIS
 	MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
-	OBJS        += ggml-blas.o
-endif # LLAMA_BLIS
+	OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_BLIS
 
-ifndef LLAMA_NO_LLAMAFILE
+ifndef GGML_NO_LLAMAFILE
 	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJS        += sgemm.o
+	OBJ_GGML    += ggml/src/sgemm.o
 endif
 
-ifdef LLAMA_RPC
-	MK_CPPFLAGS   += -DGGML_USE_RPC
-	OBJS          += ggml-rpc.o
-endif # LLAMA_RPC
+ifdef GGML_RPC
+	MK_CPPFLAGS += -DGGML_USE_RPC
+	OBJ_GGML    += ggml/src/ggml-rpc.o
+endif # GGML_RPC
 
-ifdef LLAMA_CUBLAS
-# LLAMA_CUBLAS is deprecated and will be removed in the future
-	LLAMA_CUDA := 1
-endif
+OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJ_CUDA_TMPL     += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
 
-OBJS_CUDA_TEMP_INST      = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
-OBJS_CUDA_TEMP_INST     += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
-ifdef LLAMA_CUDA_FA_ALL_QUANTS
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+ifdef GGML_CUDA_FA_ALL_QUANTS
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu))
 else
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
-endif # LLAMA_CUDA_FA_ALL_QUANTS
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # GGML_CUDA_FA_ALL_QUANTS
 
-ifdef LLAMA_CUDA
+ifdef GGML_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
+
 	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-	OBJS         += ggml-cuda.o
-	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-	OBJS         += $(OBJS_CUDA_TEMP_INST)
 	MK_NVCCFLAGS += -use_fast_math
+
+	OBJ_GGML += ggml/src/ggml-cuda.o
+	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML += $(OBJ_CUDA_TMPL)
+
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
+
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
-ifdef LLAMA_CUDA_DEBUG
+
+ifdef GGML_CUDA_DEBUG
 	MK_NVCCFLAGS += --device-debug
-endif # LLAMA_CUDA_DEBUG
-ifdef LLAMA_CUDA_NVCC
-	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
+endif # GGML_CUDA_DEBUG
+
+ifdef GGML_CUDA_NVCC
+	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
 	NVCC = $(CCACHE) nvcc
-endif #LLAMA_CUDA_NVCC
+endif #GGML_CUDA_NVCC
+
 ifdef CUDA_DOCKER_ARCH
 	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else ifndef CUDA_POWER_ARCH
 	MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
-ifdef LLAMA_CUDA_FORCE_DMMV
+
+ifdef GGML_CUDA_FORCE_DMMV
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_DMMV
+
+ifdef GGML_CUDA_FORCE_MMQ
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # LLAMA_CUDA_FORCE_MMQ
-ifdef LLAMA_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
-endif # LLAMA_CUDA_FORCE_CUBLAS
-ifdef LLAMA_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_DMMV_X
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # LLAMA_CUDA_DMMV_X
-ifdef LLAMA_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-else ifdef LLAMA_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+endif # GGML_CUDA_DMMV_X
+
+ifdef GGML_CUDA_MMV_Y
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
+else ifdef GGML_CUDA_DMMV_Y
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # LLAMA_CUDA_MMV_Y
-ifdef LLAMA_CUDA_F16
+endif # GGML_CUDA_MMV_Y
+
+ifdef GGML_CUDA_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_F16
-ifdef LLAMA_CUDA_DMMV_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_DMMV_F16
-ifdef LLAMA_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_KQUANTS_ITER
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
 else
 	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
-ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
-endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-ifdef LLAMA_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
 	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
-endif # LLAMA_CUDA_NO_PEER_COPY
-ifdef LLAMA_CUDA_CCBIN
-	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif # LLAMA_CUDA_CCBIN
-ifdef LLAMA_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_CCBIN
+	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
+endif # GGML_CUDA_CCBIN
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
 	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
-endif # LLAMA_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
 
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -588,135 +671,187 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
 	$(NVCC_COMPILE)
 
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ggml/src/ggml-cuda.o: \
+	ggml/src/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
-endif # LLAMA_CUDA
+endif # GGML_CUDA
 
-ifdef LLAMA_VULKAN
-	MK_CPPFLAGS  += -DGGML_USE_VULKAN
-	MK_LDFLAGS += -lvulkan
-	OBJS    += ggml-vulkan.o
+ifdef GGML_VULKAN
+	MK_CPPFLAGS += -DGGML_USE_VULKAN
+	MK_LDFLAGS  += -lvulkan
+	OBJ_GGML    += ggml/src/ggml-vulkan.o
 
-ifdef LLAMA_VULKAN_CHECK_RESULTS
+ifdef GGML_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
 endif
 
-ifdef LLAMA_VULKAN_DEBUG
+ifdef GGML_VULKAN_DEBUG
 	MK_CPPFLAGS  += -DGGML_VULKAN_DEBUG
 endif
 
-ifdef LLAMA_VULKAN_MEMORY_DEBUG
+ifdef GGML_VULKAN_MEMORY_DEBUG
 	MK_CPPFLAGS  += -DGGML_VULKAN_MEMORY_DEBUG
 endif
 
-ifdef LLAMA_VULKAN_VALIDATE
+ifdef GGML_VULKAN_VALIDATE
 	MK_CPPFLAGS  += -DGGML_VULKAN_VALIDATE
 endif
 
-ifdef LLAMA_VULKAN_RUN_TESTS
+ifdef GGML_VULKAN_RUN_TESTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_RUN_TESTS
 endif
 
-ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+ggml/src/ggml-vulkan.o: \
+	ggml/src/ggml-vulkan.cpp \
+	ggml/include/ggml-vulkan.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_VULKAN
+endif # GGML_VULKAN
 
-ifdef LLAMA_HIPBLAS
+ifdef GGML_HIPBLAS
 	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH	?= /usr
+		ROCM_PATH      ?= /usr
 		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
 		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
-	HIPCC                   ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
-	LLAMA_CUDA_DMMV_X       ?= 32
-	LLAMA_CUDA_MMV_Y        ?= 1
-	LLAMA_CUDA_KQUANTS_ITER ?= 2
+
+	GGML_CUDA_DMMV_X       ?= 32
+	GGML_CUDA_MMV_Y        ?= 1
+	GGML_CUDA_KQUANTS_ITER ?= 2
+
 	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
-ifdef LLAMA_HIP_UMA
+
+ifdef GGML_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # LLAMA_HIP_UMA
-	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
-	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
-	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-ifdef LLAMA_CUDA_FORCE_DMMV
-	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_NO_PEER_COPY
-	HIPFLAGS 	+= -DGGML_CUDA_NO_PEER_COPY
-endif # LLAMA_CUDA_NO_PEER_COPY
-	OBJS        += ggml-cuda.o
-	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-	OBJS        += $(OBJS_CUDA_TEMP_INST)
+endif # GGML_HIP_UMA
 
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
+	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+
+	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
+
+	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
+	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
+	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
+	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
+
+ifdef GGML_CUDA_FORCE_DMMV
+	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # GGML_CUDA_FORCE_DMMV
+
+ifdef GGML_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+	OBJ_GGML += ggml/src/ggml-cuda.o
+	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda.o: \
+	ggml/src/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # GGML_HIPBLAS
 
-endif # LLAMA_HIPBLAS
-
-ifdef LLAMA_METAL
+ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJS		+= ggml-metal.o
-ifdef LLAMA_METAL_NDEBUG
+	OBJ_GGML	+= ggml/src/ggml-metal.o
+ifdef GGML_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
-ifdef LLAMA_METAL_EMBED_LIBRARY
+ifdef GGML_METAL_EMBED_LIBRARY
 	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-	OBJS        += ggml-metal-embed.o
+	OBJ_GGML   += ggml/src/ggml-metal-embed.o
 endif
-endif # LLAMA_METAL
+endif # GGML_METAL
 
-ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
+ifdef GGML_METAL
+ggml/src/ggml-metal.o: \
+	ggml/src/ggml-metal.m \
+	ggml/include/ggml-metal.h \
+	ggml/include/ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal ggml-common.h
+ifdef GGML_METAL_EMBED_LIBRARY
+ggml/src/ggml-metal-embed.o: \
+	ggml/src/ggml-metal.metal \
+	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
-	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
+	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib"   >  $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start"        >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:"              >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end"          >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:"                >> $(TEMP_ASSEMBLY)
+	@echo ".section __DATA, __ggml_metallib"            >  $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start"                 >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:"                       >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end"                   >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:"                         >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
-endif # LLAMA_METAL
+endif # GGML_METAL
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+OBJ_GGML += \
+	ggml/src/ggml.o \
+	ggml/src/ggml-alloc.o \
+	ggml/src/ggml-backend.o \
+	ggml/src/ggml-quants.o
 
-ifndef LLAMA_NO_LLAMAFILE
-sgemm.o: sgemm.cpp sgemm.h ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
+OBJ_LLAMA = \
+	src/llama.o \
+	src/unicode.o \
+	src/unicode-data.o
 
-ifdef LLAMA_RPC
-ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+OBJ_COMMON = \
+	common/common.o \
+	common/console.o \
+	common/ngram-cache.o \
+	common/sampling.o \
+	common/train.o \
+	common/grammar-parser.o \
+	common/build-info.o \
+	common/json-schema-to-grammar.o
 
-rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
 
-rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif # LLAMA_RPC
+LIB_GGML   = $(LIB_PRE)ggml$(DSO_EXT)
+LIB_GGML_S = $(LIB_PRE)ggml.a
+
+LIB_LLAMA   = $(LIB_PRE)llama$(DSO_EXT)
+LIB_LLAMA_S = $(LIB_PRE)llama.a
+
+LIB_COMMON   = $(LIB_PRE)common$(DSO_EXT)
+LIB_COMMON_S = $(LIB_PRE)common.a
+
+LIB_ALL   = $(LIB_GGML)   $(LIB_LLAMA)   $(LIB_COMMON)
+LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
 
 GF_CC := $(CC)
 include scripts/get-flags.mk
@@ -730,7 +865,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUDA
+ifdef GGML_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -755,85 +890,203 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
 $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
-ifdef LLAMA_CUDA
+ifdef GGML_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
 $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
+
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUDA
+endif # GGML_CUDA
 $(info )
 
-ifdef LLAMA_CUBLAS
-$(info !!!!)
-$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
-$(info !!!!)
+ifdef DEPRECATE_WARNING
+$(info !!! DEPRECATION WARNING !!!)
+$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
+$(info   - LLAMA_CUDA)
+$(info   - LLAMA_METAL)
+$(info   - LLAMA_METAL_EMBED_LIBRARY)
+$(info   - LLAMA_OPENMP)
+$(info   - LLAMA_RPC)
+$(info   - LLAMA_SYCL)
+$(info   - LLAMA_SYCL_F16)
+$(info   - LLAMA_OPENBLAS)
+$(info   - LLAMA_OPENBLAS64)
+$(info   - LLAMA_BLIS)
+$(info   - LLAMA_NO_LLAMAFILE)
+$(info   - LLAMA_NO_ACCELERATE)
+$(info   - LLAMA_NO_OPENMP)
+$(info   - LLAMA_NO_METAL)
 $(info )
 endif
 
 #
-# Build library
+# Build libraries
 #
 
-ggml.o: ggml.c ggml.h ggml-cuda.h
+# ggml
+
+ggml/src/ggml.o: \
+	ggml/src/ggml.c \
+	ggml/include/ggml.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
+ggml/src/ggml-alloc.o: \
+	ggml/src/ggml-alloc.c \
+	ggml/include/ggml.h \
+	ggml/include/ggml-alloc.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+ggml/src/ggml-backend.o: \
+	ggml/src/ggml-backend.c \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
+ggml/src/ggml-quants.o: \
+	ggml/src/ggml-quants.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-quants.h \
+	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@
 
-ggml-blas.o: ggml-blas.cpp ggml-blas.h
+ggml/src/ggml-blas.o: \
+	ggml/src/ggml-blas.cpp \
+	ggml/include/ggml-blas.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-unicode.o: unicode.cpp unicode.h
+ifndef GGML_NO_LLAMAFILE
+ggml/src/sgemm.o: \
+	ggml/src/sgemm.cpp \
+	ggml/src/sgemm.h \
+	ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_NO_LLAMAFILE
 
-unicode-data.o: unicode-data.cpp unicode-data.h
+ifdef GGML_RPC
+ggml/src/ggml-rpc.o: \
+	ggml/src/ggml-rpc.cpp \
+	ggml/include/ggml-rpc.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_RPC
 
-llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common.o: common/common.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-console.o: common/console.cpp common/console.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-train.o: common/train.cpp common/train.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-libllama.so: llama.o ggml.o $(OBJS)
+$(LIB_GGML): \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
-	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+$(LIB_GGML_S): \
+	$(OBJ_GGML)
+	ar rcs $(LIB_GGML_S) $^
+
+# llama
+
+src/unicode.o: \
+	src/unicode.cpp \
+	src/unicode.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/unicode-data.o: \
+	src/unicode-data.cpp \
+	src/unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama.o: \
+	src/llama.cpp \
+	src/unicode.h \
+	include/llama.h \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml-metal.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-alloc.h \
+	ggml/include/ggml-backend.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(LIB_LLAMA): \
+	$(OBJ_LLAMA) \
+	$(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): \
+	$(OBJ_LLAMA)
+	ar rcs $(LIB_LLAMA_S) $^
+
+# common
+
+common/common.o: \
+	common/common.cpp \
+	common/common.h \
+	common/console.h \
+	common/sampling.h \
+	common/json.hpp \
+	common/json-schema-to-grammar.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/sampling.o: \
+	common/sampling.cpp \
+	common/sampling.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/console.o: \
+	common/console.cpp \
+	common/console.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/grammar-parser.o: \
+	common/grammar-parser.cpp \
+	common/grammar-parser.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/json-schema-to-grammar.o: \
+	common/json-schema-to-grammar.cpp \
+	common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/train.o: \
+	common/train.cpp \
+	common/train.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/ngram-cache.o: \
+	common/ngram-cache.cpp \
+	common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(LIB_COMMON): \
+	$(OBJ_COMMON) \
+	$(LIB_LLAMA) \
+	$(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_COMMON_S): \
+	$(OBJ_COMMON)
+	ar rcs $(LIB_COMMON_S) $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	rm -vrf ggml-cuda/*.o
-	rm -vrf ggml-cuda/template-instances/*.o
+	rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -rvf src/*.o
+	rm -rvf tests/*.o
+	rm -rvf examples/*.o
+	rm -rvf *.a
+	rm -rvf *.dll
+	rm -rvf *.so
+	rm -rvf common/*.o
+	rm -rvf ggml/*.a
+	rm -rvf ggml/*.dll
+	rm -rvf ggml/*.so
+	rm -vrf ggml/src/*.o
+	rm -rvf common/build-info.cpp
+	rm -vrf ggml/src/ggml-metal-embed.metal
+	rm -vrf ggml/src/ggml-cuda/*.o
+	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -rvf $(BUILD_TARGETS)
+	rm -rvf $(TEST_TARGETS)
 	find examples pocs -type f -name "*.o" -delete
 
 #
@@ -847,62 +1100,202 @@ clean:
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
 
-llama-cli: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-cli: examples/main/main.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo
 
-llama-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-infill: examples/infill/infill.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-simple: examples/simple/simple.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-tokenize: examples/tokenize/tokenize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-tokenize: examples/tokenize/tokenize.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched: examples/batched/batched.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched-bench: examples/batched-bench/batched-bench.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-quantize: examples/quantize/quantize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-quantize: examples/quantize/quantize.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.o llama.o $(OBJS)
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-perplexity: examples/perplexity/perplexity.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-imatrix: examples/imatrix/imatrix.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-imatrix: examples/imatrix/imatrix.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-embedding: examples/embedding/embedding.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-gritlm: examples/gritlm/gritlm.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gritlm: examples/gritlm/gritlm.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-save-load-state: examples/save-load-state/save-load-state.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-gguf: examples/gguf/gguf.cpp \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gguf-split: examples/gguf-split/gguf-split.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-eval-callback: examples/eval-callback/eval-callback.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
+	$(OBJ_GGML) $(OBJ_LLAMA)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-baby-llama: examples/baby-llama/baby-llama.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-finetune: examples/finetune/finetune.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-export-lora: examples/export-lora/export-lora.cpp \
+	$(OBJ_GGML) common/log.h
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-retrieval: examples/retrieval/retrieval.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-speculative: examples/speculative/speculative.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-parallel: examples/parallel/parallel.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookahead: examples/lookahead/lookahead.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup: examples/lookup/lookup.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-create: examples/lookup/lookup-create.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-merge: examples/lookup/lookup-merge.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-stats: examples/lookup/lookup-stats.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-passkey: examples/passkey/passkey.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+ifdef GGML_RPC
+rpc-server: examples/rpc/rpc-server.cpp \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # GGML_RPC
+
+llama-server: \
+	examples/server/server.cpp \
+	examples/server/utils.hpp \
+	examples/server/httplib.h \
+	examples/server/colorthemes.css.hpp \
+	examples/server/style.css.hpp \
+	examples/server/theme-beeninorder.css.hpp \
+	examples/server/theme-ketivah.css.hpp \
+	examples/server/theme-mangotango.css.hpp \
+	examples/server/theme-playground.css.hpp \
+	examples/server/theme-polarnight.css.hpp \
+	examples/server/theme-snowstorm.css.hpp \
+	examples/server/index.html.hpp \
+	examples/server/index-new.html.hpp \
+	examples/server/index.js.hpp \
+	examples/server/completion.js.hpp \
+	examples/server/system-prompts.js.hpp \
+	examples/server/prompt-formats.js.hpp \
+	examples/server/json-schema-to-grammar.mjs.hpp \
+	common/json.hpp \
+	common/stb_image.h \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -915,95 +1308,26 @@ examples/server/%.hpp: examples/server/public/% Makefile
 		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
 	) > $@
 
-llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+libllava.a: examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	common/stb_image.h \
+	common/base64.hpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
 
-llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-llava-cli: examples/llava/llava-cli.cpp \
+	examples/llava/clip.h \
+	examples/llava/clip.cpp \
+	examples/llava/llava.h \
+	examples/llava/llava.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
 
-llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -1017,7 +1341,7 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 		rm $@.tmp; \
 	fi
 
-build-info.o: common/build-info.cpp
+common/build-info.o: common/build-info.cpp
 	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
 
 #
@@ -1026,7 +1350,8 @@ build-info.o: common/build-info.cpp
 
 tests: $(TEST_TARGETS)
 
-llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
+llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \
+	$(OBJ_GGML) common/build-info.o
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -1035,85 +1360,108 @@ run-benchmark-matmult: llama-benchmark-matmult
 
 .PHONY: run-benchmark-matmult swift
 
-llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp \
+	$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
+tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-c.o: tests/test-c.c llama.h
+tests/test-c.o: tests/test-c.c include/llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
 
-tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
+tests/test-backend-ops: tests/test-backend-ops.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-chat-template: tests/test-chat-template.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# PoCs
+#
+
+llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/Package.swift b/Package.swift
index 183e64757..77fed86df 100644
--- a/Package.swift
+++ b/Package.swift
@@ -3,14 +3,13 @@
 import PackageDescription
 
 var sources = [
-    "ggml.c",
-    "sgemm.cpp",
-    "llama.cpp",
-    "unicode.cpp",
-    "unicode-data.cpp",
-    "ggml-alloc.c",
-    "ggml-backend.c",
-    "ggml-quants.c",
+    "src/llama.cpp",
+    "src/unicode.cpp",
+    "src/unicode-data.cpp",
+    "ggml/src/ggml.c",
+    "ggml/src/ggml-alloc.c",
+    "ggml/src/ggml-backend.c",
+    "ggml/src/ggml-quants.c",
 ]
 
 var resources: [Resource] = []
@@ -26,8 +25,8 @@ var cSettings: [CSetting] =  [
 ]
 
 #if canImport(Darwin)
-sources.append("ggml-metal.m")
-resources.append(.process("ggml-metal.metal"))
+sources.append("ggml/src/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [
@@ -63,8 +62,6 @@ let package = Package(
                "models",
                "tests",
                "CMakeLists.txt",
-               "ggml-cuda.cu",
-               "ggml-cuda.h",
                "Makefile"
             ],
             sources: sources,
diff --git a/README-sycl.md b/README-sycl.md
index b7e2bb12a..885983e92 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
 ```
 
 *Notes*:
 
-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
 
 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
 
@@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
 # Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 
 # build all binary
 cmake --build build --config Release -j -v
@@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
 # Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 
 # build all binary
 cmake --build build --config Release -j -v
@@ -422,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
 
 # Option 2: Or FP16
-cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 
 cmake --build build --config Release -j
 ```
@@ -440,7 +440,7 @@ Or, use CMake presets to build:
 cmake --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-cli
 
-cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-cli
 
 cmake --preset x64-windows-sycl-debug
@@ -544,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 
 | Name               | Value                             | Function                                    |
 |--------------------|-----------------------------------|---------------------------------------------|
-| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
-| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.           |
+| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
 
diff --git a/README.md b/README.md
index 95d970d83..6ca5ba43e 100644
--- a/README.md
+++ b/README.md
@@ -415,7 +415,7 @@ Flox follows the nixpkgs build of llama.cpp.
 ### Metal Build
 
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
+To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
 
 When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.
@@ -435,7 +435,7 @@ Building the program with BLAS support may lead to some performance improvements
   - Using `make`:
     - On Linux:
       ```bash
-      make LLAMA_OPENBLAS=1
+      make GGML_OPENBLAS=1
       ```
 
     - On Windows:
@@ -450,13 +450,13 @@ Building the program with BLAS support may lead to some performance improvements
       8. From here you can run:
 
           ```bash
-          make LLAMA_OPENBLAS=1
+          make GGML_OPENBLAS=1
           ```
 
   - Using `CMake` on Linux:
 
       ```bash
-      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
       cmake --build build --config Release
       ```
 
@@ -475,10 +475,10 @@ Building the program with BLAS support may lead to some performance improvements
   Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
 
   - Using manual oneAPI installation:
-    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+    By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and passed `-DGGML_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. Otherwise, please install oneAPI and follow the steps below:
       ```bash
       source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
       cmake --build build --config Release
       ```
 
@@ -495,28 +495,28 @@ Building the program with BLAS support may lead to some performance improvements
 
   - Using `make`:
     ```bash
-    make LLAMA_CUDA=1
+    make GGML_CUDA=1
     ```
   - Using `CMake`:
 
     ```bash
-    cmake -B build -DLLAMA_CUDA=ON
+    cmake -B build -DGGML_CUDA=ON
     cmake --build build --config Release
     ```
 
   The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
 
-  | Option                         | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
-  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
-  | LLAMA_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
-  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
-  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-  | LLAMA_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
+  | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
+  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+  | GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
+  | GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
+  | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
+  | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
+  | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+  | GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
+  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
+  | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
 
 - #### hipBLAS
 
@@ -526,15 +526,15 @@ Building the program with BLAS support may lead to some performance improvements
 
   - Using `make`:
     ```bash
-    make LLAMA_HIPBLAS=1
+    make GGML_HIPBLAS=1
     ```
   - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
     ```bash
     HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
         && cmake --build build --config Release -- -j 16
     ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
     However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
     Note that if you get the following error:
@@ -548,19 +548,19 @@ Building the program with BLAS support may lead to some performance improvements
     ```bash
     HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
     HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
         && cmake --build build -- -j 16
     ```
 
   - Using `make` (example for target gfx1030, build with 16 CPU threads):
     ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+    make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
     ```
 
   - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
     ```bash
     set PATH=%HIP_PATH%\bin;%PATH%
-    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
     cmake --build build
     ```
     Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -571,11 +571,11 @@ Building the program with BLAS support may lead to some performance improvements
   If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
   The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
 
-  | Option                  | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-  |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
+  | Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
+  |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
+  | GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
 
 - #### Vulkan
 
@@ -613,7 +613,7 @@ Building the program with BLAS support may lead to some performance improvements
   Then, build llama.cpp using the cmake command below:
 
   ```bash
-  cmake -B build -DLLAMA_VULKAN=1
+  cmake -B build -DGGML_VULKAN=1
   cmake --build build --config Release
   # Test the output binary (with "-ngl 33" to offload all layers to GPU)
   ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
diff --git a/ci/run.sh b/ci/run.sh
index 291c44f47..e0cedb24f 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -36,11 +36,11 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 ## helpers
 
@@ -284,7 +284,7 @@ function gg_run_open_llama_7b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -550,7 +550,7 @@ function gg_run_pythia_2_8b {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
diff --git a/scripts/build-info.cmake b/cmake/build-info.cmake
similarity index 100%
rename from scripts/build-info.cmake
rename to cmake/build-info.cmake
diff --git a/cmake/git-vars.cmake b/cmake/git-vars.cmake
new file mode 100644
index 000000000..1a4c24ebf
--- /dev/null
+++ b/cmake/git-vars.cmake
@@ -0,0 +1,22 @@
+find_package(Git) # provides GIT_EXECUTABLE when git is found
+
+# the commit's SHA1 (abbreviated, --abbrev=8); the --match pattern is meant to match no tag, so --always falls back to the bare hash
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_SHA1
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the author date of the commit (%ad), rendered in the local timezone (--date=local)
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_DATE
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject (%s, i.e. the title line of the message) of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%s
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/scripts/LlamaConfig.cmake.in b/cmake/llama-config.cmake.in
similarity index 73%
rename from scripts/LlamaConfig.cmake.in
rename to cmake/llama-config.cmake.in
index 9311055d9..2e7da2f8e 100644
--- a/scripts/LlamaConfig.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -1,41 +1,43 @@
-set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
+set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
 set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
-set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
-set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUDA @LLAMA_CUDA@)
-set(LLAMA_METAL @LLAMA_METAL@)
-set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
-set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
+set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
+
+set(GGML_BLAS       @GGML_BLAS@)
+set(GGML_CUDA       @GGML_CUDA@)
+set(GGML_METAL      @GGML_METAL@)
+set(GGML_HIPBLAS    @GGML_HIPBLAS@)
+set(GGML_ACCELERATE @GGML_ACCELERATE@)
 
 @PACKAGE_INIT@
 
 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
-set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
 
 # Ensure transient dependencies satisfied
 
 find_package(Threads REQUIRED)
-if (APPLE AND LLAMA_ACCELERATE)
+
+if (APPLE AND GGML_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
 endif()
 
-if (LLAMA_BLAS)
+if (GGML_BLAS)
     find_package(BLAS REQUIRED)
 endif()
 
-if (LLAMA_CUDA)
+if (GGML_CUDA)
     find_package(CUDAToolkit REQUIRED)
 endif()
 
-if (LLAMA_METAL)
+if (GGML_METAL)
     find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
     find_library(METAL_FRAMEWORK Metal REQUIRED)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()
 
-if (LLAMA_HIPBLAS)
+if (GGML_HIPBLAS)
     find_package(hip REQUIRED)
     find_package(hipblas REQUIRED)
     find_package(rocblas REQUIRED)
@@ -47,7 +49,9 @@ find_library(llama_LIBRARY llama
 
 set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
 set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
+
 add_library(llama UNKNOWN IMPORTED)
+
 set_target_properties(llama
     PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 171530c91..761971d68 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,5 +1,6 @@
 # common
 
+find_package(Threads REQUIRED)
 
 # Build info header
 #
@@ -36,7 +37,7 @@ add_custom_command(
     COMMENT "Generating build details from Git"
     COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
             -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
     DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
     VERBATIM
@@ -83,5 +84,5 @@ if (LLAMA_CURL)
 endif ()
 
 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_compile_features   (${TARGET} PUBLIC cxx_std_11)
+target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/scripts/gen-build-info-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake
similarity index 86%
rename from scripts/gen-build-info-cpp.cmake
rename to common/cmake/build-info-gen-cpp.cmake
index d89338920..fbc92b52c 100644
--- a/scripts/gen-build-info-cpp.cmake
+++ b/common/cmake/build-info-gen-cpp.cmake
@@ -1,7 +1,7 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+set(OUTPUT_FILE   "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 
 # Only write the build info if it changed
 if(EXISTS ${OUTPUT_FILE})
diff --git a/docs/BLIS.md b/docs/BLIS.md
index c933766b7..35d06bd0f 100644
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
 Makefile:
 
 ```bash
-make LLAMA_BLIS=1 -j
-# make LLAMA_BLIS=1 benchmark-matmult
+make GGML_BLIS=1 -j
+# make GGML_BLIS=1 llama-benchmark-matmult
 ```
 
 CMake:
@@ -39,7 +39,7 @@ CMake:
 ```bash
 mkdir build
 cd build
-cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
+cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
 make -j
 ```
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0b51c44c0..7d9ab3457 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -39,13 +39,13 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (LLAMA_RPC)
+    if (GGML_RPC)
         add_subdirectory(rpc)
     endif()
     if (LLAMA_BUILD_SERVER)
     add_subdirectory(server)
     endif()
-    if (LLAMA_SYCL)
+    if (GGML_SYCL)
         add_subdirectory(sycl)
     endif()
     add_subdirectory(save-load-state)
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 38b36ee5a..29602881a 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 
 ```bash
-LLAMA_CUDA=1 make -j
+GGML_CUDA=1 make -j
 
 # generate importance matrix (imatrix.dat)
 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 05a8207e6..f6c619c87 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -194,7 +194,7 @@ llama_print_timings:       total time =   44411.01 ms /   377 tokens
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
 ```
 ### run on Orin
 ### case 1
diff --git a/examples/rpc/README.md b/examples/rpc/README.md
index 86544e3fe..e1da801f2 100644
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d
 
 ## Usage
 
-On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
+On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
 For example, to build the CUDA backend with RPC support:
 
 ```bash
 mkdir build-rpc-cuda
 cd build-rpc-cuda
-cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
+cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
 cmake --build . --config Release
 ```
 
@@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
 
 
-On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
+On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
 
 ```bash
 mkdir build-rpc
 cd build-rpc
-cmake .. -DLLAMA_RPC=ON
+cmake .. -DGGML_RPC=ON
 cmake --build . --config Release
 ```
 
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 8365f9510..dbe41f1fd 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,7 +1,14 @@
 set(TARGET llama-server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+option(LLAMA_SERVER_SSL     "Build SSL support for the server"        OFF)
+
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+
+if (MINGW)
+    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
 set(TARGET_SRCS
     server.cpp
     utils.hpp
@@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
     prompt-formats.js
     json-schema-to-grammar.mjs
 )
+
 foreach(asset ${PUBLIC_ASSETS})
     set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
     set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
@@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
         COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
     )
 endforeach()
+
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
+
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+
 if (LLAMA_SERVER_SSL)
     find_package(OpenSSL REQUIRED)
     target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
     target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
 endif()
+
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
+
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
index db46d57ca..8fe0a6790 100755
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
 
 #for FP32
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
 #build example/main
 #cmake --build . --config Release --target main
diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat
index 027173b0a..cdae5a528 100644
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
 
 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" ..  -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 
 ::  for FP32
-cmake -G "Ninja" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" ..  -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
new file mode 100644
index 000000000..f3763f7eb
--- /dev/null
+++ b/ggml/CMakeLists.txt
@@ -0,0 +1,238 @@
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+project("ggml" C CXX)
+include(CheckIncludeFileCXX)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(GGML_STANDALONE ON)
+
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+    # configure project version
+    # TODO
+else()
+    set(GGML_STANDALONE OFF)
+endif()
+
+if (EMSCRIPTEN)
+    set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
+else()
+    if (MINGW)
+        set(BUILD_SHARED_LIBS_DEFAULT OFF)
+    else()
+        set(BUILD_SHARED_LIBS_DEFAULT ON)
+    endif()
+endif()
+
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+
+#
+# option list
+#
+
+# TODO: mark all options as advanced when not GGML_STANDALONE
+
+if (APPLE)
+    set(GGML_METAL_DEFAULT ON)
+    set(GGML_BLAS_DEFAULT ON)
+    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
+else()
+    set(GGML_METAL_DEFAULT OFF)
+    set(GGML_BLAS_DEFAULT OFF)
+    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
+endif()
+
+# general
+option(GGML_STATIC "ggml: static link libraries"         OFF)
+option(GGML_NATIVE "ggml: enable -march=native flag"     ON)
+option(GGML_LTO    "ggml: enable link time optimization" OFF)
+option(GGML_CCACHE "ggml: use ccache if available"       ON)
+
+# debug
+option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
+option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
+
+# build
+option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
+
+# sanitizers
+option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
+option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
+option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+# instruction set specific: INS_ENB is the default for the AVX/AVX2/FMA/F16C options below (OFF when GGML_NATIVE is ON, otherwise ON)
+if (GGML_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
+
+option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
+option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
+option(GGML_AVX512      "ggml: enable AVX512"           OFF)
+option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
+option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
+option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
+option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
+if (NOT MSVC)
+    option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
+endif()
+option(GGML_LASX        "ggml: enable lasx"             ON)
+option(GGML_LSX         "ggml: enable lsx"              ON)
+option(GGML_SVE         "ggml: enable SVE"              OFF)
+
+if (WIN32)
+    set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") # passed to the compiler as _WIN32_WINNT (see examples/server/CMakeLists.txt)
+endif()
+
+# ggml core
+set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+
+# 3rd party libs / backends
+option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
+option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
+set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
+                                            "ggml: BLAS library vendor")
+option(GGML_LLAMAFILE                       "ggml: use ggml SGEMM"                            OFF)
+
+option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
+option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
+option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
+set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
+set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
+option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
+set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
+                                            "ggml: iters./thread per block for Q2_K/Q6_K")
+set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+                                            "ggml: max. batch size for using peer access")
+option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
+option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
+option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
+
+option(GGML_CURL                            "ggml: use libcurl to download model from an URL" OFF)
+option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
+option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
+option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
+option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
+option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
+option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
+option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
+option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
+option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
+option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
+option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
+option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
+option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
+set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+                                            "ggml: metal minimum macOS version")
+set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
+option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
+option(GGML_RPC                             "ggml: use RPC"                                   OFF)
+option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
+option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
+set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
+                                            "ggml: sycl target device")
+
+# extra artifacts
+option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
+option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+#
+# dependencies
+#
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED true)
+
+if (GGML_SYCL)
+    set(CMAKE_CXX_STANDARD 17)
+else()
+    set(CMAKE_CXX_STANDARD 11)
+endif()
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+find_package(Threads REQUIRED)
+
+#
+# build the library
+#
+
+add_subdirectory(src)
+
+#
+# tests and examples
+#
+
+if (GGML_BUILD_TESTS)
+    enable_testing()
+    add_subdirectory(tests)
+endif ()
+
+if (GGML_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif ()
+
+#
+# install
+#
+
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(GGML_PUBLIC_HEADERS
+    include/ggml.h
+    include/ggml-alloc.h
+    include/ggml-backend.h
+    "${GGML_HEADERS_CUDA}"
+    "${GGML_HEADERS_METAL}"
+    "${GGML_HEADERS_EXTRA}")
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+#if (GGML_METAL)
+#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
+#endif()
+install(TARGETS ggml PUBLIC_HEADER)
+
+if (BUILD_SHARED_LIBS)
+    install(TARGETS ggml LIBRARY)
+endif()
+
+if (GGML_METAL)
+    install(
+        FILES src/ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+    if (NOT GGML_METAL_EMBED_LIBRARY)
+        install(
+            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        )
+    endif()
+endif()
+
+if (GGML_STANDALONE)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        @ONLY)
+
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        DESTINATION share/pkgconfig)
+endif()
diff --git a/cmake/FindSIMD.cmake b/ggml/cmake/FindSIMD.cmake
similarity index 94%
rename from cmake/FindSIMD.cmake
rename to ggml/cmake/FindSIMD.cmake
index 33377ec44..5533668ec 100644
--- a/cmake/FindSIMD.cmake
+++ b/ggml/cmake/FindSIMD.cmake
@@ -79,22 +79,22 @@ endmacro()
 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
-    set(LLAMA_AVX OFF)
+    set(GGML_AVX OFF)
 else()
-    set(LLAMA_AVX ON)
+    set(GGML_AVX ON)
 endif()
 
 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-    set(LLAMA_AVX2 OFF)
+    set(GGML_AVX2 OFF)
 else()
-    set(LLAMA_AVX2 ON)
+    set(GGML_AVX2 ON)
 endif()
 
 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
-    set(LLAMA_AVX512 OFF)
+    set(GGML_AVX512 OFF)
 else()
-    set(LLAMA_AVX512 ON)
+    set(GGML_AVX512 ON)
 endif()
diff --git a/ggml_vk_generate_shaders.py b/ggml/ggml_vk_generate_shaders.py
similarity index 100%
rename from ggml_vk_generate_shaders.py
rename to ggml/ggml_vk_generate_shaders.py
diff --git a/ggml-alloc.h b/ggml/include/ggml-alloc.h
similarity index 100%
rename from ggml-alloc.h
rename to ggml/include/ggml-alloc.h
diff --git a/ggml-backend.h b/ggml/include/ggml-backend.h
similarity index 100%
rename from ggml-backend.h
rename to ggml/include/ggml-backend.h
diff --git a/ggml-blas.h b/ggml/include/ggml-blas.h
similarity index 100%
rename from ggml-blas.h
rename to ggml/include/ggml-blas.h
diff --git a/ggml-cuda.h b/ggml/include/ggml-cuda.h
similarity index 100%
rename from ggml-cuda.h
rename to ggml/include/ggml-cuda.h
diff --git a/ggml-kompute.h b/ggml/include/ggml-kompute.h
similarity index 100%
rename from ggml-kompute.h
rename to ggml/include/ggml-kompute.h
diff --git a/ggml-metal.h b/ggml/include/ggml-metal.h
similarity index 100%
rename from ggml-metal.h
rename to ggml/include/ggml-metal.h
diff --git a/ggml-rpc.h b/ggml/include/ggml-rpc.h
similarity index 100%
rename from ggml-rpc.h
rename to ggml/include/ggml-rpc.h
diff --git a/ggml-sycl.h b/ggml/include/ggml-sycl.h
similarity index 95%
rename from ggml-sycl.h
rename to ggml/include/ggml-sycl.h
index 451938fc4..43ab1519c 100644
--- a/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@@ -8,7 +8,9 @@
 
 #include "ggml.h"
 #include "ggml-backend.h"
-#include "ggml-sycl/presets.hpp"
+
+#define GGML_SYCL_NAME "SYCL"
+#define GGML_SYCL_MAX_DEVICES 48
 
 #ifdef  __cplusplus
 extern "C" {
diff --git a/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
similarity index 100%
rename from ggml-vulkan.h
rename to ggml/include/ggml-vulkan.h
diff --git a/ggml.h b/ggml/include/ggml.h
similarity index 100%
rename from ggml.h
rename to ggml/include/ggml.h
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
new file mode 100644
index 000000000..ba341d374
--- /dev/null
+++ b/ggml/src/CMakeLists.txt
@@ -0,0 +1,1171 @@
+include(CheckCXXCompilerFlag)
+
+unset(GGML_CDEF_PUBLIC)
+
+add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
+
+# enable libstdc++ assertions for debug builds
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+endif()
+
+if (NOT MSVC)
+    if (GGML_SANITIZE_THREAD)
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (GGML_SANITIZE_ADDRESS)
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (GGML_SANITIZE_UNDEFINED)
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (GGML_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+
+    message(STATUS "Metal framework found")
+    set(GGML_HEADERS_METAL ../include/ggml-metal.h)
+    set(GGML_SOURCES_METAL ggml-metal.m)
+
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_METAL)
+    if (GGML_METAL_NDEBUG)
+        add_compile_definitions(GGML_METAL_NDEBUG)
+    endif()
+
+    # copy ggml-common.h and ggml-metal.metal to bin directory
+    configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
+    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+
+    if (GGML_METAL_EMBED_LIBRARY)
+        enable_language(ASM)
+
+        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
+
+        set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
+        set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+
+        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+
+        # merge ggml-common.h and ggml-metal.metal into a single file
+        set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+        set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
+
+        add_custom_command(
+            OUTPUT ${METALLIB_EMBED_ASM}
+            COMMAND echo "Embedding Metal library"
+            COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+            COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
+            DEPENDS ggml-metal.metal ggml-common.h
+            COMMENT "Generate assembly for embedded Metal library"
+        )
+
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
+    else()
+        if (GGML_METAL_SHADER_DEBUG)
+            # custom command to do the following:
+            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+            #
+            # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+            #       disabling fast math is needed in order to pass tests/test-backend-ops
+            # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+            # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+            #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+            set(XC_FLAGS -fno-fast-math -fno-inline -g)
+        else()
+            set(XC_FLAGS -O3)
+        endif()
+
+        # Append macOS metal versioning flags
+        if (GGML_METAL_MACOSX_VERSION_MIN)
+            message(STATUS "Adding  -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+            list   (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
+        endif()
+
+        if (GGML_METAL_STD)
+            message(STATUS "Adding  -std=${GGML_METAL_STD} flag to metal compilation")
+            list   (APPEND XC_FLAGS -std=${GGML_METAL_STD})
+        endif()
+
+        add_custom_command(
+            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+            DEPENDS ggml-metal.metal ggml-common.h
+            COMMENT "Compiling Metal kernels"
+            )
+
+        add_custom_target(
+            ggml-metal ALL
+            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            )
+    endif() # GGML_METAL_EMBED_LIBRARY
+
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        )
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+if (GGML_BLAS)
+    if (GGML_STATIC)
+        set(BLA_STATIC ON)
+    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()
+
+    set(BLA_VENDOR ${GGML_BLAS_VENDOR})
+    find_package(BLAS)
+
+    if (BLAS_FOUND)
+        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+
+        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${GGML_BLAS_VENDOR} MATCHES "Apple"))
+            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+            find_package(PkgConfig REQUIRED)
+            if (${GGML_BLAS_VENDOR} MATCHES "Generic")
+                pkg_check_modules(DepBLAS REQUIRED blas)
+            elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
+                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
+                pkg_check_modules(DepBLAS openblas64)
+                if (NOT DepBLAS_FOUND)
+                    pkg_check_modules(DepBLAS REQUIRED openblas)
+                endif()
+            elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
+                pkg_check_modules(DepBLAS REQUIRED blis)
+            elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
+                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
+            elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
+                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
+            elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+                # all Intel* libraries share the same include path
+                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+            elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
+                # this doesn't provide pkg-config
+                # suggest to assign BLAS_INCLUDE_DIRS on your own
+                if ("${NVHPC_VERSION}" STREQUAL "")
+                    message(WARNING "Better to set NVHPC_VERSION")
+                else()
+                    set(DepBLAS_FOUND ON)
+                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
+                endif()
+            endif()
+            if (DepBLAS_FOUND)
+                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
+            else()
+                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
+                " detected by pkgconfig, trying to find cblas.h from possible paths...")
+                find_path(BLAS_INCLUDE_DIRS
+                    NAMES cblas.h
+                    HINTS
+                        /usr/include
+                        /usr/local/include
+                        /usr/include/openblas
+                        /opt/homebrew/opt/openblas/include
+                        /usr/local/opt/openblas/include
+                        /usr/include/x86_64-linux-gnu/openblas/include
+                )
+            endif()
+        endif()
+
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
+        add_compile_options(${BLAS_LINKER_FLAGS})
+
+        list(APPEND GGML_CDEF_PUBLIC GGML_USE_BLAS)
+
+        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+            add_compile_definitions(GGML_BLAS_USE_MKL)
+        endif()
+
+        set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
+        set(GGML_SOURCES_BLAS ggml-blas.cpp)
+
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${BLAS_LIBRARIES})
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
+    else()
+        message(WARNING "BLAS not found, please refer to "
+        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+        " to set correct GGML_BLAS_VENDOR")
+    endif()
+endif()
+
+if (GGML_LLAMAFILE)
+    message(STATUS "Using ggml SGEMM")
+
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    set(GGML_HEADERS_LLAMAFILE sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+endif()
+
+if (GGML_CUDA)
+    cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
+
+    find_package(CUDAToolkit)
+
+    if (CUDAToolkit_FOUND)
+        message(STATUS "CUDA found")
+
+        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+            # 52 == lowest CUDA 12 standard
+            # 60 == FP16 CUDA intrinsics
+            # 61 == integer CUDA intrinsics
+            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+            if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+                set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
+            else()
+                set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
+                #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
+            endif()
+        endif()
+        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
+        enable_language(CUDA)
+
+        file(GLOB   GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
+        list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
+
+        file(GLOB   GGML_SOURCES_CUDA "ggml-cuda/*.cu")
+        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "ggml-cuda/template-instances/mmq*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
+        if (GGML_CUDA_FA_ALL_QUANTS)
+            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+        else()
+            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+            file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        endif()
+
+        list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
+
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+        add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
+        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
+        add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
+        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+
+        if (GGML_CUDA_FORCE_DMMV)
+            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+        endif()
+
+        if (GGML_CUDA_FORCE_MMQ)
+            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+        endif()
+
+        if (GGML_CUDA_FORCE_CUBLAS)
+            add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+        endif()
+
+        if (GGML_CUDA_NO_VMM)
+            add_compile_definitions(GGML_CUDA_NO_VMM)
+        endif()
+
+        if (DEFINED GGML_CUDA_DMMV_Y)
+            add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
+        endif()
+
+        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+            add_compile_definitions(GGML_CUDA_F16)
+        endif()
+
+        if (GGML_CUDA_NO_PEER_COPY)
+            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+        endif()
+
+        if (GGML_STATIC)
+            if (WIN32)
+                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            else ()
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            endif()
+        else()
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+        if (GGML_CUDA_NO_VMM)
+            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+        else()
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+        endif()
+    else()
+        message(WARNING "CUDA not found")
+    endif()
+endif()
+
+if (GGML_HIPBLAS)
+    if (NOT EXISTS $ENV{ROCM_PATH})
+        if (NOT EXISTS /opt/rocm)
+            set(ROCM_PATH /usr)
+        else()
+            set(ROCM_PATH /opt/rocm)
+        endif()
+    else()
+        set(ROCM_PATH $ENV{ROCM_PATH})
+    endif()
+
+    list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
+    list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
+
+    # CMake on Windows doesn't support the HIP language yet, so hipcc is assumed to be the C++ compiler there
+    if (WIN32)
+        set(CXX_IS_HIPCC TRUE)
+    else()
+        string(REGEX MATCH "hipcc(\\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") # "\\." reaches the regex engine as an escaped, literal dot
+    endif()
+
+    if (CXX_IS_HIPCC)
+        if (LINUX)
+            if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+                message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+            endif()
+
+            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                    " Prefer setting the HIP compiler directly. See README for details.")
+        endif()
+    else()
+        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+        if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+        endif()
+        cmake_minimum_required(VERSION 3.21)
+        enable_language(HIP)
+    endif()
+
+    find_package(hip     REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
+
+    message(STATUS "HIP and hipBLAS found")
+
+    file(GLOB   GGML_HEADERS_ROCM "ggml-cuda/*.cuh")
+    list(APPEND GGML_HEADERS_ROCM "../include/ggml-cuda.h")
+
+    file(GLOB   GGML_SOURCES_ROCM "ggml-cuda/*.cu")
+    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+    file(GLOB   SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB   SRCS "ggml-cuda/template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+    if (GGML_CUDA_FA_ALL_QUANTS)
+        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB   SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    endif()
+
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
+
+    add_compile_definitions(GGML_USE_HIPBLAS)
+    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
+
+    if (GGML_HIP_UMA)
+        add_compile_definitions(GGML_HIP_UMA)
+    endif()
+
+    if (GGML_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    if (GGML_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
+    if (CXX_IS_HIPCC)
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
+    else()
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+    endif()
+
+    if (GGML_STATIC)
+        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+    endif()
+
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
+endif()
+
+if (GGML_SYCL)
+    if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
+        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
+    endif()
+
+    if ( NOT DEFINED ENV{ONEAPI_ROOT})
+        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+    endif()
+    #todo: AOT
+
+    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
+
+    message(STATUS "SYCL found")
+
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
+
+    if (GGML_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
+    endif()
+
+    add_compile_options(-I./) #include DPCT
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+    if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+    endif()
+
+    file(GLOB   GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
+    list(APPEND GGML_HEADERS_SYCL "../include/ggml-sycl.h")
+
+    file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
+    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
+
+    if (WIN32)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+    else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
+        if (GGML_SYCL_TARGET STREQUAL "INTEL")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+        elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
+        endif()
+    endif()
+endif()
+
+if (GGML_RPC)
+    message(STATUS "RPC found")
+
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
+
+    if (WIN32)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
+if (GGML_VULKAN)
+    find_package(Vulkan)
+
+    if (Vulkan_FOUND)
+        message(STATUS "Vulkan found")
+
+        set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
+        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
+
+        list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
+
+        # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+        # Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+        if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+            add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+        endif()
+
+        if (GGML_VULKAN_CHECK_RESULTS)
+            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
+        endif()
+
+        if (GGML_VULKAN_DEBUG)
+            add_compile_definitions(GGML_VULKAN_DEBUG)
+        endif()
+
+        if (GGML_VULKAN_MEMORY_DEBUG)
+            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+        endif()
+
+        if (GGML_VULKAN_VALIDATE)
+            add_compile_definitions(GGML_VULKAN_VALIDATE)
+        endif()
+
+        if (GGML_VULKAN_RUN_TESTS)
+            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
+        endif()
+
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
+    else()
+        message(WARNING "Vulkan not found")
+    endif()
+endif()
+
+if (GGML_KOMPUTE)
+    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
+
+    find_package(Vulkan COMPONENTS glslc REQUIRED)
+    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
+
+    if (NOT glslc_executable)
+        message(FATAL_ERROR "glslc not found")
+    endif()
+
+    function(compile_shader)
+        set(options)
+        set(oneValueArgs)
+        set(multiValueArgs SOURCES)
+        cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+        foreach(source ${compile_shader_SOURCES})
+            get_filename_component(filename ${source} NAME)
+            set(spv_file ${filename}.spv)
+            add_custom_command(
+                OUTPUT ${spv_file}
+                DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
+                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
+                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
+                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
+                COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+                COMMENT "Compiling ${source} to ${spv_file}"
+                )
+
+            get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
+            set(FILE_NAME "shader${RAW_FILE_NAME}")
+            string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
+            string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+            string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
+            set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+            message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+            if(CMAKE_GENERATOR MATCHES "Visual Studio")
+                add_custom_command(
+                    OUTPUT ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                    DEPENDS ${spv_file} xxd
+                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
+                    )
+            else()
+                add_custom_command(
+                    OUTPUT ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                    DEPENDS ${spv_file} xxd
+                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
+                    )
+            endif()
+        endforeach()
+    endfunction()
+
+    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+        message(STATUS "Kompute found")
+        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
+        add_subdirectory(kompute)
+
+        # Compile our shaders
+        compile_shader(SOURCES
+            kompute-shaders/op_scale.comp
+            kompute-shaders/op_scale_8.comp
+            kompute-shaders/op_add.comp
+            kompute-shaders/op_addrow.comp
+            kompute-shaders/op_mul.comp
+            kompute-shaders/op_silu.comp
+            kompute-shaders/op_relu.comp
+            kompute-shaders/op_gelu.comp
+            kompute-shaders/op_softmax.comp
+            kompute-shaders/op_norm.comp
+            kompute-shaders/op_rmsnorm.comp
+            kompute-shaders/op_diagmask.comp
+            kompute-shaders/op_mul_mat_mat_f32.comp
+            kompute-shaders/op_mul_mat_f16.comp
+            kompute-shaders/op_mul_mat_q8_0.comp
+            kompute-shaders/op_mul_mat_q4_0.comp
+            kompute-shaders/op_mul_mat_q4_1.comp
+            kompute-shaders/op_mul_mat_q6_k.comp
+            kompute-shaders/op_getrows_f32.comp
+            kompute-shaders/op_getrows_f16.comp
+            kompute-shaders/op_getrows_q4_0.comp
+            kompute-shaders/op_getrows_q4_1.comp
+            kompute-shaders/op_getrows_q6_k.comp
+            kompute-shaders/op_rope_f16.comp
+            kompute-shaders/op_rope_f32.comp
+            kompute-shaders/op_cpy_f16_f16.comp
+            kompute-shaders/op_cpy_f16_f32.comp
+            kompute-shaders/op_cpy_f32_f16.comp
+            kompute-shaders/op_cpy_f32_f32.comp
+        )
+
+        # Create a custom target for our generated shaders
+        add_custom_target(generated_shaders DEPENDS
+            shaderop_scale.h
+            shaderop_scale_8.h
+            shaderop_add.h
+            shaderop_addrow.h
+            shaderop_mul.h
+            shaderop_silu.h
+            shaderop_relu.h
+            shaderop_gelu.h
+            shaderop_softmax.h
+            shaderop_norm.h
+            shaderop_rmsnorm.h
+            shaderop_diagmask.h
+            shaderop_mul_mat_mat_f32.h
+            shaderop_mul_mat_f16.h
+            shaderop_mul_mat_q8_0.h
+            shaderop_mul_mat_q4_0.h
+            shaderop_mul_mat_q4_1.h
+            shaderop_mul_mat_q6_k.h
+            shaderop_getrows_f32.h
+            shaderop_getrows_f16.h
+            shaderop_getrows_q4_0.h
+            shaderop_getrows_q4_1.h
+            shaderop_getrows_q6_k.h
+            shaderop_rope_f16.h
+            shaderop_rope_f32.h
+            shaderop_cpy_f16_f16.h
+            shaderop_cpy_f16_f32.h
+            shaderop_cpy_f32_f16.h
+            shaderop_cpy_f32_f32.h
+        )
+
+        # Create a custom command that depends on the generated_shaders
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+            DEPENDS generated_shaders
+            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
+        )
+
+        # Add the stamp to the main sources to ensure dependency tracking
+        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp           ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+        set(GGML_HEADERS_KOMPUTE ../include/ggml-kompute.h  ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+
+        list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
+
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     kompute)
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
+    else()
+        message(WARNING "Kompute not found")
+    endif()
+endif()
+
+if (GGML_CPU_HBM) # optional CPU high-bandwidth-memory support through the memkind library
+    find_library(memkind memkind REQUIRED)
+
+    message(STATUS "Using memkind for CPU HBM")
+
+    add_compile_definitions(GGML_USE_CPU_HBM)
+
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} memkind) # queued like the other backends: no ggml target has been created earlier in this file
+endif()
+
+function(get_flags CCID CCVER) # picks extra warning flags for compiler id CCID at version CCVER; results are returned in GF_C_FLAGS / GF_CXX_FLAGS
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang") # regex match, so ids such as "AppleClang" take this branch too
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion) # added for C only, and only at or above the versions checked above
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+            list(APPEND CXX_FLAGS -Wno-format-truncation)
+        endif()
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif() # any other compiler id leaves both lists empty
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE) # C_FLAGS / CXX_FLAGS set above are function-local; only the GF_* copies reach the caller
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
+
+if (GGML_FATAL_WARNINGS)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        list(APPEND C_FLAGS   -Werror)  # only collected here; NOTE(review): in the visible code these lists
+        list(APPEND CXX_FLAGS -Werror)  # are passed to the compiler only by the GGML_ALL_WARNINGS block below
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        add_compile_options(/WX)  # applied directly as a compile option
+    endif()
+endif()
+
+if (GGML_ALL_WARNINGS)
+    if (NOT MSVC)
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)  # shared by C and C++
+        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                  -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})  # sets GF_C_FLAGS / GF_CXX_FLAGS
+        # the generator expressions restrict each flag list to sources of the matching language
+        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+    else()
+        # todo : msvc
+        set(C_FLAGS   "")
+        set(CXX_FLAGS "")
+    endif()
+endif()
+
+set(CUDA_CXX_FLAGS "")  # host-compiler flags; handed to nvcc through -Xcompiler later in this file
+
+if (GGML_CUDA)
+    set(CUDA_FLAGS -use_fast_math)
+
+    if (GGML_FATAL_WARNINGS)
+        list(APPEND CUDA_FLAGS -Werror all-warnings)
+    endif()
+    # Identify the host compiler that nvcc drives so get_flags() can select matching warning flags.
+    if (GGML_ALL_WARNINGS AND NOT MSVC)
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+        endif()
+
+        execute_process(
+            COMMAND ${NVCC_CMD} -Xcompiler --version
+            OUTPUT_VARIABLE CUDA_CCFULLVER
+            ERROR_QUIET
+        )
+
+        if (NOT CUDA_CCFULLVER MATCHES clang)
+            set(CUDA_CCID "GNU")
+            execute_process(
+                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+                OUTPUT_VARIABLE CUDA_CCVER
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE  # drop the trailing newline of the version
+            )
+        else()
+            if (CUDA_CCFULLVER MATCHES Apple)
+                set(CUDA_CCID "AppleClang")
+            else()
+                set(CUDA_CCID "Clang")
+            endif()
+            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER "${CUDA_CCFULLVER}")
+        endif()
+
+        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+        # Quoted so that an empty version (failed probe) is still passed as an argument to get_flags().
+        get_flags("${CUDA_CCID}" "${CUDA_CCVER}")
+        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
+    endif()
+
+    if (NOT MSVC)
+        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+    endif()
+endif()
+
+if (GGML_LTO)
+    include(CheckIPOSupported)
+    check_ipo_supported(RESULT result OUTPUT output)  # result: supported or not, output: diagnostic text
+    if (NOT result)
+        message(WARNING "IPO is not supported: ${output}")
+    else()
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+    endif()
+endif()
+
+if (GGML_CCACHE)
+    find_program(GGML_CCACHE_FOUND ccache)
+
+    if (NOT GGML_CCACHE_FOUND)
+        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
+    else()
+        # TODO: should not be set globally
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set(ENV{CCACHE_SLOPPINESS} time_macros)
+        message(STATUS "ccache found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
+    endif ()
+endif()
+
+# this version of Apple ld64 is buggy; it is recognised from what "-Wl,-v" makes the toolchain print on stderr
+execute_process(
+    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
+    ERROR_VARIABLE output
+    OUTPUT_QUIET
+)
+
+if (output MATCHES "dyld-1015\\.7")  # "\\." is a literal dot; a single "\." reaches the regex as "." (any character)
+    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
+endif()
+
+# architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+#       feel free to update the Makefile for your architecture and send a pull request or issue
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)  # lower-cased copy for the comparisons below
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")  # left empty for non-MSVC builds
+endif ()
+
+# Static linking and gprof instrumentation are only configured for non-MSVC toolchains.
+if (GGML_STATIC AND NOT MSVC)
+    add_link_options(-static)
+    if (MINGW)
+        add_link_options(-static-libgcc -static-libstdc++)
+    endif()
+endif()
+
+if (GGML_GPROF AND NOT MSVC)
+    add_compile_options(-pg)
+endif()
+
+set(ARCH_FLAGS "")  # per-architecture compiler flags; applied to C and C++ sources at the end of this block
+
+if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES      AND
+     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+    message(STATUS "ARM detected")
+
+    if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+        # probe the int8 matrix multiply-accumulate intrinsic (vmmlaq_s32), not the float vmlaq_f32
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+        endif()
+        if (CMAKE_SYSTEM_PROCESSOR MATCHES "armv6")  # variable name, not ${...}: an empty value must not break the if()
+            # Raspberry Pi 1, Zero
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7")
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
+        endif()
+        if (CMAKE_SYSTEM_PROCESSOR MATCHES "armv8")
+            # Android arm64-v8a
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        endif()
+        if (GGML_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
+    endif()
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+    message(STATUS "x86 detected")
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(../cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+    endif()
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    else()
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (GGML_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (GGML_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (GGML_CUDA)
+    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})  # host compiler also receives the architecture flags
+    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
+
+    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+    endif()
+
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")  # restricted to CUDA sources
+endif()
+
+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})  # GGML_WIN_VER is not set in this part of the file
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)  # drop the value added just above before raising it
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS"    OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS"   OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)  # the same macro is used for every system listed above
+endif()
+
+# With POSIX conformance requested, the BSDs hide the non-standard alloca
+# interface, and they do not all offer a clean switch to get it back,
+# so each system gets its own visibility macro.
+# The three system names never match together, so a single if/elseif
+# chain selects exactly the same definition as three separate checks.
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+elseif (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+elseif (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+
+    if (BUILD_SHARED_LIBS)
+        # TODO: should not use this
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)  # only set when building shared libraries
+    endif()
+endif()
+
+#
+# libraries
+#
+
+# ggml: core sources plus the per-backend source/header lists (a list that was never set expands to nothing)
+
+add_library(ggml
+            ../include/ggml.h
+            ../include/ggml-alloc.h
+            ../include/ggml-backend.h
+            ggml.c
+            ggml-alloc.c
+            ggml-backend.c
+            ggml-quants.c
+            ggml-quants.h
+            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
+            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
+            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+            )
+
+if (EMSCRIPTEN)
+    set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
+endif()
+
+target_compile_definitions(ggml PUBLIC  ${GGML_CDEF_PUBLIC})
+target_include_directories(ggml PUBLIC ../include)
+target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
+target_compile_features   (ggml PRIVATE c_std_11) # don't bump
+
+target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})  # GGML_EXTRA_LIBS is collected earlier in the file
+
+find_library(MATH_LIBRARY m)  # libm is linked only when find_library locates it
+if (MATH_LIBRARY)
+    target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
+endif()
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
diff --git a/ggml-alloc.c b/ggml/src/ggml-alloc.c
similarity index 100%
rename from ggml-alloc.c
rename to ggml/src/ggml-alloc.c
diff --git a/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
similarity index 100%
rename from ggml-backend-impl.h
rename to ggml/src/ggml-backend-impl.h
diff --git a/ggml-backend.c b/ggml/src/ggml-backend.c
similarity index 100%
rename from ggml-backend.c
rename to ggml/src/ggml-backend.c
diff --git a/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
similarity index 100%
rename from ggml-blas.cpp
rename to ggml/src/ggml-blas.cpp
diff --git a/ggml-common.h b/ggml/src/ggml-common.h
similarity index 100%
rename from ggml-common.h
rename to ggml/src/ggml-common.h
diff --git a/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
similarity index 100%
rename from ggml-cuda.cu
rename to ggml/src/ggml-cuda.cu
diff --git a/ggml-cuda/acc.cu b/ggml/src/ggml-cuda/acc.cu
similarity index 100%
rename from ggml-cuda/acc.cu
rename to ggml/src/ggml-cuda/acc.cu
diff --git a/ggml-cuda/acc.cuh b/ggml/src/ggml-cuda/acc.cuh
similarity index 100%
rename from ggml-cuda/acc.cuh
rename to ggml/src/ggml-cuda/acc.cuh
diff --git a/ggml-cuda/arange.cu b/ggml/src/ggml-cuda/arange.cu
similarity index 100%
rename from ggml-cuda/arange.cu
rename to ggml/src/ggml-cuda/arange.cu
diff --git a/ggml-cuda/arange.cuh b/ggml/src/ggml-cuda/arange.cuh
similarity index 100%
rename from ggml-cuda/arange.cuh
rename to ggml/src/ggml-cuda/arange.cuh
diff --git a/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
similarity index 100%
rename from ggml-cuda/argsort.cu
rename to ggml/src/ggml-cuda/argsort.cu
diff --git a/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh
similarity index 100%
rename from ggml-cuda/argsort.cuh
rename to ggml/src/ggml-cuda/argsort.cuh
diff --git a/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
similarity index 100%
rename from ggml-cuda/binbcast.cu
rename to ggml/src/ggml-cuda/binbcast.cu
diff --git a/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh
similarity index 100%
rename from ggml-cuda/binbcast.cuh
rename to ggml/src/ggml-cuda/binbcast.cuh
diff --git a/ggml-cuda/clamp.cu b/ggml/src/ggml-cuda/clamp.cu
similarity index 100%
rename from ggml-cuda/clamp.cu
rename to ggml/src/ggml-cuda/clamp.cu
diff --git a/ggml-cuda/clamp.cuh b/ggml/src/ggml-cuda/clamp.cuh
similarity index 100%
rename from ggml-cuda/clamp.cuh
rename to ggml/src/ggml-cuda/clamp.cuh
diff --git a/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
similarity index 100%
rename from ggml-cuda/common.cuh
rename to ggml/src/ggml-cuda/common.cuh
diff --git a/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
similarity index 100%
rename from ggml-cuda/concat.cu
rename to ggml/src/ggml-cuda/concat.cu
diff --git a/ggml-cuda/concat.cuh b/ggml/src/ggml-cuda/concat.cuh
similarity index 100%
rename from ggml-cuda/concat.cuh
rename to ggml/src/ggml-cuda/concat.cuh
diff --git a/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
similarity index 100%
rename from ggml-cuda/convert.cu
rename to ggml/src/ggml-cuda/convert.cu
diff --git a/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
similarity index 100%
rename from ggml-cuda/convert.cuh
rename to ggml/src/ggml-cuda/convert.cuh
diff --git a/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
similarity index 100%
rename from ggml-cuda/cpy.cu
rename to ggml/src/ggml-cuda/cpy.cu
diff --git a/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
similarity index 100%
rename from ggml-cuda/cpy.cuh
rename to ggml/src/ggml-cuda/cpy.cuh
diff --git a/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh
similarity index 100%
rename from ggml-cuda/dequantize.cuh
rename to ggml/src/ggml-cuda/dequantize.cuh
diff --git a/ggml-cuda/diagmask.cu b/ggml/src/ggml-cuda/diagmask.cu
similarity index 100%
rename from ggml-cuda/diagmask.cu
rename to ggml/src/ggml-cuda/diagmask.cu
diff --git a/ggml-cuda/diagmask.cuh b/ggml/src/ggml-cuda/diagmask.cuh
similarity index 100%
rename from ggml-cuda/diagmask.cuh
rename to ggml/src/ggml-cuda/diagmask.cuh
diff --git a/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
similarity index 100%
rename from ggml-cuda/dmmv.cu
rename to ggml/src/ggml-cuda/dmmv.cu
diff --git a/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh
similarity index 100%
rename from ggml-cuda/dmmv.cuh
rename to ggml/src/ggml-cuda/dmmv.cuh
diff --git a/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
similarity index 99%
rename from ggml-cuda/fattn-common.cuh
rename to ggml/src/ggml-cuda/fattn-common.cuh
index 37b3b9932..bd7993595 100644
--- a/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -603,7 +603,7 @@ static void on_no_fattn_vec_case(const int D) {
     if (D == 64) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
         fprintf(stderr, "By default only f16 KV cache is supported.\n");
-        fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
+        fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
         GGML_ASSERT(false);
     } else if (D == 128) {
         fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
@@ -611,7 +611,7 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "  - K == q4_0, V == q4_0,  4.50 BPV\n");
         fprintf(stderr, "  - K == q8_0, V == q8_0,  8.50 BPV\n");
         fprintf(stderr, "  - K == f16,  V == f16,  16.00 BPV\n");
-        fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
+        fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
         GGML_ASSERT(false);
     } else {
         fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
similarity index 100%
rename from ggml-cuda/fattn-tile-f16.cu
rename to ggml/src/ggml-cuda/fattn-tile-f16.cu
diff --git a/ggml-cuda/fattn-tile-f16.cuh b/ggml/src/ggml-cuda/fattn-tile-f16.cuh
similarity index 100%
rename from ggml-cuda/fattn-tile-f16.cuh
rename to ggml/src/ggml-cuda/fattn-tile-f16.cuh
diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
similarity index 100%
rename from ggml-cuda/fattn-tile-f32.cu
rename to ggml/src/ggml-cuda/fattn-tile-f32.cu
diff --git a/ggml-cuda/fattn-tile-f32.cuh b/ggml/src/ggml-cuda/fattn-tile-f32.cuh
similarity index 100%
rename from ggml-cuda/fattn-tile-f32.cuh
rename to ggml/src/ggml-cuda/fattn-tile-f32.cuh
diff --git a/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
similarity index 100%
rename from ggml-cuda/fattn-vec-f16.cuh
rename to ggml/src/ggml-cuda/fattn-vec-f16.cuh
diff --git a/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
similarity index 100%
rename from ggml-cuda/fattn-vec-f32.cuh
rename to ggml/src/ggml-cuda/fattn-vec-f32.cuh
diff --git a/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
similarity index 100%
rename from ggml-cuda/fattn-wmma-f16.cuh
rename to ggml/src/ggml-cuda/fattn-wmma-f16.cuh
diff --git a/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
similarity index 100%
rename from ggml-cuda/fattn.cu
rename to ggml/src/ggml-cuda/fattn.cu
diff --git a/ggml-cuda/fattn.cuh b/ggml/src/ggml-cuda/fattn.cuh
similarity index 100%
rename from ggml-cuda/fattn.cuh
rename to ggml/src/ggml-cuda/fattn.cuh
diff --git a/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
similarity index 100%
rename from ggml-cuda/getrows.cu
rename to ggml/src/ggml-cuda/getrows.cu
diff --git a/ggml-cuda/getrows.cuh b/ggml/src/ggml-cuda/getrows.cuh
similarity index 100%
rename from ggml-cuda/getrows.cuh
rename to ggml/src/ggml-cuda/getrows.cuh
diff --git a/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu
similarity index 100%
rename from ggml-cuda/im2col.cu
rename to ggml/src/ggml-cuda/im2col.cu
diff --git a/ggml-cuda/im2col.cuh b/ggml/src/ggml-cuda/im2col.cuh
similarity index 100%
rename from ggml-cuda/im2col.cuh
rename to ggml/src/ggml-cuda/im2col.cuh
diff --git a/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
similarity index 100%
rename from ggml-cuda/mma.cuh
rename to ggml/src/ggml-cuda/mma.cuh
diff --git a/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
similarity index 100%
rename from ggml-cuda/mmq.cu
rename to ggml/src/ggml-cuda/mmq.cu
diff --git a/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
similarity index 100%
rename from ggml-cuda/mmq.cuh
rename to ggml/src/ggml-cuda/mmq.cuh
diff --git a/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
similarity index 100%
rename from ggml-cuda/mmvq.cu
rename to ggml/src/ggml-cuda/mmvq.cu
diff --git a/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh
similarity index 100%
rename from ggml-cuda/mmvq.cuh
rename to ggml/src/ggml-cuda/mmvq.cuh
diff --git a/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
similarity index 100%
rename from ggml-cuda/norm.cu
rename to ggml/src/ggml-cuda/norm.cu
diff --git a/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh
similarity index 100%
rename from ggml-cuda/norm.cuh
rename to ggml/src/ggml-cuda/norm.cuh
diff --git a/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
similarity index 100%
rename from ggml-cuda/pad.cu
rename to ggml/src/ggml-cuda/pad.cu
diff --git a/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
similarity index 100%
rename from ggml-cuda/pad.cuh
rename to ggml/src/ggml-cuda/pad.cuh
diff --git a/ggml-cuda/pool2d.cu b/ggml/src/ggml-cuda/pool2d.cu
similarity index 100%
rename from ggml-cuda/pool2d.cu
rename to ggml/src/ggml-cuda/pool2d.cu
diff --git a/ggml-cuda/pool2d.cuh b/ggml/src/ggml-cuda/pool2d.cuh
similarity index 100%
rename from ggml-cuda/pool2d.cuh
rename to ggml/src/ggml-cuda/pool2d.cuh
diff --git a/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
similarity index 100%
rename from ggml-cuda/quantize.cu
rename to ggml/src/ggml-cuda/quantize.cu
diff --git a/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
similarity index 100%
rename from ggml-cuda/quantize.cuh
rename to ggml/src/ggml-cuda/quantize.cuh
diff --git a/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
similarity index 100%
rename from ggml-cuda/rope.cu
rename to ggml/src/ggml-cuda/rope.cu
diff --git a/ggml-cuda/rope.cuh b/ggml/src/ggml-cuda/rope.cuh
similarity index 100%
rename from ggml-cuda/rope.cuh
rename to ggml/src/ggml-cuda/rope.cuh
diff --git a/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
similarity index 100%
rename from ggml-cuda/scale.cu
rename to ggml/src/ggml-cuda/scale.cu
diff --git a/ggml-cuda/scale.cuh b/ggml/src/ggml-cuda/scale.cuh
similarity index 100%
rename from ggml-cuda/scale.cuh
rename to ggml/src/ggml-cuda/scale.cuh
diff --git a/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
similarity index 100%
rename from ggml-cuda/softmax.cu
rename to ggml/src/ggml-cuda/softmax.cu
diff --git a/ggml-cuda/softmax.cuh b/ggml/src/ggml-cuda/softmax.cuh
similarity index 100%
rename from ggml-cuda/softmax.cuh
rename to ggml/src/ggml-cuda/softmax.cuh
diff --git a/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu
similarity index 100%
rename from ggml-cuda/sumrows.cu
rename to ggml/src/ggml-cuda/sumrows.cu
diff --git a/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh
similarity index 100%
rename from ggml-cuda/sumrows.cuh
rename to ggml/src/ggml-cuda/sumrows.cuh
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
similarity index 100%
rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
diff --git a/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
similarity index 100%
rename from ggml-cuda/template-instances/generate_cu_files.py
rename to ggml/src/ggml-cuda/template-instances/generate_cu_files.py
diff --git a/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q2_k.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q3_k.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q4_0.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q4_1.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q4_k.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q5_0.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q5_1.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q5_k.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q6_k.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu
diff --git a/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu
similarity index 100%
rename from ggml-cuda/template-instances/mmq-instance-q8_0.cu
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu
diff --git a/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu
similarity index 100%
rename from ggml-cuda/tsembd.cu
rename to ggml/src/ggml-cuda/tsembd.cu
diff --git a/ggml-cuda/tsembd.cuh b/ggml/src/ggml-cuda/tsembd.cuh
similarity index 100%
rename from ggml-cuda/tsembd.cuh
rename to ggml/src/ggml-cuda/tsembd.cuh
diff --git a/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
similarity index 100%
rename from ggml-cuda/unary.cu
rename to ggml/src/ggml-cuda/unary.cu
diff --git a/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
similarity index 100%
rename from ggml-cuda/unary.cuh
rename to ggml/src/ggml-cuda/unary.cuh
diff --git a/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu
similarity index 100%
rename from ggml-cuda/upscale.cu
rename to ggml/src/ggml-cuda/upscale.cu
diff --git a/ggml-cuda/upscale.cuh b/ggml/src/ggml-cuda/upscale.cuh
similarity index 100%
rename from ggml-cuda/upscale.cuh
rename to ggml/src/ggml-cuda/upscale.cuh
diff --git a/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
similarity index 100%
rename from ggml-cuda/vecdotq.cuh
rename to ggml/src/ggml-cuda/vecdotq.cuh
diff --git a/ggml-impl.h b/ggml/src/ggml-impl.h
similarity index 100%
rename from ggml-impl.h
rename to ggml/src/ggml-impl.h
diff --git a/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
similarity index 100%
rename from ggml-kompute.cpp
rename to ggml/src/ggml-kompute.cpp
diff --git a/ggml-metal.m b/ggml/src/ggml-metal.m
similarity index 100%
rename from ggml-metal.m
rename to ggml/src/ggml-metal.m
diff --git a/ggml-metal.metal b/ggml/src/ggml-metal.metal
similarity index 100%
rename from ggml-metal.metal
rename to ggml/src/ggml-metal.metal
diff --git a/ggml-quants.c b/ggml/src/ggml-quants.c
similarity index 100%
rename from ggml-quants.c
rename to ggml/src/ggml-quants.c
diff --git a/ggml-quants.h b/ggml/src/ggml-quants.h
similarity index 100%
rename from ggml-quants.h
rename to ggml/src/ggml-quants.h
diff --git a/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
similarity index 100%
rename from ggml-rpc.cpp
rename to ggml/src/ggml-rpc.cpp
diff --git a/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
similarity index 99%
rename from ggml-sycl.cpp
rename to ggml/src/ggml-sycl.cpp
index db045336f..4a668a2c3 100644
--- a/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -37,6 +37,7 @@
 #include "ggml-backend-impl.h"
 
 #include "ggml-sycl/backend.hpp"
+#include "ggml-sycl/presets.hpp"
 
 bool   ggml_sycl_loaded(void);
 void   ggml_sycl_free_data(struct ggml_tensor * tensor);
diff --git a/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
similarity index 100%
rename from ggml-sycl/backend.hpp
rename to ggml/src/ggml-sycl/backend.hpp
diff --git a/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
similarity index 100%
rename from ggml-sycl/common.cpp
rename to ggml/src/ggml-sycl/common.cpp
diff --git a/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
similarity index 99%
rename from ggml-sycl/common.hpp
rename to ggml/src/ggml-sycl/common.hpp
index 414c37eed..e01f91633 100644
--- a/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -17,6 +17,7 @@
 #include <iostream>
 
 #include "dpct/helper.hpp"
+#include "ggml-sycl.h"
 #include "presets.hpp"
 
 #define GGML_COMMON_DECL_SYCL
diff --git a/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp
similarity index 100%
rename from ggml-sycl/convert.cpp
rename to ggml/src/ggml-sycl/convert.cpp
diff --git a/ggml-sycl/convert.hpp b/ggml/src/ggml-sycl/convert.hpp
similarity index 100%
rename from ggml-sycl/convert.hpp
rename to ggml/src/ggml-sycl/convert.hpp
diff --git a/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp
similarity index 100%
rename from ggml-sycl/dequantize.hpp
rename to ggml/src/ggml-sycl/dequantize.hpp
diff --git a/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
similarity index 100%
rename from ggml-sycl/dmmv.cpp
rename to ggml/src/ggml-sycl/dmmv.cpp
diff --git a/ggml-sycl/dmmv.hpp b/ggml/src/ggml-sycl/dmmv.hpp
similarity index 100%
rename from ggml-sycl/dmmv.hpp
rename to ggml/src/ggml-sycl/dmmv.hpp
diff --git a/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
similarity index 100%
rename from ggml-sycl/dpct/helper.hpp
rename to ggml/src/ggml-sycl/dpct/helper.hpp
diff --git a/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
similarity index 100%
rename from ggml-sycl/mmq.cpp
rename to ggml/src/ggml-sycl/mmq.cpp
diff --git a/ggml-sycl/mmq.hpp b/ggml/src/ggml-sycl/mmq.hpp
similarity index 100%
rename from ggml-sycl/mmq.hpp
rename to ggml/src/ggml-sycl/mmq.hpp
diff --git a/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
similarity index 100%
rename from ggml-sycl/mmvq.cpp
rename to ggml/src/ggml-sycl/mmvq.cpp
diff --git a/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp
similarity index 100%
rename from ggml-sycl/mmvq.hpp
rename to ggml/src/ggml-sycl/mmvq.hpp
diff --git a/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp
similarity index 96%
rename from ggml-sycl/presets.hpp
rename to ggml/src/ggml-sycl/presets.hpp
index 5e6b61813..fe9d41770 100644
--- a/ggml-sycl/presets.hpp
+++ b/ggml/src/ggml-sycl/presets.hpp
@@ -15,8 +15,6 @@
 
 #define GGML_SYCL_MAX_STREAMS       8
 #define GGML_SYCL_MAX_BUFFERS       256
-#define GGML_SYCL_MAX_DEVICES       48
-#define GGML_SYCL_NAME "SYCL"
 
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
diff --git a/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
similarity index 100%
rename from ggml-sycl/vecdotq.hpp
rename to ggml/src/ggml-sycl/vecdotq.hpp
diff --git a/ggml-vulkan-shaders.hpp b/ggml/src/ggml-vulkan-shaders.hpp
similarity index 100%
rename from ggml-vulkan-shaders.hpp
rename to ggml/src/ggml-vulkan-shaders.hpp
diff --git a/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
similarity index 100%
rename from ggml-vulkan.cpp
rename to ggml/src/ggml-vulkan.cpp
diff --git a/ggml.c b/ggml/src/ggml.c
similarity index 100%
rename from ggml.c
rename to ggml/src/ggml.c
diff --git a/kompute b/ggml/src/kompute
similarity index 100%
rename from kompute
rename to ggml/src/kompute
diff --git a/kompute-shaders/common.comp b/ggml/src/kompute-shaders/common.comp
similarity index 100%
rename from kompute-shaders/common.comp
rename to ggml/src/kompute-shaders/common.comp
diff --git a/kompute-shaders/op_add.comp b/ggml/src/kompute-shaders/op_add.comp
similarity index 100%
rename from kompute-shaders/op_add.comp
rename to ggml/src/kompute-shaders/op_add.comp
diff --git a/kompute-shaders/op_addrow.comp b/ggml/src/kompute-shaders/op_addrow.comp
similarity index 100%
rename from kompute-shaders/op_addrow.comp
rename to ggml/src/kompute-shaders/op_addrow.comp
diff --git a/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/kompute-shaders/op_cpy_f16_f16.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f16_f16.comp
rename to ggml/src/kompute-shaders/op_cpy_f16_f16.comp
diff --git a/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/kompute-shaders/op_cpy_f16_f32.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f16_f32.comp
rename to ggml/src/kompute-shaders/op_cpy_f16_f32.comp
diff --git a/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/kompute-shaders/op_cpy_f32_f16.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f32_f16.comp
rename to ggml/src/kompute-shaders/op_cpy_f32_f16.comp
diff --git a/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/kompute-shaders/op_cpy_f32_f32.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f32_f32.comp
rename to ggml/src/kompute-shaders/op_cpy_f32_f32.comp
diff --git a/kompute-shaders/op_diagmask.comp b/ggml/src/kompute-shaders/op_diagmask.comp
similarity index 100%
rename from kompute-shaders/op_diagmask.comp
rename to ggml/src/kompute-shaders/op_diagmask.comp
diff --git a/kompute-shaders/op_gelu.comp b/ggml/src/kompute-shaders/op_gelu.comp
similarity index 100%
rename from kompute-shaders/op_gelu.comp
rename to ggml/src/kompute-shaders/op_gelu.comp
diff --git a/kompute-shaders/op_getrows.comp b/ggml/src/kompute-shaders/op_getrows.comp
similarity index 100%
rename from kompute-shaders/op_getrows.comp
rename to ggml/src/kompute-shaders/op_getrows.comp
diff --git a/kompute-shaders/op_getrows_f16.comp b/ggml/src/kompute-shaders/op_getrows_f16.comp
similarity index 100%
rename from kompute-shaders/op_getrows_f16.comp
rename to ggml/src/kompute-shaders/op_getrows_f16.comp
diff --git a/kompute-shaders/op_getrows_f32.comp b/ggml/src/kompute-shaders/op_getrows_f32.comp
similarity index 100%
rename from kompute-shaders/op_getrows_f32.comp
rename to ggml/src/kompute-shaders/op_getrows_f32.comp
diff --git a/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/kompute-shaders/op_getrows_q4_0.comp
similarity index 100%
rename from kompute-shaders/op_getrows_q4_0.comp
rename to ggml/src/kompute-shaders/op_getrows_q4_0.comp
diff --git a/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/kompute-shaders/op_getrows_q4_1.comp
similarity index 100%
rename from kompute-shaders/op_getrows_q4_1.comp
rename to ggml/src/kompute-shaders/op_getrows_q4_1.comp
diff --git a/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/kompute-shaders/op_getrows_q6_k.comp
similarity index 100%
rename from kompute-shaders/op_getrows_q6_k.comp
rename to ggml/src/kompute-shaders/op_getrows_q6_k.comp
diff --git a/kompute-shaders/op_mul.comp b/ggml/src/kompute-shaders/op_mul.comp
similarity index 100%
rename from kompute-shaders/op_mul.comp
rename to ggml/src/kompute-shaders/op_mul.comp
diff --git a/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/kompute-shaders/op_mul_mat_f16.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_f16.comp
rename to ggml/src/kompute-shaders/op_mul_mat_f16.comp
diff --git a/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_mat_f32.comp
rename to ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp
diff --git a/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_0.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q4_0.comp
rename to ggml/src/kompute-shaders/op_mul_mat_q4_0.comp
diff --git a/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_1.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q4_1.comp
rename to ggml/src/kompute-shaders/op_mul_mat_q4_1.comp
diff --git a/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/kompute-shaders/op_mul_mat_q6_k.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q6_k.comp
rename to ggml/src/kompute-shaders/op_mul_mat_q6_k.comp
diff --git a/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/kompute-shaders/op_mul_mat_q8_0.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q8_0.comp
rename to ggml/src/kompute-shaders/op_mul_mat_q8_0.comp
diff --git a/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/kompute-shaders/op_mul_mv_q_n.comp
similarity index 100%
rename from kompute-shaders/op_mul_mv_q_n.comp
rename to ggml/src/kompute-shaders/op_mul_mv_q_n.comp
diff --git a/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp
similarity index 100%
rename from kompute-shaders/op_mul_mv_q_n_pre.comp
rename to ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp
diff --git a/kompute-shaders/op_norm.comp b/ggml/src/kompute-shaders/op_norm.comp
similarity index 100%
rename from kompute-shaders/op_norm.comp
rename to ggml/src/kompute-shaders/op_norm.comp
diff --git a/kompute-shaders/op_relu.comp b/ggml/src/kompute-shaders/op_relu.comp
similarity index 100%
rename from kompute-shaders/op_relu.comp
rename to ggml/src/kompute-shaders/op_relu.comp
diff --git a/kompute-shaders/op_rmsnorm.comp b/ggml/src/kompute-shaders/op_rmsnorm.comp
similarity index 100%
rename from kompute-shaders/op_rmsnorm.comp
rename to ggml/src/kompute-shaders/op_rmsnorm.comp
diff --git a/kompute-shaders/op_rope_f16.comp b/ggml/src/kompute-shaders/op_rope_f16.comp
similarity index 100%
rename from kompute-shaders/op_rope_f16.comp
rename to ggml/src/kompute-shaders/op_rope_f16.comp
diff --git a/kompute-shaders/op_rope_f32.comp b/ggml/src/kompute-shaders/op_rope_f32.comp
similarity index 100%
rename from kompute-shaders/op_rope_f32.comp
rename to ggml/src/kompute-shaders/op_rope_f32.comp
diff --git a/kompute-shaders/op_scale.comp b/ggml/src/kompute-shaders/op_scale.comp
similarity index 100%
rename from kompute-shaders/op_scale.comp
rename to ggml/src/kompute-shaders/op_scale.comp
diff --git a/kompute-shaders/op_scale_8.comp b/ggml/src/kompute-shaders/op_scale_8.comp
similarity index 100%
rename from kompute-shaders/op_scale_8.comp
rename to ggml/src/kompute-shaders/op_scale_8.comp
diff --git a/kompute-shaders/op_silu.comp b/ggml/src/kompute-shaders/op_silu.comp
similarity index 100%
rename from kompute-shaders/op_silu.comp
rename to ggml/src/kompute-shaders/op_silu.comp
diff --git a/kompute-shaders/op_softmax.comp b/ggml/src/kompute-shaders/op_softmax.comp
similarity index 100%
rename from kompute-shaders/op_softmax.comp
rename to ggml/src/kompute-shaders/op_softmax.comp
diff --git a/kompute-shaders/rope_common.comp b/ggml/src/kompute-shaders/rope_common.comp
similarity index 100%
rename from kompute-shaders/rope_common.comp
rename to ggml/src/kompute-shaders/rope_common.comp
diff --git a/sgemm.cpp b/ggml/src/sgemm.cpp
similarity index 100%
rename from sgemm.cpp
rename to ggml/src/sgemm.cpp
diff --git a/sgemm.h b/ggml/src/sgemm.h
similarity index 100%
rename from sgemm.h
rename to ggml/src/sgemm.h
diff --git a/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp
similarity index 100%
rename from vulkan-shaders/add.comp
rename to ggml/src/vulkan-shaders/add.comp
diff --git a/vulkan-shaders/argsort.comp b/ggml/src/vulkan-shaders/argsort.comp
similarity index 100%
rename from vulkan-shaders/argsort.comp
rename to ggml/src/vulkan-shaders/argsort.comp
diff --git a/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp
similarity index 100%
rename from vulkan-shaders/clamp.comp
rename to ggml/src/vulkan-shaders/clamp.comp
diff --git a/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp
similarity index 100%
rename from vulkan-shaders/copy.comp
rename to ggml/src/vulkan-shaders/copy.comp
diff --git a/vulkan-shaders/dequant_f32.comp b/ggml/src/vulkan-shaders/dequant_f32.comp
similarity index 100%
rename from vulkan-shaders/dequant_f32.comp
rename to ggml/src/vulkan-shaders/dequant_f32.comp
diff --git a/vulkan-shaders/dequant_funcs.comp b/ggml/src/vulkan-shaders/dequant_funcs.comp
similarity index 100%
rename from vulkan-shaders/dequant_funcs.comp
rename to ggml/src/vulkan-shaders/dequant_funcs.comp
diff --git a/vulkan-shaders/dequant_head.comp b/ggml/src/vulkan-shaders/dequant_head.comp
similarity index 100%
rename from vulkan-shaders/dequant_head.comp
rename to ggml/src/vulkan-shaders/dequant_head.comp
diff --git a/vulkan-shaders/dequant_q2_k.comp b/ggml/src/vulkan-shaders/dequant_q2_k.comp
similarity index 100%
rename from vulkan-shaders/dequant_q2_k.comp
rename to ggml/src/vulkan-shaders/dequant_q2_k.comp
diff --git a/vulkan-shaders/dequant_q3_k.comp b/ggml/src/vulkan-shaders/dequant_q3_k.comp
similarity index 100%
rename from vulkan-shaders/dequant_q3_k.comp
rename to ggml/src/vulkan-shaders/dequant_q3_k.comp
diff --git a/vulkan-shaders/dequant_q4_0.comp b/ggml/src/vulkan-shaders/dequant_q4_0.comp
similarity index 100%
rename from vulkan-shaders/dequant_q4_0.comp
rename to ggml/src/vulkan-shaders/dequant_q4_0.comp
diff --git a/vulkan-shaders/dequant_q4_1.comp b/ggml/src/vulkan-shaders/dequant_q4_1.comp
similarity index 100%
rename from vulkan-shaders/dequant_q4_1.comp
rename to ggml/src/vulkan-shaders/dequant_q4_1.comp
diff --git a/vulkan-shaders/dequant_q4_k.comp b/ggml/src/vulkan-shaders/dequant_q4_k.comp
similarity index 100%
rename from vulkan-shaders/dequant_q4_k.comp
rename to ggml/src/vulkan-shaders/dequant_q4_k.comp
diff --git a/vulkan-shaders/dequant_q5_0.comp b/ggml/src/vulkan-shaders/dequant_q5_0.comp
similarity index 100%
rename from vulkan-shaders/dequant_q5_0.comp
rename to ggml/src/vulkan-shaders/dequant_q5_0.comp
diff --git a/vulkan-shaders/dequant_q5_1.comp b/ggml/src/vulkan-shaders/dequant_q5_1.comp
similarity index 100%
rename from vulkan-shaders/dequant_q5_1.comp
rename to ggml/src/vulkan-shaders/dequant_q5_1.comp
diff --git a/vulkan-shaders/dequant_q5_k.comp b/ggml/src/vulkan-shaders/dequant_q5_k.comp
similarity index 100%
rename from vulkan-shaders/dequant_q5_k.comp
rename to ggml/src/vulkan-shaders/dequant_q5_k.comp
diff --git a/vulkan-shaders/dequant_q6_k.comp b/ggml/src/vulkan-shaders/dequant_q6_k.comp
similarity index 100%
rename from vulkan-shaders/dequant_q6_k.comp
rename to ggml/src/vulkan-shaders/dequant_q6_k.comp
diff --git a/vulkan-shaders/dequant_q8_0.comp b/ggml/src/vulkan-shaders/dequant_q8_0.comp
similarity index 100%
rename from vulkan-shaders/dequant_q8_0.comp
rename to ggml/src/vulkan-shaders/dequant_q8_0.comp
diff --git a/vulkan-shaders/diag_mask_inf.comp b/ggml/src/vulkan-shaders/diag_mask_inf.comp
similarity index 100%
rename from vulkan-shaders/diag_mask_inf.comp
rename to ggml/src/vulkan-shaders/diag_mask_inf.comp
diff --git a/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp
similarity index 100%
rename from vulkan-shaders/div.comp
rename to ggml/src/vulkan-shaders/div.comp
diff --git a/vulkan-shaders/gelu.comp b/ggml/src/vulkan-shaders/gelu.comp
similarity index 100%
rename from vulkan-shaders/gelu.comp
rename to ggml/src/vulkan-shaders/gelu.comp
diff --git a/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp
similarity index 100%
rename from vulkan-shaders/generic_binary_head.comp
rename to ggml/src/vulkan-shaders/generic_binary_head.comp
diff --git a/vulkan-shaders/generic_head.comp b/ggml/src/vulkan-shaders/generic_head.comp
similarity index 100%
rename from vulkan-shaders/generic_head.comp
rename to ggml/src/vulkan-shaders/generic_head.comp
diff --git a/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp
similarity index 100%
rename from vulkan-shaders/generic_unary_head.comp
rename to ggml/src/vulkan-shaders/generic_unary_head.comp
diff --git a/vulkan-shaders/get_rows.comp b/ggml/src/vulkan-shaders/get_rows.comp
similarity index 100%
rename from vulkan-shaders/get_rows.comp
rename to ggml/src/vulkan-shaders/get_rows.comp
diff --git a/vulkan-shaders/get_rows_quant.comp b/ggml/src/vulkan-shaders/get_rows_quant.comp
similarity index 100%
rename from vulkan-shaders/get_rows_quant.comp
rename to ggml/src/vulkan-shaders/get_rows_quant.comp
diff --git a/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp
similarity index 100%
rename from vulkan-shaders/mul.comp
rename to ggml/src/vulkan-shaders/mul.comp
diff --git a/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_split_k_reduce.comp
rename to ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp
diff --git a/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec.comp
diff --git a/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_base.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_base.comp
diff --git a/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_nc.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_nc.comp
diff --git a/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_p021.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_p021.comp
diff --git a/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_q2_k.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp
diff --git a/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_q3_k.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp
diff --git a/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_q4_k.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp
diff --git a/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_q5_k.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp
diff --git a/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp
similarity index 100%
rename from vulkan-shaders/mul_mat_vec_q6_k.comp
rename to ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp
diff --git a/vulkan-shaders/mul_mm.comp b/ggml/src/vulkan-shaders/mul_mm.comp
similarity index 100%
rename from vulkan-shaders/mul_mm.comp
rename to ggml/src/vulkan-shaders/mul_mm.comp
diff --git a/vulkan-shaders/norm.comp b/ggml/src/vulkan-shaders/norm.comp
similarity index 100%
rename from vulkan-shaders/norm.comp
rename to ggml/src/vulkan-shaders/norm.comp
diff --git a/vulkan-shaders/relu.comp b/ggml/src/vulkan-shaders/relu.comp
similarity index 100%
rename from vulkan-shaders/relu.comp
rename to ggml/src/vulkan-shaders/relu.comp
diff --git a/vulkan-shaders/rms_norm.comp b/ggml/src/vulkan-shaders/rms_norm.comp
similarity index 100%
rename from vulkan-shaders/rms_norm.comp
rename to ggml/src/vulkan-shaders/rms_norm.comp
diff --git a/vulkan-shaders/rope_head.comp b/ggml/src/vulkan-shaders/rope_head.comp
similarity index 100%
rename from vulkan-shaders/rope_head.comp
rename to ggml/src/vulkan-shaders/rope_head.comp
diff --git a/vulkan-shaders/rope_neox.comp b/ggml/src/vulkan-shaders/rope_neox.comp
similarity index 100%
rename from vulkan-shaders/rope_neox.comp
rename to ggml/src/vulkan-shaders/rope_neox.comp
diff --git a/vulkan-shaders/rope_norm.comp b/ggml/src/vulkan-shaders/rope_norm.comp
similarity index 100%
rename from vulkan-shaders/rope_norm.comp
rename to ggml/src/vulkan-shaders/rope_norm.comp
diff --git a/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp
similarity index 100%
rename from vulkan-shaders/scale.comp
rename to ggml/src/vulkan-shaders/scale.comp
diff --git a/vulkan-shaders/silu.comp b/ggml/src/vulkan-shaders/silu.comp
similarity index 100%
rename from vulkan-shaders/silu.comp
rename to ggml/src/vulkan-shaders/silu.comp
diff --git a/vulkan-shaders/soft_max.comp b/ggml/src/vulkan-shaders/soft_max.comp
similarity index 100%
rename from vulkan-shaders/soft_max.comp
rename to ggml/src/vulkan-shaders/soft_max.comp
diff --git a/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp
similarity index 100%
rename from vulkan-shaders/square.comp
rename to ggml/src/vulkan-shaders/square.comp
diff --git a/vulkan-shaders/sum_rows.comp b/ggml/src/vulkan-shaders/sum_rows.comp
similarity index 100%
rename from vulkan-shaders/sum_rows.comp
rename to ggml/src/vulkan-shaders/sum_rows.comp
diff --git a/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp
similarity index 100%
rename from vulkan-shaders/types.comp
rename to ggml/src/vulkan-shaders/types.comp
diff --git a/llama.h b/include/llama.h
similarity index 100%
rename from llama.h
rename to include/llama.h
diff --git a/scripts/build-info.sh b/scripts/build-info.sh
index 32682afbd..fa9e7bacd 100755
--- a/scripts/build-info.sh
+++ b/scripts/build-info.sh
@@ -8,20 +8,20 @@ build_compiler="unknown"
 build_target="unknown"
 
 if out=$(git rev-list --count HEAD); then
-  # git is broken on WSL so we need to strip extra newlines
-  build_number=$(printf '%s' "$out" | tr -d '\n')
+    # git is broken on WSL so we need to strip extra newlines
+    build_number=$(printf '%s' "$out" | tr -d '\n')
 fi
 
 if out=$(git rev-parse --short HEAD); then
-  build_commit=$(printf '%s' "$out" | tr -d '\n')
+    build_commit=$(printf '%s' "$out" | tr -d '\n')
 fi
 
 if out=$($CC --version | head -1); then
-  build_compiler=$out
+    build_compiler=$out
 fi
 
 if out=$($CC -dumpmachine); then
-  build_target=$out
+    build_target=$out
 fi
 
 echo "int LLAMA_BUILD_NUMBER = ${build_number};"
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index a45cd3962..70679f4e5 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -12,7 +12,7 @@ bench_args="${@:3}"
 
 rm -f llama-bench.sqlite > /dev/null
 
-# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
+# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
 
 git checkout $1 > /dev/null
 make clean > /dev/null
diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh
index 7b2b601a9..91946c514 100755
--- a/scripts/debug-test.sh
+++ b/scripts/debug-test.sh
@@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
 ###########################################################
 
 # Note: test-eval-callback requires -DLLAMA_CURL
-cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
+cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
 pushd "$build_dir"
 make -j || abort "Failed to compile"
 popd > /dev/null || exit 1
diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh
index 6ba499a2a..586d6ea18 100644
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp
 
 cd llama.cpp
 
-LLAMA_CUDA=1 make -j
+GGML_CUDA=1 make -j
 
 ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3  ./models/tinyllama-1b
 ln -sfn /workspace/CodeLlama-7b-hf           ./models/codellama-7b
@@ -60,7 +60,7 @@ cd /workspace/llama.cpp
 mkdir build-cublas
 cd build-cublas
 
-cmake -DLLAMA_CUDA=1 ../
+cmake -DGGML_CUDA=1 ../
 make -j
 
 if [ "$1" -eq "0" ]; then
@@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then
     # batched
     cd /workspace/llama.cpp
 
-    LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
+    GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
 
     # batched-bench
     cd /workspace/llama.cpp
 
-    LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
+    GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
 
     # parallel
     cd /workspace/llama.cpp
 
-    LLAMA_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
+    GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
 
 fi
 
@@ -204,10 +204,10 @@ fi
 #if [ "$1" -eq "7" ]; then
 #    cd /workspace/llama.cpp
 #
-#    LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
+#    GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
 #fi
 
 # more benches
-#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
-#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
+#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
 
diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh
index 199232440..802592a3e 100644
--- a/scripts/server-llm.sh
+++ b/scripts/server-llm.sh
@@ -380,7 +380,7 @@ fi
 
 if [[ "$backend" == "cuda" ]]; then
     printf "[+] Building with CUDA backend\n"
-    LLAMA_CUDA=1 make -j llama-server $log
+    GGML_CUDA=1 make -j llama-server $log
 elif [[ "$backend" == "cpu" ]]; then
     printf "[+] Building with CPU backend\n"
     make -j llama-server $log
diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 9e34dc8b9..9e654180b 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -53,7 +53,9 @@ while read c; do
     fi
 
     git format-patch -k $c~1..$c --stdout -- \
-        include/ggml/ggml*.h \
+        CMakeLists.txt \
+        src/CMakeLists.txt \
+        cmake/FindSIMD.cmake \
         src/ggml*.h \
         src/ggml*.c \
         src/ggml*.cpp \
@@ -61,6 +63,7 @@ while read c; do
         src/ggml*.metal \
         src/ggml*.cu \
         src/ggml-cuda/* \
+        include/ggml*.h \
         tests/test-opt.cpp \
         tests/test-grad0.cpp \
         tests/test-quantize-fns.cpp \
@@ -93,30 +96,36 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
 
     # replace filenames:
     #
-    # src/ggml.c                  -> ggml.c
-    # src/ggml-alloc.c            -> ggml-alloc.c
-    # src/ggml-backend-impl.h     -> ggml-backend-impl.h
-    # src/ggml-backend.c          -> ggml-backend.c
-    # src/ggml-common.h           -> ggml-common.h
-    # src/ggml-cuda/*             -> ggml-cuda/
-    # src/ggml-cuda.cu            -> ggml-cuda.cu
-    # src/ggml-cuda.h             -> ggml-cuda.h
-    # src/ggml-impl.h             -> ggml-impl.h
-    # src/ggml-kompute.cpp        -> ggml-kompute.cpp
-    # src/ggml-kompute.h          -> ggml-kompute.h
-    # src/ggml-metal.h            -> ggml-metal.h
-    # src/ggml-metal.m            -> ggml-metal.m
-    # src/ggml-quants.c           -> ggml-quants.c
-    # src/ggml-quants.h           -> ggml-quants.h
-    # src/ggml-rpc.cpp            -> ggml-rpc.cpp
-    # src/ggml-rpc.h              -> ggml-rpc.h
-    # src/ggml-sycl.cpp           -> ggml-sycl.cpp
-    # src/ggml-sycl.h             -> ggml-sycl.h
-    # src/ggml-vulkan.cpp         -> ggml-vulkan.cpp
-    # src/ggml-vulkan.h           -> ggml-vulkan.h
-    # include/ggml/ggml.h         -> ggml.h
-    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
-    # include/ggml/ggml-backend.h -> ggml-backend.h
+    # CMakeLists.txt          -> ggml/CMakeLists.txt
+    # src/CMakeLists.txt      -> ggml/src/CMakeLists.txt
+    # cmake/FindSIMD.cmake    -> ggml/cmake/FindSIMD.cmake
+    #
+    # src/ggml.c              -> ggml/src/ggml.c
+    # src/ggml-alloc.c        -> ggml/src/ggml-alloc.c
+    # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
+    # src/ggml-backend.c      -> ggml/src/ggml-backend.c
+    # src/ggml-common.h       -> ggml/src/ggml-common.h
+    # src/ggml-cuda/*         -> ggml/src/ggml-cuda/
+    # src/ggml-cuda.cu        -> ggml/src/ggml-cuda.cu
+    # src/ggml-impl.h         -> ggml/src/ggml-impl.h
+    # src/ggml-kompute.cpp    -> ggml/src/ggml-kompute.cpp
+    # src/ggml-metal.m        -> ggml/src/ggml-metal.m
+    # src/ggml-quants.c       -> ggml/src/ggml-quants.c
+    # src/ggml-quants.h       -> ggml/src/ggml-quants.h
+    # src/ggml-rpc.cpp        -> ggml/src/ggml-rpc.cpp
+    # src/ggml-sycl.cpp       -> ggml/src/ggml-sycl.cpp
+    # src/ggml-vulkan.cpp     -> ggml/src/ggml-vulkan.cpp
+    #
+    # include/ggml.h         -> ggml/include/ggml.h
+    # include/ggml-alloc.h   -> ggml/include/ggml-alloc.h
+    # include/ggml-backend.h -> ggml/include/ggml-backend.h
+    # include/ggml-blas.h    -> ggml/include/ggml-blas.h
+    # include/ggml-cuda.h    -> ggml/include/ggml-cuda.h
+    # include/ggml-kompute.h -> ggml/include/ggml-kompute.h
+    # include/ggml-metal.h   -> ggml/include/ggml-metal.h
+    # include/ggml-rpc.h     -> ggml/include/ggml-rpc.h
+    # include/ggml-sycl.h    -> ggml/include/ggml-sycl.h
+    # include/ggml-vulkan.h  -> ggml/include/ggml-vulkan.h
     #
     # tests/test-opt.cpp           -> tests/test-opt.cpp
     # tests/test-grad0.cpp         -> tests/test-grad0.cpp
@@ -124,34 +133,38 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
     # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
     #
-    # LICENSE                      -> LICENSE
-    # scripts/gen-authors.sh       -> scripts/gen-authors.sh
+    # LICENSE                -> LICENSE
+    # scripts/gen-authors.sh -> scripts/gen-authors.sh
 
     cat ggml-src.patch | sed \
-        -e 's/src\/ggml\.c/ggml.c/g' \
-        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
-        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
-        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
-        -e 's/src\/ggml-common\.h/ggml-common.h/g' \
+        -e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \
+        -e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \
+        -e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \
+        -e 's/src\/ggml\.c/ggml/src/ggml.c/g' \
+        -e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \
+        -e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \
+        -e 's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \
+        -e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \
         -e 's/src\/ggml-cuda\//ggml-cuda\//g' \
-        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
-        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
-        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-        -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
-        -e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
-        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
-        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
-        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
-        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
-        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
-        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
-        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
+        -e 's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \
+        -e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \
+        -e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \
+        -e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \
+        -e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \
+        -e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \
+        -e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \
+        -e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \
+        -e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \
+        -e 's/include\/ggml\.h/ggml/include/ggml.h/g' \
+        -e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \
+        -e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \
+        -e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \
+        -e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \
+        -e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \
+        -e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \
+        -e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \
+        -e 's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \
+        -e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \
         -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \
         -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \
         -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 4843f8a4a..2f32c1ce8 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -1,34 +1,42 @@
 #!/bin/bash
 
-cp -rpv ../ggml/src/ggml.c                  ./ggml.c
-cp -rpv ../ggml/src/ggml-alloc.c            ./ggml-alloc.c
-cp -rpv ../ggml/src/ggml-backend-impl.h     ./ggml-backend-impl.h
-cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
-cp -rpv ../ggml/src/ggml-common.h           ./ggml-common.h
-cp -rpv ../ggml/src/ggml-cuda/*             ./ggml-cuda/
-cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
-cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
-cp -rpv ../ggml/src/ggml-impl.h             ./ggml-impl.h
-cp -rpv ../ggml/src/ggml-kompute.cpp        ./ggml-kompute.cpp
-cp -rpv ../ggml/src/ggml-kompute.h          ./ggml-kompute.h
-cp -rpv ../ggml/src/ggml-metal.h            ./ggml-metal.h
-cp -rpv ../ggml/src/ggml-metal.m            ./ggml-metal.m
-cp -rpv ../ggml/src/ggml-metal.metal        ./ggml-metal.metal
-cp -rpv ../ggml/src/ggml-quants.c           ./ggml-quants.c
-cp -rpv ../ggml/src/ggml-quants.h           ./ggml-quants.h
-cp -rpv ../ggml/src/ggml-rpc.cpp            ./ggml-rpc.cpp
-cp -rpv ../ggml/src/ggml-rpc.h              ./ggml-rpc.h
-cp -rpv ../ggml/src/ggml-sycl.cpp           ./ggml-sycl.cpp
-cp -rpv ../ggml/src/ggml-sycl.h             ./ggml-sycl.h
-cp -rpv ../ggml/src/ggml-vulkan.cpp         ./ggml-vulkan.cpp
-cp -rpv ../ggml/src/ggml-vulkan.h           ./ggml-vulkan.h
-cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
-cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
-cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
+cp -rpv ../ggml/CMakeLists.txt       ./ggml/CMakeLists.txt
+cp -rpv ../ggml/src/CMakeLists.txt   ./ggml/src/CMakeLists.txt
+cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 
-cp -rpv ../ggml/tests/test-opt.cpp         ./tests/test-opt.cpp
-cp -rpv ../ggml/tests/test-grad0.cpp       ./tests/test-grad0.cpp
-cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
+cp -rpv ../ggml/src/ggml.c              ./ggml/src/ggml.c
+cp -rpv ../ggml/src/ggml-alloc.c        ./ggml/src/ggml-alloc.c
+cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
+cp -rpv ../ggml/src/ggml-backend.c      ./ggml/src/ggml-backend.c
+cp -rpv ../ggml/src/ggml-common.h       ./ggml/src/ggml-common.h
+cp -rpv ../ggml/src/ggml-cuda/*         ./ggml/src/ggml-cuda/
+cp -rpv ../ggml/src/ggml-cuda.cu        ./ggml/src/ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-impl.h         ./ggml/src/ggml-impl.h
+cp -rpv ../ggml/src/ggml-kompute.cpp    ./ggml/src/ggml-kompute.cpp
+cp -rpv ../ggml/src/ggml-metal.m        ./ggml/src/ggml-metal.m
+cp -rpv ../ggml/src/ggml-metal.metal    ./ggml/src/ggml-metal.metal
+cp -rpv ../ggml/src/ggml-quants.c       ./ggml/src/ggml-quants.c
+cp -rpv ../ggml/src/ggml-quants.h       ./ggml/src/ggml-quants.h
+cp -rpv ../ggml/src/ggml-rpc.cpp        ./ggml/src/ggml-rpc.cpp
+cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml/src/ggml-sycl.cpp
+cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml/src/ggml-vulkan.cpp
 
-cp -rpv ../LICENSE                         ./LICENSE
-cp -rpv ../ggml/scripts/gen-authors.sh     ./scripts/gen-authors.sh
+cp -rpv ../ggml/include/ggml.h         ./ggml/include/ggml.h
+cp -rpv ../ggml/include/ggml-alloc.h   ./ggml/include/ggml-alloc.h
+cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
+cp -rpv ../ggml/include/ggml-blas.h    ./ggml/include/ggml-blas.h
+cp -rpv ../ggml/include/ggml-cuda.h    ./ggml/include/ggml-cuda.h
+cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
+cp -rpv ../ggml/include/ggml-metal.h   ./ggml/include/ggml-metal.h
+cp -rpv ../ggml/include/ggml-rpc.h     ./ggml/include/ggml-rpc.h
+cp -rpv ../ggml/include/ggml-sycl.h    ./ggml/include/ggml-sycl.h
+cp -rpv ../ggml/include/ggml-vulkan.h  ./ggml/include/ggml-vulkan.h
+
+cp -rpv ../ggml/tests/test-opt.cpp           ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-grad0.cpp         ./tests/test-grad0.cpp
+cp -rpv ../ggml/tests/test-quantize-fns.cpp  ./tests/test-quantize-fns.cpp
+cp -rpv ../ggml/tests/test-quantize-perf.cpp ./tests/test-quantize-perf.cpp
+cp -rpv ../ggml/tests/test-backend-ops.cpp   ./tests/test-backend-ops.cpp
+
+cp -rpv ../LICENSE                     ./LICENSE
+cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh
diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h
index a49d385a1..0361ffc38 120000
--- a/spm-headers/ggml-alloc.h
+++ b/spm-headers/ggml-alloc.h
@@ -1 +1 @@
-../ggml-alloc.h
\ No newline at end of file
+../ggml/include/ggml-alloc.h
\ No newline at end of file
diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h
index 17c2cf14f..7295f0f0d 120000
--- a/spm-headers/ggml-backend.h
+++ b/spm-headers/ggml-backend.h
@@ -1 +1 @@
-../ggml-backend.h
\ No newline at end of file
+../ggml/include/ggml-backend.h
\ No newline at end of file
diff --git a/spm-headers/ggml-metal.h b/spm-headers/ggml-metal.h
new file mode 120000
index 000000000..aefad5fa0
--- /dev/null
+++ b/spm-headers/ggml-metal.h
@@ -0,0 +1 @@
+../ggml/include/ggml-metal.h
\ No newline at end of file
diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h
index 39215298f..0bdfeacbd 120000
--- a/spm-headers/ggml.h
+++ b/spm-headers/ggml.h
@@ -1 +1 @@
-../ggml.h
\ No newline at end of file
+../ggml/include/ggml.h
\ No newline at end of file
diff --git a/spm-headers/llama.h b/spm-headers/llama.h
index 9acceb980..b31388f0d 120000
--- a/spm-headers/llama.h
+++ b/spm-headers/llama.h
@@ -1 +1 @@
-../llama.h
\ No newline at end of file
+../include/llama.h
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 000000000..ccb607e56
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,32 @@
+# TODO: should not use this
+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+
+    if (BUILD_SHARED_LIBS)
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    endif()
+endif()
+
+#
+# libraries
+#
+
+# llama
+
+add_library(llama
+            ../include/llama.h
+            llama.cpp
+            unicode.h
+            unicode.cpp
+            unicode-data.cpp
+            )
+
+target_include_directories(llama PUBLIC . ../include)
+target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+
+target_link_libraries(llama PUBLIC ggml)
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+endif()
diff --git a/llama.cpp b/src/llama.cpp
similarity index 100%
rename from llama.cpp
rename to src/llama.cpp
diff --git a/unicode-data.cpp b/src/unicode-data.cpp
similarity index 100%
rename from unicode-data.cpp
rename to src/unicode-data.cpp
diff --git a/unicode-data.h b/src/unicode-data.h
similarity index 100%
rename from unicode-data.h
rename to src/unicode-data.h
diff --git a/unicode.cpp b/src/unicode.cpp
similarity index 100%
rename from unicode.cpp
rename to src/unicode.cpp
diff --git a/unicode.h b/src/unicode.h
similarity index 100%
rename from unicode.h
rename to src/unicode.h
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 1ed74e543..f74c0db47 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1,7 +1,6 @@
 #include <ggml.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
-#include <ggml-backend-impl.h>
 
 #include <algorithm>
 #include <array>

From a95631ee97bb24861af6bdeec380270459631e8e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Jun 2024 19:26:13 +0300
Subject: [PATCH 18/50] readme : update API notes

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6ca5ba43e..99b16f6e2 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Recent API changes
 
+- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122

From 0e814dfc42b4b57ad19598d239557b6a977ca16c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Jun 2024 19:32:07 +0300
Subject: [PATCH 19/50] devops : remove clblast + LLAMA_CUDA -> GGML_CUDA
 (#8139)

ggml-ci
---
 .devops/full-cuda.Dockerfile           |  2 +-
 .devops/full-rocm.Dockerfile           |  2 +-
 .devops/llama-cli-cuda.Dockerfile      |  2 +-
 .devops/llama-cli-intel.Dockerfile     | 10 +--
 .devops/llama-cli-rocm.Dockerfile      |  2 +-
 .devops/llama-cli-vulkan.Dockerfile    |  2 +-
 .devops/llama-cpp-clblast.srpm.spec    | 84 --------------------------
 .devops/llama-cpp-cuda.srpm.spec       |  2 +-
 .devops/llama-server-cuda.Dockerfile   |  2 +-
 .devops/llama-server-intel.Dockerfile  | 10 +--
 .devops/llama-server-rocm.Dockerfile   |  2 +-
 .devops/llama-server-vulkan.Dockerfile |  2 +-
 12 files changed, 19 insertions(+), 103 deletions(-)
 delete mode 100644 .devops/llama-cpp-clblast.srpm.spec

diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index f6073f662..2a7da586a 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -27,7 +27,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
index 0314d469b..5cbd2e7a1 100644
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 
diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile
index d5ce538f6..bff946cbc 100644
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 
 RUN make -j$(nproc) llama-cli
 
diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 6789e17af..bd816f9f5 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 
-ARG LLAMA_SYCL_F16=OFF
+ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git
 
@@ -10,11 +10,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-        echo "LLAMA_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile
index 7e8a6f0fa..caa507b08 100644
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 
diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile
index 7a0abe71f..6155d5881 100644
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 && \
+RUN cmake -B build -DGGML_VULKAN=1 && \
     cmake --build build --config Release --target llama-cli
 
 # Clean up
diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec
deleted file mode 100644
index 013952191..000000000
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ /dev/null
@@ -1,84 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp-clblast
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        OpenCL Inference of LLaMA model in C/C++
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
-Requires:       clblast
-URL:            https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CLBLAST=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llama-clblast-cli
-%{_bindir}/llama-clblast-server
-%{_bindir}/llama-clblast-simple
-/usr/lib/systemd/system/llamaclblast.service
-%config /etc/sysconfig/llama
-
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec
index cbdf43626..7425d3a9d 100644
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 
 %build
-make -j LLAMA_CUDA=1
+make -j GGML_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile
index 7bef07a05..d7eaa0925 100644
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index 3bf1670ec..8f8fef8c0 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 
-ARG LLAMA_SYCL_F16=OFF
+ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git libcurl4-openssl-dev
 
@@ -10,11 +10,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-        echo "LLAMA_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile
index 4b1cdc320..af96c3325 100644
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 
diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile
index 2bc2e45d3..49062f84b 100644
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
     cmake --build build --config Release --target llama-server
 
 # Clean up

From 4713bf3093d58a3e12368ab2ab5fc3630f27803e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Jun 2024 19:36:44 +0300
Subject: [PATCH 20/50] authors : regen

---
 AUTHORS | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 128 insertions(+), 1 deletion(-)

diff --git a/AUTHORS b/AUTHORS
index b029f13da..1bd36158a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,8 +1,9 @@
-# date: Tue Apr  9 09:17:14 EEST 2024
+# date: Wed Jun 26 19:36:34 EEST 2024
 # this file is auto-generated by scripts/gen-authors.sh
 
 0cc4m <picard12@live.de>
 0xspringtime <110655352+0xspringtime@users.noreply.github.com>
+20kdc <asdd2808@gmail.com>
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
@@ -11,14 +12,18 @@ AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
 Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
 Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
@@ -35,19 +40,24 @@ Ali Nehzat <ali.nehzat@thanks.dev>
 Ali Tariq <ali.tariq@10xengineers.ai>
 Alon <alonfaraj@gmail.com>
 AlpinDale <52078762+AlpinDale@users.noreply.github.com>
+Amir <amir_zia@outlook.com>
 AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
+Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andy Tai <andy-tai@users.noreply.github.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
+Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
 Atsushi Tatsuma <yoshoku@outlook.com>
@@ -57,35 +67,46 @@ BADR <contact@pythops.com>
 Bach Le <bach@bullno1.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
+Bartowski <ckealty1182@gmail.com>
 Behnam M <58621210+ibehnam@users.noreply.github.com>
+Ben Ashbaugh <ben.ashbaugh@intel.com>
 Ben Garney <bengarney@users.noreply.github.com>
 Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
+Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bingan <70050083+binganao@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brian <mofosyne@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
+Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
 CRD716 <crd716@gmail.com>
+Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
+Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
+Chao Jiang <jc19chaoj@zoho.com>
 Cheng Shao <terrorjack@type.dance>
+Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
+Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
@@ -95,8 +116,12 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
+Dave <dave-fl@users.noreply.github.com>
+Dave Airlie <airlied@gmail.com>
+Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
 David Friehs <david@friehs.info>
 David Kennedy <dakennedyd@gmail.com>
@@ -104,10 +129,13 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
+Deven Mistry <31466137+deven367@users.noreply.github.com>
 Didzis Gosko <didzis@users.noreply.github.com>
+Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
@@ -116,8 +144,11 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
+Eddie-Wang <wangjinheng1120@163.com>
 Edward Taylor <edeetee@gmail.com>
+Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
+Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Sommerlade <es0m@users.noreply.github.com>
@@ -143,37 +174,47 @@ Firat <firatkiral@gmail.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
+Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
+Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
 Gary Mulder <gjmulder@gmail.com>
+Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
+Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
+Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
+Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
+HanishKVC <hanishkvc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
+HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
+Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
 Huawei Lin <huaweilin.cs@gmail.com>
+Hugo Roussel <hugo.rous@gmail.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
@@ -190,8 +231,10 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
+James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
 James Reynolds <magnusviri@users.noreply.github.com>
 Jan Boon <jan.boon@kaetemi.be>
 Jan Boon <kaetemi@gmail.com>
@@ -205,12 +248,17 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
+Jiří Sejkora <Sejseloid@gmail.com>
+Joan Fontanals <jfontanalsmartinez@gmail.com>
+Joan Fontanals <joan.fontanals.martinez@jina.ai>
+Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
 John <78893154+cmp-nct@users.noreply.github.com>
@@ -221,15 +269,19 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
 Jorge A <161275481+jorgealias@users.noreply.github.com>
 Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
 Joseph Stahl <1269177+josephst@users.noreply.github.com>
+Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
 Justin Suess <justin.suess@westpoint.edu>
+Justina Cho <justcho5@gmail.com>
 Justine Tunney <jtunney@gmail.com>
+Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
@@ -242,6 +294,7 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
+Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
@@ -257,6 +310,7 @@ Laura <Tijntje_7@msn.com>
 Lee <44310445+lx200916@users.noreply.github.com>
 Lee Drake <b.lee.drake@gmail.com>
 Leng Yue <lengyue@lengyue.me>
+Leon Knauer <git@leonknauer.com>
 LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
@@ -265,20 +319,26 @@ LoganDark <github@logandark.mozmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
+Lyle Dean <dean@lyle.dev>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
+Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
+Markus Tavenrath <mtavenrath@users.noreply.github.com>
+Martin Delille <martin@delille.org>
 Martin Krasser <krasserm@googlemail.com>
 Martin Schwaighofer <mschwaig@users.noreply.github.com>
 Marvin Gießing <marvin.giessing@gmail.com>
+Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
+MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
@@ -287,8 +347,11 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
+Max Krasnyansky <max.krasnyansky@gmail.com>
+Max Krasnyansky <quic_maxk@quicinc.com>
 Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
@@ -300,32 +363,41 @@ Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
 Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
+Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
+Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
+Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
+Nathan Epstein <nate2@umbc.edu>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
+Neo Zhang <14088817+arthw@users.noreply.github.com>
+Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
+Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
+Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
 Pavol Rusnak <pavol@rusnak.io>
 Pedro Cuenca <pedro@huggingface.co>
@@ -343,9 +415,14 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
 Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
+Raj Hammeer Singh Hada <hammeerraj@gmail.com>
+Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Reinforce-II <fate@eastal.com>
+Ren Xuancheng <jklj077@users.noreply.github.com>
+Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
+RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
@@ -373,6 +450,7 @@ Rowan Hart <rowanbhart@gmail.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
+Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
@@ -386,6 +464,7 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
 Sergio López <slp@sinrega.org>
+Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
@@ -394,6 +473,7 @@ Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
+Shuichi Tsutsumi <shuichi0526@gmail.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
@@ -405,11 +485,14 @@ Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
 Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
 Spencer Sutton <spencersutton@users.noreply.github.com>
+Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
 Srinivas Billa <nivibilla@gmail.com>
 Stefan Sydow <stefan@sydow.email>
+Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
 Steve Grubb <ausearch.1@gmail.com>
+Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
@@ -434,16 +517,19 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
+Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
@@ -455,7 +541,9 @@ Weird Constructor <weirdconstructor@gmail.com>
 Welby Seely <welbyseely@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
+William Tambellini <william.tambellini@gmail.com>
 Willy Tarreau <w@1wt.eu>
+Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
 Wu Jian Ping <wujjpp@hotmail.com>
 Wu Jian Ping <wujp@greatld.com>
 Xiake Sun <xiake.sun@intel.com>
@@ -466,6 +554,8 @@ Xiaoyi Chen <cxychina@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
 Xuan Son Nguyen <thichthat@gmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
+Yaroslav <yaroslav.yashin@me.com>
+Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
@@ -477,6 +567,7 @@ Zane Shannon <z@zcs.me>
 Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
+Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -484,14 +575,18 @@ Zsapi <martin1.zsapka@gmail.com>
 a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
 adel boussaken <netdur@gmail.com>
 afrideva <95653597+afrideva@users.noreply.github.com>
+agray3 <agray3@users.noreply.github.com>
 akawrykow <142945436+akawrykow@users.noreply.github.com>
 alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
+alwqx <kenan3015@gmail.com>
+amd-lalithnc <lalithnc@amd.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
 apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
+arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
@@ -514,13 +609,17 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
 coezbek <c.oezbek@gmail.com>
 comex <comexk@gmail.com>
 compilade <113953597+compilade@users.noreply.github.com>
+compilade <git@compilade.net>
+cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
 david raistrick <keen99@users.noreply.github.com>
+ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
 divinity76 <divinity76@gmail.com>
+dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
 drbh <david.richard.holtz@gmail.com>
 ds5t5 <145942675+ds5t5@users.noreply.github.com>
@@ -529,6 +628,7 @@ eastriver <lee@eastriver.dev>
 ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
+fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
@@ -539,6 +639,7 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
+hopkins385 <98618192+hopkins385@users.noreply.github.com>
 howlger <eclipse@voormann.de>
 howlger <github@voormann.de>
 hutli <6594598+hutli@users.noreply.github.com>
@@ -549,14 +650,22 @@ hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
+intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
+jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
+jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
+joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
 johnson442 <56517414+johnson442@users.noreply.github.com>
+jojorne <jojorne@users.noreply.github.com>
 jon-chuang <9093549+jon-chuang@users.noreply.github.com>
 jp-x-g <jpxg-dev@protonmail.com>
+jukofyork <69222624+jukofyork@users.noreply.github.com>
+junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
 jwj7140 <32943891+jwj7140@users.noreply.github.com>
+k.h.lai <adrian.k.h.lai@outlook.com>
 kaizau <kaizau@users.noreply.github.com>
 kalomaze <66376113+kalomaze@users.noreply.github.com>
 kang <tpdns9032100@gmail.com>
@@ -575,11 +684,15 @@ ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
+liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
+loonerin <132926317+loonerin@users.noreply.github.com>
+luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
+maor-ps <154728172+maor-ps@users.noreply.github.com>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
@@ -593,15 +706,19 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
 nhamanasu <45545786+nhamanasu@users.noreply.github.com>
 niansa/tuxifan <anton-sa@web.de>
 niansa/tuxifan <tuxifan@posteo.de>
+nickp27 <nb.porter@gmail.com>
 ningshanwutuobang <ningshanwutuobang@gmail.com>
 nold <Nold360@users.noreply.github.com>
 nopperl <54780682+nopperl@users.noreply.github.com>
 nusu-github <29514220+nusu-github@users.noreply.github.com>
 olexiyb <olexiyb@gmail.com>
+omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
+pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
+pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
 qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
@@ -614,16 +731,19 @@ rhuddleston <ryan.huddleston@percona.com>
 rimoliga <53384203+rimoliga@users.noreply.github.com>
 runfuture <runfuture@users.noreply.github.com>
 sandyiscool <sandyiscool@gmail.com>
+sasha0552 <admin@sasha0552.org>
 semidark <me@semidark.net>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
 singularity <12184989+singularity-s0@users.noreply.github.com>
 sjinzh <sjinzh@gmail.com>
+sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
+strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 swittk <switt1995@gmail.com>
 takov751 <40316768+takov751@users.noreply.github.com>
 tarcey <cey.tarik@gmail.com>
@@ -636,12 +756,16 @@ uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
 valiray <133289098+valiray@users.noreply.github.com>
+vik <vikhyatk@gmail.com>
+viric <viric@viric.name>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
 wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
 whoreson <139810751+whoreson@users.noreply.github.com>
+woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
+woodx <124784234+woodx9@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
@@ -649,7 +773,10 @@ xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
 yuiseki <yuiseki@gmail.com>
 zakkor <edward.partenie@gmail.com>
+zhangkaihuo <zhangkaihuo@gmail.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
+zhouwg <zhouwg2000@gmail.com>
 zrm <trustiosity.zrm@gmail.com>
+Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
 源文雨 <41315874+fumiama@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

From f2d48fffde76d959fdb0da37316bdc09e5518eb1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Jun 2024 19:39:19 +0300
Subject: [PATCH 21/50] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index b6c57ec5e..2da33e913 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-5653a195935ea3ac54652644c9daf154dbc1571b
+5378ea0d3c2f25bcd330ecb226ad2db454be86d0

From c7ab7b612cbdce04499575e713076a026af4b9c5 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Wed, 26 Jun 2024 20:20:22 +0200
Subject: [PATCH 22/50] make : fix missing -O3 (#8143)

---
 Makefile | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 64a6e6ff0..bbfe0f12b 100644
--- a/Makefile
+++ b/Makefile
@@ -148,12 +148,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
-MK_CFLAGS     += -O3
-MK_CXXFLAGS   += -O3
-ifndef LLAMA_DEBUG
-MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
-
 # In GNU make default CXX is g++ instead of c++.  Let's fix that so that users
 # of non-gcc compilers don't have to provide g++ alias or wrapper.
 DEFCC  := cc
@@ -312,7 +306,10 @@ ifdef LLAMA_DEBUG
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
-	MK_CPPFLAGS += -DNDEBUG
+	MK_CPPFLAGS   += -DNDEBUG
+	MK_CFLAGS     += -O3
+	MK_CXXFLAGS   += -O3
+	MK_NVCCFLAGS  += -O3
 endif
 
 ifdef LLAMA_SANITIZE_THREAD

From 31ec3993f6e050322a249c07af79dbde66ea6ddc Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Wed, 26 Jun 2024 21:34:14 +0200
Subject: [PATCH 23/50] ggml : add GGML_CUDA_USE_GRAPHS option, restore
 GGML_CUDA_FORCE_CUBLAS (cmake) (#8140)

---
 CMakeLists.txt          | 1 +
 ggml/CMakeLists.txt     | 2 ++
 ggml/src/CMakeLists.txt | 5 ++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18297834e..7a7197282 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,6 +80,7 @@ set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
 set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
 set(GGML_LLAMAFILE          ON)
+set(GGML_CUDA_USE_GRAPHS    ON)
 
 # transition helpers
 function (llama_option_depr TYPE OLD NEW)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index f3763f7eb..0d0d52d57 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -109,6 +109,7 @@ option(GGML_LLAMAFILE                       "ggml: use ggml SGEMM"
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
+option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
 set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
 set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
@@ -119,6 +120,7 @@ set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
 option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
+option(GGML_CUDA_USE_GRAPHS                 "ggml: use CUDA graphs (llama.cpp only)"          OFF)
 
 option(GGML_CURL                            "ggml: use libcurl to download model from an URL" OFF)
 option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ba341d374..d0f4097d8 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -295,12 +295,15 @@ if (GGML_CUDA)
 
         list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
 
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
         add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
         add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
+        if (GGML_CUDA_USE_GRAPHS)
+            add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+        endif()
+
         if (GGML_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()

From ae5d0f4b899ff2842bfca561370c945ad8d4368b Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Wed, 26 Jun 2024 21:59:28 +0200
Subject: [PATCH 24/50] ci : publish new docker images only when the files
 change (#8142)

---
 .github/workflows/build.yml  | 4 ++--
 .github/workflows/docker.yml | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0d91fc4e4..208515287 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 01f1a4522..bf94b2024 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -14,6 +14,7 @@ on:
   push:
     branches:
       - master
+    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

From c70d117c37cc7876e775d1e2722208a50c52edb3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Jun 2024 23:25:22 +0300
Subject: [PATCH 25/50] scripts : fix filename sync

---
 scripts/sync-ggml-am.sh | 71 ++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 9e654180b..b05a33747 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -136,42 +136,41 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # LICENSE                -> LICENSE
     # scripts/gen-authors.sh -> scripts/gen-authors.sh
 
-    cat ggml-src.patch | sed \
-        -e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \
-        -e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \
-        -e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \
-        -e 's/src\/ggml\.c/ggml/src/ggml.c/g' \
-        -e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \
-        -e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \
-        -e 's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \
-        -e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \
-        -e 's/src\/ggml-cuda\//ggml-cuda\//g' \
-        -e 's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \
-        -e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \
-        -e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \
-        -e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \
-        -e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \
-        -e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \
-        -e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \
-        -e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \
-        -e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \
-        -e 's/include\/ggml\.h/ggml/include/ggml.h/g' \
-        -e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \
-        -e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \
-        -e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \
-        -e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \
-        -e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \
-        -e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \
-        -e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \
-        -e 's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \
-        -e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \
-        -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \
-        -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \
-        -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
-        -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
-        -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
-        -e 's/LICENSE/LICENSE/g' \
-        -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
+    cat ggml-src.patch | sed -E \
+        -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
+        -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \
+        -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
 

From 9b31a40c6ddabe552875b811d7127aa039ca9703 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Thu, 27 Jun 2024 01:50:09 +0200
Subject: [PATCH 26/50] clip : suppress unused variable warnings (#8105)

* clip : suppress unused variable warnings

This commit suppresses unused variable warnings for the variables e in
the catch blocks.

The motivation for this change is to suppress the warnings that are
generated on Windows when using the MSVC compiler. The warnings are
not displayed when using GCC because GCC will mark all catch parameters
as used.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! clip : suppress unused variable warnings

Remove e (/*e*/) instead of using GGML_UNUSED.

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/llava/clip.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 95fbe3d02..d6882eec3 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             if (n < 32)
                 hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
             hparams.image_grid_pinpoints[0]=0;
         }
 
         try {
             int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
             strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
             strcpy(hparams.mm_patch_merge_type, "flat");
         }
 
         try {
             hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
             hparams.image_crop_resolution = hparams.image_size;
         }
 
@@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         try {
             vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
             new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
+        } catch (const std::exception& /*e*/) {
             new_clip->has_class_embedding = false;
         }
 
@@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
             vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
             new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
             new_clip->has_pre_norm = false;
         }
 
@@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
             vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
             new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
             new_clip->has_post_norm = false;
         }
 
         try {
             vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
             new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
             new_clip->has_patch_bias = false;
         }
 
         try {
             vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
             LOG_TEE("%s: failed to load vision model tensors\n", __func__);
         }
 
@@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 // Yi-type llava
                 vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
                 vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 // missing in Yi-type llava
                 vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                 vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 // Yi-type llava
                 vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                 vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 // Yi-type llava
                 vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                 vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                 // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
             vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));

From ac146628e47451c531a3c7e62e6a973a2bb467a0 Mon Sep 17 00:00:00 2001
From: Raj Hammeer Singh Hada <hammeerraj@gmail.com>
Date: Thu, 27 Jun 2024 07:27:57 +0530
Subject: [PATCH 27/50] Fix llama-android.cpp for error - "common/common.h not
 found" (#8145)

- The include path for the common.h header in llama-android.cpp seems to be wrong. Fix the path so that the Android build doesn't fail with the error "There is no file common/common.h".
---
 examples/llama.android/llama/src/main/cpp/llama-android.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
index 874158ef0..92a6b16b1 100644
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -5,7 +5,7 @@
 #include <string>
 #include <unistd.h>
 #include "llama.h"
-#include "common/common.h"
+#include "common.h"
 
 // Write C++ code here.
 //

From 911e35bb8bb2fd1c7d3f40f27e96ff432eae7e14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Thu, 27 Jun 2024 09:46:41 +0200
Subject: [PATCH 28/50] llama : fix CodeLlama FIM token checks (#8144)

* account for space prefix character

* use find instead
---
 src/llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index f78594a6f..080057332 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5152,10 +5152,10 @@ static void llm_load_vocab(
         if (gen_name.find("code") != std::string::npos) {
             if (model.arch == LLM_ARCH_LLAMA
               && 32010 < vocab.id_to_token.size()
-              && vocab.id_to_token[32007].text == "<PRE>"
-              && vocab.id_to_token[32008].text == "<SUF>"
-              && vocab.id_to_token[32009].text == "<MID>"
-              && vocab.id_to_token[32010].text == "<EOT>") {
+              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;

From f675b20a3b7f878bf3be766b9a737e2c8321ff0d Mon Sep 17 00:00:00 2001
From: kustaaya <58045274+kustaaya@users.noreply.github.com>
Date: Thu, 27 Jun 2024 11:58:54 +0300
Subject: [PATCH 29/50] Added support for Viking pre-tokenizer (#8135)

Co-authored-by: kustaaya <kustaaya@protonmail.com>
---
 convert-hf-to-gguf-update.py | 1 +
 convert-hf-to-gguf.py        | 3 +++
 include/llama.h              | 1 +
 src/llama.cpp                | 9 +++++++++
 4 files changed, 14 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 67598b561..2758214fa 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -85,6 +85,7 @@ models = [
     {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
 ]
 
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c26fad930..5bf69ef9f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -487,6 +487,9 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
 
         if res is None:
             logger.warning("\n")
diff --git a/include/llama.h b/include/llama.h
index 88eecb0ed..cafeafb85 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -88,6 +88,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+        LLAMA_VOCAB_PRE_TYPE_VIKING         = 16,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/src/llama.cpp b/src/llama.cpp
index 080057332..b97b5e279 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5067,6 +5067,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "poro-chat") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
                     " ?[^(\\s|.,!?…。，、।۔،)]+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {

From 85a267daaa1c6f8fd69160445bcb88717031d10c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 27 Jun 2024 16:26:05 +0200
Subject: [PATCH 30/50] CUDA: fix MMQ stream-k for --split-mode row (#8167)

---
 ggml/src/ggml-cuda/mmq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 31fcbf139..1396e7a75 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const dim3 block_nums_mmq(nsm, 1, 1);
 
-    ggml_cuda_pool & pool = ctx.pool();
+    ggml_cuda_pool & pool = ctx.pool(id);
     ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
 
     if (args.ne01 % mmq_y == 0) {

From 6030c61281c8a7eb94eceb7396a608fac8b71555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Thu, 27 Jun 2024 16:27:41 +0200
Subject: [PATCH 31/50] Add Qwen2MoE 57B-A14B model identifier (#8158)

* Add Qwen2MoE 57B-A14B

* Add Qwen2MoE 57B-A14B
---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index b97b5e279..3dc0f8535 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2038,6 +2038,7 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_57B_A14B,
 };
 
 static const size_t kiB = 1024;
@@ -4267,6 +4268,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B:      return "57B.A14B";
         default:                  return "?B";
     }
 }
@@ -4588,6 +4590,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
+                    case 28: model.type = e_model::MODEL_57B_A14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;

From 387952651a8fc493f8c85ea4c9774bd4a5694f87 Mon Sep 17 00:00:00 2001
From: Raj Hammeer Singh Hada <hammeerraj@gmail.com>
Date: Thu, 27 Jun 2024 20:09:29 +0530
Subject: [PATCH 32/50] Delete examples/llama.android/llama/CMakeLists.txt
 (#8165)

* Delete examples/llama.android/llama/CMakeLists.txt

https://github.com/ggerganov/llama.cpp/pull/8145#issuecomment-2194534244

This file is not being used for building on Android. `llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt` is being used instead.

* Update CMakeLists.txt

Pick local llama.cpp files instead of fetching content from git
---
 examples/llama.android/llama/CMakeLists.txt   | 55 -------------------
 .../llama/src/main/cpp/CMakeLists.txt         | 18 +++---
 2 files changed, 11 insertions(+), 62 deletions(-)
 delete mode 100644 examples/llama.android/llama/CMakeLists.txt

diff --git a/examples/llama.android/llama/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt
deleted file mode 100644
index a5618cac0..000000000
--- a/examples/llama.android/llama/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
index 42ebaad49..2de496574 100644
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be

From 97877eb10bd8e7f8023420b5b5300bcbdadd62dc Mon Sep 17 00:00:00 2001
From: jukofyork <69222624+jukofyork@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:48:07 +0100
Subject: [PATCH 33/50] Control vector loading fixes (#8137)

* Fixed leak in llama_control_vector_load_one() and allow llama_control_vector_load() to grow

* refactored `llama_control_vector_load_one()`

* allow multiple directions for same layer in same file

* llama_control_vector_load_one() and llama_control_vector_load() now break on error

* removed unnecessary ggml_free() call
---
 common/common.cpp | 186 +++++++++++++++++++---------------------------
 1 file changed, 76 insertions(+), 110 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c76d0e2c3..70349ad70 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2804,125 +2804,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };
 
-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }
 
-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
 
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
+        int layer_idx = -1;
 
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
             }
         }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
+        }
+
     }
 
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
 
@@ -2933,16 +2895,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
         auto cur = llama_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
         }
-        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
         }
 
         if (result.n_embd == -1) {
             result = std::move(cur);
         } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f);  // extend if necessary
             for (size_t i = 0; i < cur.data.size(); i++) {
                 result.data[i] += cur.data[i];
             }
@@ -2950,7 +2915,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     }
 
     if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no vectors passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
     }
 
     return result;

From ab3679112d4c49a215a3d31550a7720b202e9015 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 27 Jun 2024 18:37:29 +0300
Subject: [PATCH 34/50] flake.lock: Update (#8071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/e9ee548d90ff586a6471b4ae80ae9cfcbceb3420?narHash=sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY%3D' (2024-06-13)
  → 'github:NixOS/nixpkgs/d603719ec6e294f034936c0d0dc06f689d91b6c3?narHash=sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw%2B0Bwe5DLU%3D' (2024-06-20)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Philip Taron <philip.taron@gmail.com>
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 5278fb68a..79bb3f63f 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1718318537,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "lastModified": 1718895438,
+        "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
         "type": "github"
       },
       "original": {

From 16791b8f0b4526aafbf5d0e5bbbd2e99c2253418 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 27 Jun 2024 18:14:19 +0200
Subject: [PATCH 35/50] Add chatml fallback for cpp `llama_chat_apply_template`
 (#8160)

* add chatml fallback for cpp `llama_chat_apply_template`

* remove redundant code
---
 common/common.cpp | 19 ++++++++++++++++++-
 common/common.h   |  2 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 70349ad70..57d03a578 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
     std::string formatted_chat(buf.data(), res);
diff --git a/common/common.h b/common/common.h
index c541204f6..0486ba380 100644
--- a/common/common.h
+++ b/common/common.h
@@ -380,6 +380,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,

From 8172ee9da9921ca53d698c7438c2d792b3f3f2c8 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 27 Jun 2024 20:04:39 +0200
Subject: [PATCH 36/50] cmake : fix deprecated option names not working (#8171)

* cmake : fix deprecated option names not working

* remove LLAMA_OPENMP
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a7197282..dba083089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS    ON)
 function (llama_option_depr TYPE OLD NEW)
     if (${OLD})
         message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-        set(${NEW} ON)
+        set(${NEW} ON PARENT_SCOPE)
     endif()
 endfunction()
 
@@ -96,7 +96,6 @@ llama_option_depr(WARNING     LLAMA_KOMPUTE             GGML_KOMPUTE)
 llama_option_depr(WARNING     LLAMA_METAL               GGML_METAL)
 llama_option_depr(WARNING     LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING     LLAMA_NATIVE              GGML_NATIVE)
-llama_option_depr(WARNING     LLAMA_OPENMP              GGML_OPENMP)
 llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
 llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)

From 558f44bf83d78242d4e5c4ab98d0be9125cb9780 Mon Sep 17 00:00:00 2001
From: loonerin <132926317+loonerin@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:01:23 -0400
Subject: [PATCH 37/50] CI: fix release build (Ubuntu+Mac) (#8170)

* CI: fix release build (Ubuntu)

PR #8006 changes defaults to build shared libs. However, CI for releases
expects static builds.

* CI: fix release build (Mac)

---------

Co-authored-by: loonerin <loonerin@users.noreply.github.com>
---
 .github/workflows/build.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 208515287..adf67cecc 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
 
       - name: Test

From cb0b06a8a613f7a2ccb7253b2a3c00fdd397ba1c Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@users.noreply.github.com>
Date: Thu, 27 Jun 2024 22:08:42 +0100
Subject: [PATCH 38/50] `json`: update grammars/README w/ examples & note about
 additionalProperties (#8132)

* json: update grammars/README

* mention broken prefixItems

* add mention to llama-gbnf-validator

* json: explicit type: object for nested items object in cli example
---
 grammars/README.md | 245 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 235 insertions(+), 10 deletions(-)

diff --git a/grammars/README.md b/grammars/README.md
index 2f685eb6d..40f666240 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -126,19 +126,244 @@ You can use GBNF grammars:
     - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
     - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
 
-Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
 
-Here is also a non-exhaustive list of **unsupported** features:
+```bash
+llama-cli \
+  -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
+  -hff Phi-3-medium-128k-instruct-Q8_0.gguf \
+  -j '{
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 100
+            },
+            "age": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 150
+            }
+        },
+        "required": ["name", "age"],
+        "additionalProperties": false
+    },
+    "minItems": 10,
+    "maxItems": 100
+  }' \
+  -p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
+```
 
-- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
-- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
-    - `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
-- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
-- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
-- `string` formats `uri`, `email`
+<details>
+
+<summary>Show grammar</summary>
+
+You can convert any schema in command-line with:
+
+```bash
+examples/json_schema_to_grammar.py name-age-schema.json
+```
+
+```
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+item ::= "{" space item-name-kv "," space item-age-kv "}" space
+item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
+item-age-kv ::= "\"age\"" space ":" space item-age
+item-name ::= "\"" char{1,100} "\"" space
+item-name-kv ::= "\"name\"" space ":" space item-name
+root ::= "[" space item ("," space item){9,99} "]" space
+space ::= | " " | "\n" [ \t]{0,20}
+```
+
+</details>
+
+Here is also a list of known limitations (contributions welcome):
+
+- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
+- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
+- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
+- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
+- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
+- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
+- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
+- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
+- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):
+
+- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
 - [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
-- `uniqueItems`
 - `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
 - [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
 - [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
-- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+### A word about additionalProperties
+
+> [!WARNING]
+> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also because it produces a slower grammar).
+> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`).
+
+If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class:
+
+```python
+# pip install pydantic
+import json
+from typing import Annotated, List
+from pydantic import BaseModel, Extra, Field
+class QAPair(BaseModel):
+    class Config:
+        extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
+    question: str
+    concise_answer: str
+    justification: str
+
+class Summary(BaseModel):
+    class Config:
+        extra = 'forbid'
+    key_facts: List[Annotated[str, Field(pattern='^- .{5,}$')]]
+    question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
+
+print(json.dumps(Summary.model_json_schema(), indent=2))
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "$defs": {
+    "QAPair": {
+      "additionalProperties": false,
+      "properties": {
+        "question": {
+          "title": "Question",
+          "type": "string"
+        },
+        "concise_answer": {
+          "title": "Concise Answer",
+          "type": "string"
+        },
+        "justification": {
+          "title": "Justification",
+          "type": "string"
+        }
+      },
+      "required": [
+        "question",
+        "concise_answer",
+        "justification"
+      ],
+      "title": "QAPair",
+      "type": "object"
+    }
+  },
+  "additionalProperties": false,
+  "properties": {
+    "key_facts": {
+      "items": {
+        "pattern": "^- .{5,}$",
+        "type": "string"
+      },
+      "title": "Key Facts",
+      "type": "array"
+    },
+    "question_answers": {
+      "items": {
+        "items": {
+          "$ref": "#/$defs/QAPair"
+        },
+        "minItems": 5,
+        "type": "array"
+      },
+      "title": "Question Answers",
+      "type": "array"
+    }
+  },
+  "required": [
+    "key_facts",
+    "question_answers"
+  ],
+  "title": "Summary",
+  "type": "object"
+}
+```
+
+```
+QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space
+QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
+QAPair-justification-kv ::= "\"justification\"" space ":" space string
+QAPair-question-kv ::= "\"question\"" space ":" space string
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+dot ::= [^\x0A\x0D]
+key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
+key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
+key-facts-item-1 ::= dot
+key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
+question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
+question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
+question-answers-item-item ::= QAPair
+question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
+root ::= "{" space key-facts-kv "," space question-answers-kv "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+```
+
+</details>
+
+If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`.
+
+Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called).
+
+```js
+import { z } from 'zod';
+import { zodToJsonSchema } from 'zod-to-json-schema';
+
+const Foo = z.object({
+  age: z.number().positive(),
+  email: z.string().email(),
+}).strict();
+
+console.log(zodToJsonSchema(Foo));
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "type": "object",
+  "properties": {
+    "age": {
+      "type": "number",
+      "exclusiveMinimum": 0
+    },
+    "email": {
+      "type": "string",
+      "format": "email"
+    }
+  },
+  "required": [
+    "age",
+    "email"
+  ],
+  "additionalProperties": false,
+  "$schema": "http://json-schema.org/draft-07/schema#"
+}
+```
+
+```
+age-kv ::= "\"age\"" space ":" space number
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
+email-kv ::= "\"email\"" space ":" space string
+integral-part ::= [0] | [1-9] [0-9]{0,15}
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+root ::= "{" space age-kv "," space email-kv "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+```
+
+</details>

From a27aa50ab7e07fe46aae619076b6e31d5663e914 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 28 Jun 2024 02:19:11 +0200
Subject: [PATCH 39/50] Add missing items in makefile (#8177)

---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index bbfe0f12b..8ae4f1dc4 100644
--- a/Makefile
+++ b/Makefile
@@ -45,6 +45,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-autorelease \
 	tests/test-backend-ops \
+	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grad0 \
 	tests/test-grammar-integration \
@@ -1070,6 +1071,7 @@ clean:
 	rm -rvf src/*.o
 	rm -rvf tests/*.o
 	rm -rvf examples/*.o
+	rm -rvf common/*.o
 	rm -rvf *.a
 	rm -rvf *.dll
 	rm -rvf *.so

From e57dc62057d41211ac018056c19c02cd544694df Mon Sep 17 00:00:00 2001
From: pculliton <phillipculliton@gmail.com>
Date: Fri, 28 Jun 2024 00:00:43 -0400
Subject: [PATCH 40/50] llama: Add support for Gemma2ForCausalLM (#8156)

* Inference support for Gemma 2 model family

* Update convert-hf-to-gguf.py, constants, and tensor mappings

* cleanup

* format fix

* Fix special token vocab bug

* Don't add space prefix

* fix deleted lines

* Update src/llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* Add model type names

* Add control vector

* Fix model type identification

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
---
 convert-hf-to-gguf.py          |  40 +++++++
 gguf-py/gguf/constants.py      |  23 ++++
 gguf-py/gguf/tensor_mapping.py |  14 +++
 src/llama.cpp                  | 198 ++++++++++++++++++++++++++++++++-
 4 files changed, 274 insertions(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5bf69ef9f..5bcc849db 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2340,6 +2340,46 @@ class GemmaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("Gemma2ForCausalLM")
+class Gemma2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA2
+
+    def set_vocab(self):
+        self._set_vocab_llama_hf()
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 222a2d137..cf3d09e70 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -150,6 +150,7 @@ class MODEL_ARCH(IntEnum):
     INTERNLM2    = auto()
     MINICPM      = auto()
     GEMMA        = auto()
+    GEMMA2       = auto()
     STARCODER2   = auto()
     MAMBA        = auto()
     XVERSE       = auto()
@@ -180,10 +181,13 @@ class MODEL_TENSOR(IntEnum):
     ATTN_NORM            = auto()
     ATTN_NORM_2          = auto()
     ATTN_OUT_NORM        = auto()
+    ATTN_POST_NORM       = auto()
     ATTN_ROT_EMBD        = auto()
     FFN_GATE_INP         = auto()
     FFN_GATE_INP_SHEXP   = auto()
     FFN_NORM             = auto()
+    FFN_PRE_NORM         = auto()
+    FFN_POST_NORM        = auto()
     FFN_GATE             = auto()
     FFN_DOWN             = auto()
     FFN_UP               = auto()
@@ -270,6 +274,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.INTERNLM2:      "internlm2",
     MODEL_ARCH.MINICPM:        "minicpm",
     MODEL_ARCH.GEMMA:          "gemma",
+    MODEL_ARCH.GEMMA2:         "gemma2",
     MODEL_ARCH.STARCODER2:     "starcoder2",
     MODEL_ARCH.MAMBA:          "mamba",
     MODEL_ARCH.XVERSE:         "xverse",
@@ -303,9 +308,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_Q_NORM:          "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM:          "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM:        "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.ATTN_POST_NORM:       "blk.{bid}.post_attention_norm",
     MODEL_TENSOR.FFN_GATE_INP:         "blk.{bid}.ffn_gate_inp",
     MODEL_TENSOR.FFN_GATE_INP_SHEXP:   "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM:             "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_PRE_NORM:         "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_POST_NORM:        "blk.{bid}.post_ffw_norm",
     MODEL_TENSOR.FFN_GATE:             "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN:             "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP:               "blk.{bid}.ffn_up",
@@ -751,6 +759,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_NORM,
     ],
+    MODEL_ARCH.GEMMA2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7b047f241..0bed43939 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -187,6 +187,10 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),
 
+        MODEL_TENSOR.ATTN_POST_NORM: (
+            "model.layers.{bid}.post_attention_layernorm",     # gemma2
+        ),
+
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq",        # llama-hf
@@ -210,6 +214,16 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",                    # Grok
         ),
 
+        # Pre feed-forward norm
+        MODEL_TENSOR.FFN_PRE_NORM: (
+            "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
+        ),
+
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_POST_NORM: (
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2
+        ),
+
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
diff --git a/src/llama.cpp b/src/llama.cpp
index 3dc0f8535..988ed4fdf 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -257,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2,       "internlm2"    },
     { LLM_ARCH_MINICPM,         "minicpm"      },
     { LLM_ARCH_GEMMA,           "gemma"        },
+    { LLM_ARCH_GEMMA2,          "gemma2"       },
     { LLM_ARCH_STARCODER2,      "starcoder2"   },
     { LLM_ARCH_MAMBA,           "mamba"        },
     { LLM_ARCH_XVERSE,          "xverse"       },
@@ -478,10 +480,12 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
@@ -1004,6 +1008,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -2039,6 +2061,8 @@ enum e_model {
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
     MODEL_57B_A14B,
+    MODEL_9B,
+    MODEL_27B,
 };
 
 static const size_t kiB = 1024;
@@ -2215,6 +2239,7 @@ struct llama_layer {
     struct ggml_tensor * attn_q_a_norm;
     struct ggml_tensor * attn_kv_a_norm;
     struct ggml_tensor * attn_sub_norm;
+    struct ggml_tensor * attn_post_norm;
     struct ggml_tensor * ffn_sub_norm;
 
     // attention
@@ -2238,6 +2263,7 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * ffn_post_norm;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
@@ -4269,6 +4295,8 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
         case MODEL_57B_A14B:      return "57B.A14B";
+        case MODEL_9B:            return "9B";
+        case MODEL_27B:           return "27B";
         default:                  return "?B";
     }
 }
@@ -4671,6 +4699,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 42: model.type = e_model::MODEL_9B; break;
+                    case 46: model.type = e_model::MODEL_27B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+               }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -6512,6 +6550,40 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                     }
                 } break;
+            case LLM_ARCH_GEMMA2:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    const int64_t n_ff          = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+                        layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_STARCODER2:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10923,6 +10995,125 @@ struct llm_build_context {
         return gf;
     }
 
+    // Gemma 2 graph. Each layer: RMS norm -> self-attention -> RMS post-norm -> residual add, then RMS norm -> GELU FFN -> RMS post-norm -> residual add.
+    struct ggml_cgraph * build_gemma2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q, K and V; RoPE is applied to Q and K only
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head,    n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); // Q is pre-scaled here; llm_build_kv below is passed 1.0f (presumably its KQ scale)
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il); // per-layer tensor: pass il (not -1), matching attn_post_norm above
+            cb(cur, "ffn_post_norm", il);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -12303,6 +12494,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma();
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                result = llm.build_gemma2();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -17597,6 +17792,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -19486,7 +19682,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {

From 139cc621e90b4f61830515c3c124cf35b3d7a6dc Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:26:45 +0100
Subject: [PATCH 41/50] `json`: restore default additionalProperties to false,
 fix some pattern escapes (#8180)

* json: expand ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS charset

* json: revert default of additionalProperties to false

* Update README.md
---
 common/json-schema-to-grammar.cpp             |  4 +-
 examples/json_schema_to_grammar.py            |  6 +--
 .../server/public/json-schema-to-grammar.mjs  |  4 +-
 grammars/README.md                            | 37 ++++++++++++------
 tests/test-grammar-integration.cpp            | 39 ++++++++++++++++++-
 tests/test-json-schema-to-grammar.cpp         | 31 ++-------------
 6 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 2f233e2e7..881eb49e3 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };
 
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 
 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -720,7 +720,7 @@ private:
             }
             prop_names.push_back(prop_name);
         }
-        if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
+        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
             std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
             std::string value_rule =
                 additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 92f6e3d47..072a230f7 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
 
 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
 
 
 class SchemaConverter:
@@ -602,7 +602,7 @@ class SchemaConverter:
                 else:
                     add_component(t, is_required=True)
 
-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
 
         elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
             items = schema.get('items') or schema['prefixItems']
@@ -691,7 +691,7 @@ class SchemaConverter:
         required_props = [k for k in sorted_props if k in required]
         optional_props = [k for k in sorted_props if k not in required]
 
-        if additional_properties != False:
+        if additional_properties is not None and additional_properties != False:
             sub_name = f'{name}{"-" if name else ""}additional'
             value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
                 self._add_primitive('value', PRIMITIVE_RULES['value'])
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index 06d76edde..7267f3f9c 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
 
 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
 
 export class SchemaConverter {
   constructor(options) {
@@ -751,7 +751,7 @@ export class SchemaConverter {
     const requiredProps = sortedProps.filter(k => required.has(k));
     const optionalProps = sortedProps.filter(k => !required.has(k));
 
-    if (additionalProperties !== false) {
+    if (additionalProperties) {
       const subName = `${name ?? ''}${name ? '-' : ''}additional`;
       const valueRule =
         additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
diff --git a/grammars/README.md b/grammars/README.md
index 40f666240..886023f77 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -182,6 +182,8 @@ space ::= | " " | "\n" [ \t]{0,20}
 
 Here is also a list of known limitations (contributions welcome):
 
+- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
+- `"additionalProperties": true` may produce keys that contain unescaped newlines.
 - Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
 - Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
 - [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
@@ -203,10 +205,11 @@ And a non-exhaustive list of other unsupported features that are unlikely to be
 ### A word about additionalProperties
 
 > [!WARNING]
-> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar).
-> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`).
+> The JSON Schema spec states that `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
+> Since this is slow and seems prone to hallucinations, we default to no additional properties.
+> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties.
 
-If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class:
+If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:
 
 ```python
 # pip install pydantic
@@ -215,14 +218,14 @@ from typing import Annotated, List
 from pydantic import BaseModel, Extra, Field
 class QAPair(BaseModel):
     class Config:
-        extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
+        extra = 'allow'  # triggers additionalProperties: true in the JSON schema
     question: str
     concise_answer: str
     justification: str
 
 class Summary(BaseModel):
     class Config:
-        extra = 'forbid'
+        extra = 'allow'
     key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
     question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
 
@@ -236,7 +239,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2))
 {
   "$defs": {
     "QAPair": {
-      "additionalProperties": false,
+      "additionalProperties": true,
       "properties": {
         "question": {
           "title": "Question",
@@ -260,7 +263,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2))
       "type": "object"
     }
   },
-  "additionalProperties": false,
+  "additionalProperties": true,
   "properties": {
     "key_facts": {
       "items": {
@@ -292,30 +295,40 @@ print(json.dumps(Summary.model_json_schema(), indent=2))
 ```
 
 ```
-QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space
+QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space
+QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space
+QAPair-additional-kv ::= QAPair-additional-k ":" space value
 QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
 QAPair-justification-kv ::= "\"justification\"" space ":" space string
 QAPair-question-kv ::= "\"question\"" space ":" space string
+additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space
+additional-kv ::= additional-k ":" space value
+array ::= "[" space ( value ("," space value)* )? "]" space
+boolean ::= ("true" | "false") space
 char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
 dot ::= [^\x0A\x0D]
+integral-part ::= [0] | [1-9] [0-9]{0,15}
 key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
 key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
 key-facts-item-1 ::= dot
 key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
+null ::= "null" space
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
 question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
 question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
 question-answers-item-item ::= QAPair
 question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
-root ::= "{" space key-facts-kv "," space question-answers-kv "}" space
+root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
 space ::= | " " | "\n" [ \t]{0,20}
 string ::= "\"" char* "\"" space
+value ::= object | array | string | number | boolean | null
 ```
 
 </details>
 
-If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`.
-
-Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called).
+If you're using [Zod](https://zod.dev/), you can explicitly allow extra properties on your objects w/ `nonstrict()` / `passthrough()` (or explicitly forbid them w/ `z.object(...).strict()` or `z.strictObject(...)`), but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway.
 
 ```js
 import { z } from 'zod';
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 0e21dc795..975658f79 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -993,6 +993,40 @@ static void test_json_schema() {
         }
     );
 
+    test_schema(
+        "simple pattern",
+        // Schema
+        R"""({
+            "pattern": "^[a-zA-Z0-9_-]*$"
+        })""",
+        // Passing strings
+        {
+            R"""("")""",
+            R"""("He_llo-12")""",
+        },
+        // Failing strings
+        {
+            R"""("!")""",
+            R"""("Hello World")""",
+        }
+    );
+
+    test_schema(
+        "pattern with escapes",
+        // Schema
+        R"""({
+            "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
+        })""",
+        // Passing strings
+        {
+            R"""("a^$.[]()|{}*+?b")""",
+        },
+        // Failing strings
+        {
+            R"""("ab")""",
+        }
+    );
+
     test_schema(
         "",
         // Schema
@@ -1062,8 +1096,6 @@ static void test_json_schema() {
             R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
             // "By extension, even an empty object is valid"
             R"""({})""",
-            // "By default, providing additional properties is valid"
-            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
         },
         // Failing strings
@@ -1074,6 +1106,9 @@ static void test_json_schema() {
             R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
             // Reorder properties
             R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Additional properties default to false for generation, even though the spec says true.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+
         }
     );
 
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 3aaa11833..720a949c7 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         R"""(
             alternative-0 ::= foo
             alternative-1 ::= bar
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            bar ::= "{" space  (bar-b-kv bar-b-rest | bar-additional-kv ( "," space bar-additional-kv )* )? "}" space
-            bar-additional-k ::= ["] ( [b] char+ | [^"b] char* )? ["] space
-            bar-additional-kv ::= bar-additional-k ":" space value
+            bar ::= "{" space  (bar-b-kv )? "}" space
             bar-b-kv ::= "\"b\"" space ":" space number
-            bar-b-rest ::= ( "," space bar-additional-kv )*
-            boolean ::= ("true" | "false") space
-            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             decimal-part ::= [0-9]{1,16}
-            foo ::= "{" space  (foo-a-kv foo-a-rest | foo-additional-kv ( "," space foo-additional-kv )* )? "}" space
+            foo ::= "{" space  (foo-a-kv )? "}" space
             foo-a-kv ::= "\"a\"" space ":" space number
-            foo-a-rest ::= ( "," space foo-additional-kv )*
-            foo-additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
-            foo-additional-kv ::= foo-additional-k ":" space value
             integral-part ::= [0] | [1-9] [0-9]{0,15}
-            null ::= "null" space
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
             root ::= alternative-0 | alternative-1
             space ::= | " " | "\n" [ \t]{0,20}
-            string ::= "\"" char* "\"" space
-            value ::= object | array | string | number | boolean | null
         )"""
     });
 
@@ -1177,25 +1164,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         })""",
         R"""(
             a-kv ::= "\"a\"" space ":" space number
-            additional-k ::= ["] ( [a] char+ | [b] char+ | [c] char+ | [d] char+ | [^"abcd] char* )? ["] space
-            additional-kv ::= additional-k ":" space value
-            array ::= "[" space ( value ("," space value)* )? "]" space
             b-kv ::= "\"b\"" space ":" space number
-            boolean ::= ("true" | "false") space
             c-kv ::= "\"c\"" space ":" space number
-            c-rest ::= ( "," space additional-kv )*
-            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
             d-kv ::= "\"d\"" space ":" space number
-            d-rest ::= ( "," space c-kv )? c-rest
+            d-rest ::= ( "," space c-kv )?
             decimal-part ::= [0-9]{1,16}
             integral-part ::= [0] | [1-9] [0-9]{0,15}
-            null ::= "null" space
             number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
-            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv c-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
+            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
             space ::= | " " | "\n" [ \t]{0,20}
-            string ::= "\"" char* "\"" space
-            value ::= object | array | string | number | boolean | null
         )"""
     });
 

From b851b3fba0a1b06a1129189bac300e07dd1648c8 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 28 Jun 2024 12:37:45 +0200
Subject: [PATCH 42/50] cmake : allow user to override default options (#8178)

---
 CMakeLists.txt | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dba083089..e3a0cc369 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,8 +79,15 @@ set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
 set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
-set(GGML_LLAMAFILE          ON)
-set(GGML_CUDA_USE_GRAPHS    ON)
+
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
+    set(GGML_CUDA_USE_GRAPHS ON)
+endif()
 
 # transition helpers
 function (llama_option_depr TYPE OLD NEW)

From 38373cfbab5397cc2ab5c3694a3dee12a9e58f45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Fri, 28 Jun 2024 12:53:43 +0200
Subject: [PATCH 43/50] Add SPM infill support (#8016)

* add --spm-infill option

* support --spm-infill

* support --spm-infill
---
 common/common.cpp          |  6 ++++++
 common/common.h            |  2 ++
 examples/infill/README.md  |  1 +
 examples/infill/infill.cpp | 24 +++++++++++++-----------
 examples/server/README.md  |  1 +
 examples/server/server.cpp | 16 +++++++++++-----
 6 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 57d03a578..6a00d25be 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1026,6 +1026,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.input_suffix = argv[i];
         return true;
     }
+    if (arg == "--spm-infill") {
+        params.spm_infill = true;
+        return true;
+    }
     if (arg == "--grammar") {
         CHECK_ARG
         sparams.grammar = argv[i];
@@ -1409,6 +1413,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", "       --in-prefix-bos",        "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", "       --in-prefix STRING",     "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", "       --in-suffix STRING",     "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "server infill",
+                                       "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
 
     options.push_back({ "sampling" });
     options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
diff --git a/common/common.h b/common/common.h
index 0486ba380..d6cb814b9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -250,6 +250,8 @@ struct gpt_params {
     std::string cvector_outfile       = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
diff --git a/examples/infill/README.md b/examples/infill/README.md
index 74f42d2fc..810a0c5e7 100644
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 -   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 ## Input Prompts
 
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 3e82e4a81..ca71dd687 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -210,6 +210,7 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
     const int space_token = 29871;
@@ -217,12 +218,13 @@ int main(int argc, char ** argv) {
         inp_sfx.erase(inp_sfx.begin());
     }
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-    }
     inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+    if (add_bos) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
     const llama_token middle_token = llama_token_middle(model);
     if (middle_token >= 0) {
@@ -526,14 +528,14 @@ int main(int argc, char ** argv) {
                     inp_sfx.erase(inp_sfx.begin());
                 }
                 inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-                if (add_bos) {
-                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-                }
                 inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-                embd_inp = inp_pfx;
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+                if (add_bos) {
+                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                }
+                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
-                const llama_token middle_token = llama_token_middle(model);
                 if (middle_token >= 0) {
                     embd_inp.push_back(middle_token);
                 }
diff --git a/examples/server/README.md b/examples/server/README.md
index e7fb0bf64..4fab006bb 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-fa`, `--flash-attn` : enable flash attention (default: disabled).
 - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
 - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
+- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ae768097b..d7fb61812 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2020,6 +2020,7 @@ struct server_context {
                         slot.t_start_generation = 0;
 
                         if (slot.infill) {
+                            const bool add_bos = llama_should_add_bos_token(model);
                             bool suff_rm_leading_spc = true;
                             if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                                 params.input_suffix.erase(0, 1);
@@ -2035,16 +2036,21 @@ struct server_context {
                             }
 
                             prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                            prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
-                            prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
+                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                            if (add_bos) {
+                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                            }
+                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
                             const llama_token middle_token = llama_token_middle(model);
                             if (middle_token >= 0) {
-                                prefix_tokens.push_back(middle_token);
+                                embd_inp.push_back(middle_token);
                             }
 
-                            prompt_tokens = prefix_tokens;
+                            prompt_tokens = embd_inp;
                         } else {
                             prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                         }

From 26a39bbd6b0bbd66118bb68569f0276d7fe7df6c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 28 Jun 2024 15:11:44 +0200
Subject: [PATCH 44/50] Add MiniCPM, Deepseek V2 chat template + clean up
 `llama_chat_apply_template_internal` (#8172)

* tmp_contains

* minicpm chat template

* add DeepSeek Lite template

* change deepseek-lite to deepseek2

* correct code comment

* correct code from master branch
---
 src/llama.cpp                | 64 ++++++++++++++++++++++++++----------
 tests/test-chat-template.cpp | 10 +++++-
 2 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 988ed4fdf..3edaa98e8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19613,7 +19613,10 @@ static int32_t llama_chat_apply_template_internal(
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
+    auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
+        return tmpl.find(haystack) != std::string::npos;
+    };
+    if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -19621,16 +19624,16 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
+        bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
         // [variant] space before + after response
-        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        bool space_around_response = tmpl_contains("' ' + eos_token");
         // [variant] add BOS inside history
-        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
         // [variant] trim spaces from the input message
-        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        bool strip_message = tmpl_contains("content.strip()");
         // construct the prompt
         bool is_inside_turn = true; // skip BOS at the beginning
         ss << "[INST] ";
@@ -19656,7 +19659,7 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+    } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
         // Phi 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -19665,7 +19668,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
+    } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -19673,7 +19676,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
+    } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -19682,7 +19685,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -19704,7 +19707,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<start_of_turn>model\n";
         }
-    } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+    } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
         // OrionStarAI/Orion-14B-Chat
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -19724,7 +19727,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "</s>";
             }
         }
-    } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+    } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
         // openchat/openchat-3.5-0106,
         for (auto message : chat) {
             std::string role(message->role);
@@ -19738,13 +19741,13 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "GPT4 Correct Assistant:";
         }
-    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
         // eachadea/vicuna-13b-1.1 (and Orca variant)
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "system") {
                 // Orca-Vicuna variant uses a system prefix
-                if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+                if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
                     ss << "SYSTEM: " << message->content << "\n";
                 } else {
                     ss << message->content << "\n\n";
@@ -19758,7 +19761,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "ASSISTANT:";
         }
-    } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+    } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
         // deepseek-ai/deepseek-coder-33b-instruct
         for (auto message : chat) {
             std::string role(message->role);
@@ -19773,7 +19776,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
-    } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+    } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
         // CohereForAI/c4ai-command-r-plus
         for (auto message : chat) {
             std::string role(message->role);
@@ -19788,7 +19791,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
-    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+    } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
         // Llama 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -19797,6 +19800,33 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
+    } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << u8"<用户>";
+                ss << trim(message->content);
+                ss << "<AI>";
+            } else {
+                ss << trim(message->content);
+            }
+        }
+    } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+        // DeepSeek-V2
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << u8"<｜end▁of▁sentence｜>";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index d19ba8633..b154038b2 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -57,7 +57,11 @@ int main(void) {
         //Phi-3-medium
         "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
         //Phi-3-vision
-        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
+        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
+        // DeepSeek-V2
+        "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -94,6 +98,10 @@ int main(void) {
         "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
         //Phi-3-vision
         "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
+        // DeepSeek-V2
+        u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;

From 8748d8ac6f172b99826ab18f01d9a3a165987d54 Mon Sep 17 00:00:00 2001
From: Olivier Chafik <ochafik@users.noreply.github.com>
Date: Fri, 28 Jun 2024 18:02:05 +0100
Subject: [PATCH 45/50] json: attempt to skip slow tests when running under
 emulator (#8189)

---
 .github/workflows/build.yml           |  1 +
 tests/test-json-schema-to-grammar.cpp | 40 +++++++++++++++------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index adf67cecc..1e344db6b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -799,6 +799,7 @@ jobs:
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
           $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
           cd build
+          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
           & $sde -future -- ctest -L main -C Release --verbose --timeout 900
 
       - name: Determine tag name
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 720a949c7..65486ac5c 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1239,26 +1239,30 @@ int main() {
         }
     });
 
-    if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
-        test_all("Python", [](const TestCase & tc) {
-            write("test-json-schema-input.tmp", tc.schema);
-            tc.verify_status(std::system(
-                "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
-            tc.verify(read("test-grammar-output.tmp"));
-        });
+    if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
+        fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
     } else {
-        fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
-    }
+        if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
+            test_all("Python", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
+        }
 
-    if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
-        test_all("JavaScript", [](const TestCase & tc) {
-            write("test-json-schema-input.tmp", tc.schema);
-            tc.verify_status(std::system(
-                "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
-            tc.verify(read("test-grammar-output.tmp"));
-        });
-    } else {
-        fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
+        if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
+            test_all("JavaScript", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
+        }
     }
 
     test_all("Check Expectations Validity", [](const TestCase & tc) {

From 72272b83a3878e91251218c981b4c6ec16c33912 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Sat, 29 Jun 2024 00:14:20 +0200
Subject: [PATCH 46/50] fix code typo in llama-cli (#8198)

---
 examples/main/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index cfaf6a6e8..1114073b8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -810,7 +810,7 @@ int main(int argc, char ** argv) {
                         is_antiprompt = true;
                     }
 
-                    chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
+                    chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                     is_interacting = true;
                     printf("\n");
                 }

From 1c5eba6f8e628fb0a98afb27d8aaeb3b0e136451 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 29 Jun 2024 20:44:08 -0700
Subject: [PATCH 47/50] llama: Add attention and final logit soft-capping,
 update scaling factor to Gemma2 (#8197)

* Add attention and final logit softcapping.

* fix

* Add custom add_ functions

* Disable flash attention for Gemma2

* Update src/llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* Add default value for attention and final logit softcap value

* Add custom kq scaling from Gemma2Attention

* Remove custom pre attention scaling and use computed value instead.

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 convert-hf-to-gguf.py       |  6 ++++++
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  6 ++++++
 src/llama.cpp               | 35 ++++++++++++++++++++++++++++++++---
 4 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5bcc849db..3ef2f69e7 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2363,6 +2363,12 @@ class Gemma2Model(Model):
         self.gguf_writer.add_key_length(hparams["head_dim"])
         self.gguf_writer.add_value_length(hparams["head_dim"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_attn_logit_softcapping(
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_final_logit_softcapping(
+            self.hparams["final_logit_softcapping"]
+        )
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unusem
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cf3d09e70..9bfa891d5 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -50,6 +50,8 @@ class Keys:
         POOLING_TYPE                      = "{arch}.pooling_type"
         LOGIT_SCALE                       = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
+        ATTN_LOGIT_SOFTCAPPING            = "{arch}.attn_logit_softcapping"
+        FINAL_LOGIT_SOFTCAPPING           = "{arch}.final_logit_softcapping"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 9869f6fe3..1aeb0d9b0 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -516,6 +516,12 @@ class GGUFWriter:
     def add_logit_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
 
+    def add_attn_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
+    def add_final_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
     def add_expert_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 3edaa98e8..2a4d73856 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -302,6 +302,8 @@ enum llm_kv {
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -392,6 +394,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                      },
     { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
     { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
+    { LLM_KV_ATTN_LOGIT_SOFTCAPPING,            "%s.attn_logit_softcapping"            },
+    { LLM_KV_FINAL_LOGIT_SOFTCAPPING,           "%s.final_logit_softcapping"           },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
@@ -2099,6 +2103,9 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
+    float f_attn_logit_softcapping = 50.0f;
+    float f_final_logit_softcapping = 30.0f;
+
     float    rope_attn_factor = 1.0f;
     float    rope_freq_base_train;
     float    rope_freq_scale_train;
@@ -2115,8 +2122,9 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;
 
-    bool causal_attn = true;
-    bool use_alibi   = false;
+    bool causal_attn   = true;
+    bool use_alibi     = false;
+    bool attn_soft_cap = false;
 
     enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
@@ -4702,6 +4710,9 @@ static void llm_load_hparams(
         case LLM_ARCH_GEMMA2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+                hparams.attn_soft_cap = true;
 
                 switch (hparams.n_layer) {
                     case 42: model.type = e_model::MODEL_9B; break;
@@ -7579,6 +7590,12 @@ static struct ggml_tensor * llm_build_kqv(
             kq = ggml_scale(ctx, kq, 30);
         }
 
+        if (hparams.attn_soft_cap) {
+            kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
+            kq = ggml_tanh(ctx, kq);
+            kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
+        }
+
         kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         cb(kq, "kq_soft_max_ext", il);
 
@@ -11039,7 +11056,7 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
 
-                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head)));
                 cb(Qcur, "Qcur_scaled", il);
 
                 Kcur = ggml_rope_ext(
@@ -11106,6 +11123,12 @@ struct llm_build_context {
 
         // lm_head
         cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        // final logit soft-capping
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -17379,6 +17402,12 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    if (params.flash_attn && model->hparams.attn_soft_cap) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
+
     if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
         LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
         params.flash_attn = false;

From 32bf2296a2652856a63bdee05c2c10c43adb2731 Mon Sep 17 00:00:00 2001
From: Aliebc <i@axgln.net>
Date: Sat, 15 Jun 2024 10:45:01 +0800
Subject: [PATCH 48/50] Add YX simple filter for llama-server

---
 .github/workflows/bench.yml                   | 310 --------
 .github/workflows/build.yml                   | 698 ------------------
 .github/workflows/close-issue.yml             |  23 -
 .github/workflows/editorconfig.yml            |  27 -
 .github/workflows/gguf-publish.yml            |  44 --
 .github/workflows/labeler.yml                 |  17 -
 .github/workflows/nix-ci-aarch64.yml          |  65 --
 .github/workflows/nix-ci.yml                  |  72 --
 .github/workflows/nix-flake-update.yml        |  22 -
 .github/workflows/nix-publish-flake.yml       |  36 -
 .../workflows/python-check-requirements.yml   |  35 -
 .github/workflows/python-lint.yml             |  23 -
 examples/server/CMakeLists.txt                |   1 +
 examples/server/stoplist.cpp                  |  10 +
 examples/server/utils.hpp                     | 111 ++-
 15 files changed, 120 insertions(+), 1374 deletions(-)
 delete mode 100644 .github/workflows/bench.yml
 delete mode 100644 .github/workflows/close-issue.yml
 delete mode 100644 .github/workflows/editorconfig.yml
 delete mode 100644 .github/workflows/gguf-publish.yml
 delete mode 100644 .github/workflows/labeler.yml
 delete mode 100644 .github/workflows/nix-ci-aarch64.yml
 delete mode 100644 .github/workflows/nix-ci.yml
 delete mode 100644 .github/workflows/nix-flake-update.yml
 delete mode 100644 .github/workflows/nix-publish-flake.yml
 delete mode 100644 .github/workflows/python-check-requirements.yml
 delete mode 100644 .github/workflows/python-lint.yml
 create mode 100644 examples/server/stoplist.cpp

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
deleted file mode 100644
index eb69b82c4..000000000
--- a/.github/workflows/bench.yml
+++ /dev/null
@@ -1,310 +0,0 @@
-# Benchmark
-name: Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      gpu-series:
-        description: 'Azure GPU series to run with'
-        required: true
-        type: choice
-        options:
-          - Standard_NC4as_T4_v3
-          - Standard_NC24ads_A100_v4
-          - Standard_NC80adis_H100_v5
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      duration:
-        description: 'Duration of the bench'
-        type: string
-        default: 10m
-
-  push:
-    branches:
-      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  schedule:
-    -  cron: '04 2 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
-  cancel-in-progress: true
-
-jobs:
-  bench-server-baseline:
-    runs-on: Standard_NC4as_T4_v3
-    env:
-      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-      N_USERS: 8
-      DURATION: 10m
-
-    strategy:
-      matrix:
-        model: [phi-2]
-        ftype: [q4_0, q8_0, f16]
-        include:
-          - model: phi-2
-            ftype: q4_0
-            pr_comment_enabled: "true"
-
-    if: |
-      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || (
-        github.event_name == 'schedule'
-        && github.ref_name == 'master'
-        && github.repository_owner == 'ggerganov'
-      )
-      || github.event_name == 'pull_request_target'
-      || (
-        github.event_name == 'push'
-        && github.event.ref == 'refs/heads/master'
-        && github.repository_owner == 'ggerganov'
-      )
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install python env
-        id: pipenv
-        run: |
-          cd examples/server/bench
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Prometheus
-        id: install_prometheus
-        run: |
-          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
-          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
-          while ! nc -z localhost 9090; do
-            sleep 0.1
-          done
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install k6 and xk6-sse
-        id: k6_installation
-        run: |
-          cd examples/server/bench
-          go install go.k6.io/xk6/cmd/xk6@latest
-          xk6 build master \
-              --with github.com/phymbert/xk6-sse
-
-      - name: Build
-        id: cmake_build
-        run: |
-          set -eux
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
-              -DLLAMA_CUBLAS=ON \
-              -DCUDAToolkit_ROOT=/usr/local/cuda \
-              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-              -DCMAKE_CUDA_ARCHITECTURES=75 \
-              -DLLAMA_FATAL_WARNINGS=OFF \
-              -DLLAMA_ALL_WARNINGS=OFF \
-              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Download the dataset
-        id: download_dataset
-        run: |
-          cd examples/server/bench
-          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - name: Server bench
-        id: server_bench
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          source venv/bin/activate
-          python bench.py \
-              --runner-label ${{ env.RUNNER_LABEL }} \
-              --name ${{ github.job }} \
-              --branch ${{ github.head_ref || github.ref_name }} \
-              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
-              --scenario script.js \
-              --duration ${{ github.event.inputs.duration || env.DURATION }} \
-              --hf-repo ggml-org/models	 \
-              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
-              --model-path-prefix /models \
-              --parallel ${{ env.N_USERS }} \
-              -ngl 33 \
-              --batch-size 2048 \
-              --ubatch-size	256 \
-              --ctx-size 16384 \
-              --n-prompts 1000 \
-              --max-prompt-tokens 1024 \
-              --max-tokens 2048
-
-          cat results.github.env >> $GITHUB_ENV
-
-          # Remove dataset as we do not want it in the artefact
-          rm ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          compression-level: 9
-          path: |
-            examples/server/bench/*.jpg
-            examples/server/bench/*.json
-            examples/server/bench/*.log
-
-      - name: Commit status
-        uses: Sibz/github-status-action@v1
-        with:
-          authToken: ${{secrets.GITHUB_TOKEN}}
-          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          description: |
-            ${{ env.BENCH_RESULTS }}
-          state: 'success'
-
-      - name: Upload benchmark images
-        uses: devicons/public-upload-to-imgur@v2.2.2
-        continue-on-error: true # Important as it looks unstable: 503
-        id: imgur_step
-        with:
-          client_id: ${{secrets.IMGUR_CLIENT_ID}}
-          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
-
-      - name: Extract mermaid
-        id: set_mermaid
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
-          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
-          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
-          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
-          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
-          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
-          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-      - name: Extract image url
-        id: extract_image_url
-        continue-on-error: true
-        run: |
-          set -eux
-
-          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
-          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
-          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
-          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
-      - name: Comment PR
-        uses: mshick/add-pr-comment@v2
-        id: comment_pr
-        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
-        with:
-          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          message: |
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
-            </details>
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1e344db6b..8c7353434 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -25,664 +25,6 @@ env:
   GGML_N_THREADS: 1
 
 jobs:
-  macOS-latest-cmake-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
-
-  macOS-latest-cmake-x64:
-    runs-on: macos-12
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
-
-  ubuntu-focal-make:
-    runs-on: ubuntu-20.04
-    env:
-      LLAMA_NODE_AVAILABLE: true
-      LLAMA_PYTHON_AVAILABLE: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Build
-        id: make_build
-        env:
-            LLAMA_FATAL_WARNINGS: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
-      - name: Test
-        id: make_test
-        run: |
-          CC=gcc-8 make tests -j $(nproc)
-          make test -j $(nproc)
-
-  ubuntu-focal-make-curl:
-    runs-on: ubuntu-20.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
-
-      - name: Build
-        id: make_build
-        env:
-          LLAMA_FATAL_WARNINGS: 1
-          LLAMA_CURL: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
-  ubuntu-latest-cmake:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
-
-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DGGML_RPC=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
-  ubuntu-22-cmake-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libvulkan-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DGGML_VULKAN=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-  ubuntu-22-cmake-hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.0.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Build with legacy HIP support
-        id: cmake_build_legacy_hip
-        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
-          cmake --build build2 --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
-          cmake --build . --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl-fp16:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
-  macOS-latest-make:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: make_build
-        env:
-            LLAMA_FATAL_WARNINGS: 1
-        run: |
-          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: make_test
-        run: |
-          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          GGML_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
-
-  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  #       would be great if we fix these
-  macOS-latest-cmake:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  macOS-latest-cmake-ios:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-cmake-tvos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
-
-      - name: Build Swift Example
-        id: make_build_swift_example
-        run: |
-            make swift
-
-  windows-msys2:
-    runs-on: windows-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            base-devel
-            mingw-w64-${{matrix.env}}-toolchain
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using make
-        shell: msys2 {0}
-        run: |
-            make -j $(nproc)
-
-      - name: Clean after building using make
-        shell: msys2 {0}
-        run: |
-            make clean
-
-      - name: Build using make w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            make GGML_OPENBLAS=1 -j $(nproc)
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
   windows-latest-cmake:
     runs-on: windows-2019
 
@@ -993,40 +335,6 @@ jobs:
           cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
           cmake --build build --config Release
 
-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
-
-  android-build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: Set up JDK
-        uses: actions/setup-java@v3
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-
-          ./gradlew build --no-daemon
-
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
@@ -1050,14 +358,8 @@ jobs:
     runs-on: ubuntu-latest
 
     needs:
-      - ubuntu-focal-make
-      - ubuntu-latest-cmake
-      - macOS-latest-make
-      - macOS-latest-cmake
       - windows-latest-cmake
       - windows-latest-cmake-cuda
-      - macOS-latest-cmake-arm64
-      - macOS-latest-cmake-x64
 
     steps:
       - name: Clone
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
deleted file mode 100644
index 69c9f4f69..000000000
--- a/.github/workflows/close-issue.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Close inactive issues
-on:
-  schedule:
-    - cron: "42 0 * * *"
-
-jobs:
-  close-issues:
-    runs-on: ubuntu-latest
-    permissions:
-      issues: write
-      pull-requests: write
-    steps:
-      - uses: actions/stale@v5
-        with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
-          days-before-issue-stale: 30
-          days-before-issue-close: 14
-          stale-issue-label: "stale"
-          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
-          days-before-pr-stale: -1
-          days-before-pr-close: -1
-          operations-per-run: 10000
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
deleted file mode 100644
index ae86e9927..000000000
--- a/.github/workflows/editorconfig.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: EditorConfig Checker
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  editorconfig:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: editorconfig-checker/action-editorconfig-checker@main
-      - run: editorconfig-checker
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
deleted file mode 100644
index 3ca4d3058..000000000
--- a/.github/workflows/gguf-publish.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-# This workflow will upload a Python Package using Twine when a GGUF release is created
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
-
-# See `gguf-py/README.md` for how to make a release.
-
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-name: Upload Python Package
-
-on:
-  workflow_dispatch:
-  push:
-    # Pattern matched against refs/tags
-    tags:
-      - 'gguf-v*'           # Push events to every version tag
-
-
-jobs:
-  deploy:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: '3.9.x'
-    - name: Install dependencies
-      run: |
-        cd gguf-py
-        python -m pip install poetry
-        poetry install
-
-    - name: Build package
-      run: cd gguf-py && poetry build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        password: ${{ secrets.PYPI_API_TOKEN }}
-        packages-dir: gguf-py/dist
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 368dbdbe5..000000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
-- pull_request_target
-
-jobs:
-  labeler:
-    permissions:
-      contents: read
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        repository: "ggerganov/llama.cpp"
-    - uses: actions/labeler@v5
-      with:
-        configuration-path: '.github/labeler.yml'
diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
deleted file mode 100644
index 4aa4b2379..000000000
--- a/.github/workflows/nix-ci-aarch64.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  nix-build-aarch64:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
deleted file mode 100644
index 8955f38d0..000000000
--- a/.github/workflows/nix-ci.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml
deleted file mode 100644
index 3a6a96e26..000000000
--- a/.github/workflows/nix-flake-update.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml
deleted file mode 100644
index 2c3c1ebda..000000000
--- a/.github/workflows/nix-publish-flake.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
deleted file mode 100644
index 4e0374fc6..000000000
--- a/.github/workflows/python-check-requirements.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - '.github/workflows/python-check-requirements.yml'
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-  pull_request:
-    paths:
-      - '.github/workflows/python-check-requirements.yml'
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  python-check-requirements:
-    runs-on: ubuntu-latest
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v4
-      - name: Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
deleted file mode 100644
index a8d46f31d..000000000
--- a/.github/workflows/python-lint.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: flake8 Lint
-
-on: [push, pull_request]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  flake8-lint:
-    runs-on: ubuntu-latest
-    name: Lint
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v4
-      - name: Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: flake8 Lint
-        uses: py-actions/flake8@v2
-        with:
-            plugins: "flake8-no-print"
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index dbe41f1fd..43ad31045 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
 
 set(TARGET_SRCS
     server.cpp
+    stoplist.cpp
     utils.hpp
     httplib.h
 )
diff --git a/examples/server/stoplist.cpp b/examples/server/stoplist.cpp
new file mode 100644
index 000000000..c0ab9e7d0
--- /dev/null
+++ b/examples/server/stoplist.cpp
@@ -0,0 +1,10 @@
+#include "utils.hpp"
+
+std::set<const char *> SWordsFilter::stoplist = { // built-in stopwords; yx_simle_filter_init() may insert more. Keys are pointers, so the set orders/de-duplicates by address, not by text
+    "<|endoftext|>",
+    "<|im_end|>",
+    "<|startoftext|>",
+    "<|im_start|>"
+};
+
+SWordsFilter stopped_filter; // instance declared `extern` in utils.hpp; its constructor calls yx_simle_filter_init(), which inserts into the stoplist defined above
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 7ef2a519a..3614c72d2 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -8,9 +8,11 @@
 #include "json.hpp"
 
 #include <string>
+#include <cstdlib>
 #include <vector>
 #include <sstream>
 #include <random>
+#include <set>
 
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
@@ -411,7 +413,107 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
+
+class SWordsFilter { // removes configured stopwords from generated text; keeps per-uid withheld text in scache
+std::map<std::string, std::string> scache; // uid -> tail of an earlier chunk withheld by yx_simple_filter()
+static std::set<const char * > stoplist; // shared stopword list (defined in stoplist.cpp); keyed by pointer value, not string contents
+static size_t strcmpn(const char * a, const char * b, bool & nostop) { // walks `b` once and counts the chars of `a` that match, in order
+    nostop = false;
+    int k = 0;
+    while(*b){
+        if(*a){ // once `a` is exhausted the rest of `b` is stepped over without comparing
+            if(*a == *b){
+                k++;
+                a++;
+                nostop = false;
+            }else{
+                nostop = true; // `a` is not rewound on a mismatch; only the flag records it
+            }
+        }
+        b++;
+    }
+    return k; // `nostop` holds the outcome of the last comparison made (true = mismatch, false = match or none made)
+}
+static std::string replace_all( // returns a copy of `content` with every occurrence of `from` replaced by `to`
+    const std::string & content, const std::string & from, const std::string & to
+){
+    std::string ret;
+    size_t pos = 0;
+    size_t last = 0;
+    while(!from.empty() && (pos = content.find(from, last)) != std::string::npos){ // an empty `from` is found at `last` forever, so it must never enter the loop
+        ret += content.substr(last, pos - last);
+        ret += to;
+        last = pos + from.size();
+    }
+    ret += content.substr(last);
+    return ret;
+}
+public:
+    static void yx_simle_filter_init(){ // adds one stopword per non-blank line of the file named by $LLAMA_CPP_SERVER_STOPWORDS, when that variable is set
+        char * fname;
+        fname = getenv("LLAMA_CPP_SERVER_STOPWORDS");
+        do{ // single-pass loop so a failed fopen can `break` straight to the summary log below
+            if(fname != NULL){
+                FILE * f = fopen(fname, "r");
+                if(f == NULL){
+                    LOG_WARNING("failed to open stopword file", {{"file", fname}});
+                    break;
+                }
+                char buf[1024];
+                while(fgets(buf, 1024, f)){
+                    buf[strcspn(buf, "\r\n")] = 0; // cut at the first CR/LF; a last line without '\n' keeps its final character
+                    if(buf[0] != 0) stoplist.insert(strdup(buf)); // skip blank lines (an empty stopword matches everywhere); the strdup copy is not freed by this class
+                }
+                fclose(f);
+            }
+        }while(false);
+        LOG_INFO("initialized stopwords filter module by Y.X.",
+         {{"stoplist_size", stoplist.size()},
+         {"file", fname == NULL ? "default" : fname},}
+        );
+    }
+    void yx_simple_filter(std::string & content, const std::string & uid){ // edits `content` in place; `uid` selects this caller's withheld text in scache
+        if(content.size()==0 || stoplist.size()==0){
+            return;
+        }
+        if(scache.find(uid) != scache.end()){
+            content = scache[uid] + content; // re-attach the tail withheld by an earlier call for this uid
+            scache[uid]="";
+        }
+        bool cache = false; // set when strcmpn matched at least one char for any stopword
+        bool g_nostop = true; // stays true only if every such match ended on a mismatch
+        size_t max_allow = 0x7fffffff; // ends as the minimum over stopwords of (content length - matched chars)
+        for(const auto * s: stoplist){
+            const char * cont = content.c_str();
+            if(strstr(cont, s)){
+                content = replace_all(content, s, ""); // delete every complete occurrence of this stopword
+                LOG_INFO("hit stopword", {{"stopword", s}});
+            }
+        }
+        for(const auto * s: stoplist){
+            bool nostop;
+            const char * cont = content.c_str();
+            auto k = strcmpn(s, cont, nostop);
+            if(k > 0){
+                g_nostop = g_nostop && nostop;
+                cache = true;
+            }
+            max_allow = std::min(max_allow, strlen(cont) - k);
+        }
+        if(cache && !g_nostop){ // NOTE(review): looks intended to withhold a possibly incomplete stopword until the next chunk arrives - confirm
+            scache[uid] = content.substr(max_allow);
+            content = content.substr(0, max_allow);
+            const char * ctx2 = scache[uid].c_str();
+            LOG_INFO("cache stopword", {{"content", ctx2}});
+        }
+    }
+    SWordsFilter(){ // every construction runs the stopword loader
+        yx_simle_filter_init();
+    }
+};
+extern SWordsFilter stopped_filter;
+
+static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false) {
     bool stopped_word        = result.count("stopped_word") != 0;
     bool stopped_eos         = json_value(result, "stopped_eos", false);
     int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@@ -422,6 +524,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
     if (stopped_word || stopped_eos) {
         finish_reason = "stop";
     }
+    // Add stopwords filter
+    stopped_filter.yx_simple_filter(content, completion_id);
 
     json choices =
         streaming ? json::array({json{{"finish_reason", finish_reason},
@@ -460,7 +564,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
 }
 
 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
     if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
         return std::vector<json>({result});
     }
@@ -481,6 +585,9 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
         finish_reason = "length";
     }
 
+    // Add stopwords filter
+    stopped_filter.yx_simple_filter(content, completion_id);
+
     std::time_t t = std::time(0);
 
     json choices;

From 725ba0b3526f25a17929e830fc519879967d8dce Mon Sep 17 00:00:00 2001
From: Aliebc <i@axgln.net>
Date: Sat, 15 Jun 2024 17:50:00 +0800
Subject: [PATCH 49/50] Add YX UI for llama-server

---
 examples/server/CMakeLists.txt       |    2 +
 examples/server/public/avatar.jpg    |  Bin 0 -> 16277 bytes
 examples/server/public/index-yx.html | 8594 ++++++++++++++++++++++++++
 examples/server/server.cpp           |    7 +-
 4 files changed, 8602 insertions(+), 1 deletion(-)
 create mode 100644 examples/server/public/avatar.jpg
 create mode 100644 examples/server/public/index-yx.html

diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 43ad31045..e291644e0 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -26,6 +26,8 @@ set(PUBLIC_ASSETS
     theme-snowstorm.css
     index.html
     index-new.html
+    index-yx.html
+    avatar.jpg
     index.js
     completion.js
     system-prompts.js
diff --git a/examples/server/public/avatar.jpg b/examples/server/public/avatar.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c3fe2ccaafd6128e47bf61297963b492900aa513
GIT binary patch
literal 16277
zcmc&*g<F)}(|%}_Gzi57Bve2^MQV|HX;Dx>x<f!ZmW~A_l~zz-2?dc@kcOp{kWd;H
zMHUd3T)KO|v;O{wZ?B7d&U0eUoH;Z1+!OOiOO4^|<+Bh3F=(hO>p&1W_>~-@r3OE?
z{YDPK4@$59H1ue}Cy3TI8vK36UERnFf*9LLUu18D7&*W{FL|pNdh5D9_4c*$w1a$o
zeMKByoxE(U-0ei%Jnb`9?_Gu<Zb(B}QO_@P1?L}RifAHk?Ct&F!|e@;KOC+aNWH6(
zvG|{nP{SjIRGy5S_bMJ8JD5NZHkOlIZrb-q<m-+op0gEeX2~@gbi&@6n6w&ii{u{%
zr>71(f;tXE8ggEJ{<6f)NkZ_x^_{*wv8lCpUVLC;|DPY^ZF?BRt@iPO2Gj4l25oGF
zS!Tb_x)zFGK(EM&n`QY(GNOsETZ78Hna~`%;^SB7I7O88^O_<iq?6nEswZ!a&ze1R
z>!FcJ+gqv9(3ZzdOcEU+&jd24%ZA_X0`-kI!W(C)qx!h6tsPQ|EdOmfykDA-a_Cne
zw+Vlsli<@c&a=oM>f&jhfv@V8ZV}l2dJTL}oJ!nrMJT>CFAw_qV`pWAGADu^f|NRD
zXPffEb)*@R#}uea^>hjw@AJd17G`+N<c<qDR^!vUslH!4wDWXDH=6J|8Y$tHXOkN}
zcoWs>AqANmBBE=vZ$y~zJ;BU+UB#xGHWyE*P6zy4bf?*F=%IW|^*0fP+$AQr-6mNH
zcez*DA*5;g9e%Sa$D@VUdaD#G77I^A9HSA&ul)+Y`j2W5uh8q&dg4DDh})=QF$~dk
z)X+5Z6@7hdwJzqbVFX-NtMfiwUq_oz`gUBvF)K4GZBO*YFGkT4<Ncwhgd5yNU;tM3
zJFXMlV^@y7%H>suq%^`j^mXiJs}1`Mz7zbZ3fV5t^@pgK>Oxm8g3e*|y|&bmai`w%
z;lq5P;dx}(hki8XPN=PZzxo!MU(<suboOPHTI8E76wvzG$A_9MqQ-E25h=}0UH&IA
z#4P2sF`={DY!P4o;#$r6MNI2H55muMu?2m;x)B-oZbY8LRy$=&cNP&V)s|3<O3q5&
z3)P5PUHi_eFKI|amdXw#s;k&^a;~;+M|g&$ZxvOohQ;`sheY--H9w(qN;T|+R^rH@
znA^@1b-BNs*X1;Z`C!76#Qp1K2Kb3rC3LIivY}Wfth7SlGzD|HzHzF6tJb1v`NChz
z6)x&lXFoO>ifA!8`g1ZWb}4~0h2NOeG0H>A1Wx7Dy`cy5-nFgY8ZIv_O;$0Id&ZDK
z<&RCx-#li1khYhmag_Cf2H#~aePU$gt_gPidV3Q!GX*^qT@`GwE@{EH_#wHcBruV-
z>^q_6wKg%n0LIh94l%1JdK|pC!17{vc1P}nV6>;u&~yJ(z~5I>DchB4aj_r+{T!KU
z2`SjVeg*q2_Tytg;Jo~wJZl2y^wfyVVp<{j$x|c5jy{2X+VU2}z4qo6%;fyxNGBfg
zrcQ%t^EabL%ftNEY~gH~+DSOI-*tMZ%S1`&&J**~%H#$^opB6}L)fDs-mv36{Sj=V
zGmN4UMh)RJa_ZhMJ<!Nbo#@8iR$y%8F0;7%N-+45*i&1y4KmakQZVnby+g&3;T@6I
z%(v)H;LKsTxeE@RUG}Q4=U47cG76}Ks$=%u*6g-y`8w|<{jMXfyW5MWCe&nx))4Yd
zsL1hD(%m6A^{vn9p7IXt(#uR&7`~8GK;s!grJvFT>z(-Hs^iIi_&4z^{$@ao?!A<u
ze-FD2<=`%+X-gT294F-o5?HyJM!N|z^yChr^pJy>()Z4XsdS}#Lww)~u|Dm;g8tHU
zdHYnvnHiuGE+0Qtc)CyP$|p|P%`{P0y+|8ADdaxpk-Yey$seUl(vmk)7BgK(3h>+F
z8n}``B<*kh0uCR^wS!d-I=0o%@rsQ=ylJK${&VEgCx$vP2zqATV!plV$Z=`kSOP^N
zFOY1^%x2N6%%l1|8?*_MJ!3X#|JNa=UeqXx`;Z-SSC8$vw&(M8l91`eb$;!7j_eE1
zRqPk>Q2&lbf0Gg$W-|c-q-@}!GHN73OI*s7QM`Go5a^M7XOf0%c`$UH!(^=Nog3_h
z!;BJ#u*BVzOY9Q$7Vv|u5))|Q&)luLfZp(5LTDvs;bka?AIx*q+YXnF`YRHmkR`?A
ze%dB`$TZ%BI2r#n!=reG&j)gU6x&mzN-5y`GH=hY7Lv+gGhL83ZpoQ@LSV+rQ$vj(
z9$eF1S4|-n;hT91pCg!%zSU#?!L@F=A$n$JI9u5B;|>T(aoPB`$6McC-|mJFXcMR^
zaj&vc@vvfa%f9I;D*U)z9P%^iVEHSW)*8hxIf0>45R#IVN+hZ$oIdw)Hf!<uh+y1g
zl7aS_p5J-bOx1fBw-=C&phHfqdoo4Fecc{?`P8MtRq@mk`qHu-YPz<@$57)+mgq%5
z`EnJNWo$`;xzbQW4(-wBV3!-V_bvQ`AM04+C~|uJG$b#sjvwapYo;z&$A<}1BVP!W
zW<I*cHOX5@nn%MsHb1k*_oV?z2-@X!7tybguM_0etoGj<9*5;!;(+&xi1a_GWDJ9a
zP$eqse$0CrxP8!^;+m+dMz{i9J3H&G2$m3#aO3iJ`*^jJ5`nu)G#)zt^wG6pNgRIl
z&9Men7}|^olwkQNV%lsu7eF{)#YK&z6c8!v>8hvUW2gqp@YSc<9RA<A3*Y^EPT{|c
zX$B9cTK4zK!NW96U_I0Jg1m!_B;$Q*AIQ2i&-Q+&^WI8}_s|>A`TN$XOFoP4qeW@v
z<1S|5HEj^H&2zPf%PE3CPBn(gNi8}wMcIlLGxMvfIZ8|hB-kPAXT-mOTuI(_K*U|f
ztbVBO6Y`muBdC2~;P!=hMx9mu@_h=ayZO1pbSkqKv_3N`u6mra{BhAh-|R*~6d$72
zm5iAVmAS>Xw=C&?fMswZl-MN7&_lO%Op^{?;a%V8S>njK^@luHCpb@N8qkEXdoHl&
z6VsQGQg6X6c03nv(S5Xt6@X@X{e1(c{tJDAKp;{Obc_rX3hi-k?-mCBU7d^KLpgw6
zt~PQKMt*LDcf7Rt>52-++Cb5tXEry@Ct2iX@;(5q=ZnioaiQ0;XmY?_1JY%rP*7&z
z5GxGzDmr*P|HKh=QO=4XN9kCYHDTeD!krMaqLSFVux1TnqTW;Rbc5CYjhsJX<;N53
zRfkK|$Y_B}GBtekx(S+>U2){l5qdpW%e+1(pb$aJ$SFx4o0z=$<I}ANve#Kdp@aN(
zX;}kP=`gcyS}Q0zdZy=$UG8>X9uI*P+85QLm~Fcr^(O8wCNJ6&iblTg*|;#<HjlqM
zfT4mW6<eGI{XDzgdi+Hzt*_QmbN92v6&BHw@4e2O1iR**SR=KVcR0RunZSI7okD?=
z!&G`B`gOk?uOkvc2UWdDnS8XM{mJ6dRuO`3l9NIqOHlaxMalf_%#>Jg0(0Kz%jCxw
zm!YaB;FQETCunnEI#xraS3tIRhKYNII;uy`%sBN=<xQsB5dIz6-*2!~oh_T>NTXWl
zORlGmheUqp>(>vbZ;18Zg78Vn<u-w=PoJyt$AM#qG~F0FC&waZtS9mA<-09fFf{vZ
zN1eS}9QUp{W`eol)6)cX3csnT#^3vxfuP&9g1RW|?Cc`q8Zx+&y)J+~%R&1&4z0Gz
z?%CgYM^0b`Lr~Pp;x?XYTsMw_6f(Tv2&EL)iazA)3PyFzOwSL=SwtLy2C8Da{$z*!
zmctr$$0@tkRpSmdq@*PH%>tjt(SE`X@%3xSQq!AafORt_B<5NFxqT-uXkw#}DnVlj
zC?uqM88y7PvsIj3!KlGQ4N>sVzI&WI{4zNJEE-w<&?pTauybrwW)#K8z(!V6src@}
z)U5%F_8BH9dF1+RWhjUF<H3G7pCdao{&*1IjXImY{kc&U7)=FE0T-RG&v{i}O{-Ud
z>#2&j)@Dt#>25cM0?7S#94c_a5|+ewd^QI~U)W{3l6iyytHGB)=s$g~?`GCg^9{~-
zrjGVA?6saAjHO3bxOATe>03EQ^VLIWzBkJScHWmBa&i9oR^y~naqEhP3eXScb3z=t
zR!?yg^=^hy?U%-&s(b<6A*+FHHE?znH0cF?Q13k(zZ+lpn_1MA<MQ~2J?+K`6)X&>
ze_McaP^CK|vRWIw$)X0i*gDva)=xIjv^&v5=vjVWD<v>Rw9-s;M<K&S1Fah=Octpl
zqd;kSdYw#q>SMk;&EKriis2|F>ncCTtLnXemdFiGgOs-xu5xql^_w>=#s%%*EXRxc
ztR5(l4j~zI`;KcMrp>p6cP5e-c}0*@?DQhl>EjIyxk71#*6HiR@$~Cp#PA3~HEO#%
z!NYMyZ-y1XDT*!}{q;?6c1fjD!YHbpObwDwcE!fsJcR~6Qqx5I5yqbL_!<m(=F~Ml
z!{D2p!9X7Kd4sqk2IrqORx9@ish|GRPBXK-c@jsfQ>v;X3A$8|Mj53Q8UJ$tuRi{E
z?n^E(N~rtw4Wyl3Db(zV3MF|pSnv#Tr;U|#J4#QxX5<SgZ0>Jxq)()k2)%#Bi`2iY
ze>=ohKO`$J9_&Wqqd>ET&QZ}%{-bcd*q>w$C~l$9V#8yyFFG__lr5|lPfa2$$f{Zg
zHj9d9Cpt@HAP8*9)E99R$r5x8W6b4`A6YMx{O>dd*85AqXhDhiYGL~H1XJ<Q(`SU&
zH~!G(M3_p{Jq9aJl+VP_$elP5UVXBH9Kx=?d9s$~9S;<0y5#}P*a%UIP=4)B>7GdP
zGZ=N^%G8(KaO73`ii%-y-=#cGTtlH(fmU*!S%a?=p=M>gI<Pi9J^dA3n9_8${<E7-
zpNa#A;rwTZuw;+_gg1?8#CLG{yA2>u;nwZ$aqRl<hmy~~VHDPP?0&Sf`b(FJYAtB$
z^~aCbTmrTp3YWPJ46*DOn_HCee>YgD=GUx#iaXprpn;6PJLUv^W#aQl+EuiqndSyJ
zoE?7RriX3#y^&gNw}(A?i$66ki03{lVH6d1Ng-x3a7aG$>Afc_@0FSra!N%_jxQdF
z`jhvDZmzGKu^SwRwLTUD+pIU<B?#Je?UTdZQEB_k(yfB5+(F)iTaJcx4&|Hy`{zy;
z-9pDWIQ-#4o6%dM5<kkM^^<G0y6wUH)(-xo`IGexB{E1x*jdD(#`UGfLi>ffZZ!cG
zd}pMd1pUs$OHsntC}L5I?8fWM931ErsGHCliCMaYy>#%3Y#Mg)Vb(<VinA#mIO42)
z2J>1d??L~y;U`?jNa}5ZJ!&gn!ez2VX05@ZrT#C|c)3>rf-%{}&MQn6U06D42TLxi
zyD%-Y@GJ1k7)<`?c~YqvW}%14^nFIiCNC;vC~%}&lZ}w)J6f-M9VRaK;bp(P0ZIlj
zQe4(l+7!5FY=8A#K;6K~WKBq~7xwBwEDTA(ZhE`$M*f^i@+MK2t$ZeP*k8AQV64SG
zE@1D~u53Y#>+g@*#6!u$6WLI(K(trlJ((cEEA`-_76&~<gB})ep<l}QpBVXa8}+Eh
zjR=of6fl_|hai^NPwXEemIoAd<E4$SbB30f8y%-(UHsbMXS#J38VL=+>dG5M;dy<a
zNmUJ%(Z{(HFK2cXWfn)uKQssQ79bWQ*zmm_!S$&ON;r?T;KM)tZUdb0R0APF3loov
z=lb6PdAQ9SXm~kfrXAHA9uI0oG6@Z4uO6aAS{7P=ikv3DK1&|Fko<CcEO2e8>s%X{
z{S}e*Q+w>WgJ@o4ExV4~=@seVuNaDM_fsSLC9K8q1B#>T^Vd?E0(XY0_E6}!3<;;N
z#V<>|66rf+j~KF=5N+vWXr;|F6kkF;WohUe)rKGfbGr#RDIv!zZVO%p7CWW2^Kl=%
z;b`5DSIWXpjusc$%=Nmc!X2EnKGc?49B~j{1+_x!Zn59RhVk111jcp~k`{d58C(3J
zt5u_QTk`4ljW{lo+!pzRi%dNKnQ;7Un#%G2Em30k+p%SzQ>w;vI<wgo7UoCz^`YU^
zeXH#5?As4$-OX>eQ?4y#O&uEt2kca6DgKZySc^uiS@>Pu<uJ~w@6u$!!lKLGD?Z;I
zK(**T;diU7ExIYAvqxOjXW*0?fKzQqe5||Y)r?SXwuBP%9EWu#7dfz(qRZr(0v7u$
zh&uU7^~Kv^uT$wqWn1bsqIOnnP~q%eT2rG{j#olok|`wLwH+K{Qon463-enV&wsgE
zw@A%Oe16MYJ!!RZF>PrQoq;%O-J_{SZ&tO<Lx>Mb+$ZBs>Aom6zrULx@%YiB91G%h
z=Vh_<gH6jgxfyqy=Ltga=FKWC#Rmspf=z%Yjg~3l58=pL|E$QB>K%cgndy`+;0W25
zMcteABfRdqbjL3-l9d5FmE|`CmNU|DRNC~t*2Pe!AKtDr<O`9D22aAtH@5$1ieldt
zZ0fdP;q$MC5eF)4#`fGFGrfo(UX+E!G~-qhH-8*u4l4n9Wye<V?o6+_Y2qJ-covqI
zKb|-9$X_xCCY#Cwe{96C<-}Rj_c?iav2$aEtA4D>MDk>*Q{Hl;KdiO&Khm#{YD>+~
zy39&)-ZADB=Vc)JOEOmBAj*?rmY*ViGte`=`xDM95y>bpwAZ{V55#?S{%-y!&tqi*
zqfiVC%4BMBz#;dgLpC47301miG3)tEF|D!JHuZr#o}#lS_Gpe0zRk->;B9HP-rB+B
zwG-(n>vatcKdG&?q)i>wCe1Q1z*oXwR{6SpK%AY)mqv}Q+tC$slqijx@c2UD9i#Gn
z-cC7>SsJ6`-vgg#tRJ=<-yzVqYhwiRwN2Cv)h8IgL~EpABv+rA3lywkg`*Q@-hT}^
z{(5O?AS`37Dql8mZ>il&WMHY<2`p6f_b}DviV{pfF+x4?{sg1K-_p5+%6;B3xt&2F
zFoI$yBWyb+A*c&Zu`w+CJ1@alt*X|R0XpJ5^r}Ws&)AW!jlCbhQt^GOsw{Y>&=LZK
za!EA!zcVHmF6=PLcn@ysqS~(tdoC3*nfJM!{(fv_mHW}+Yc<qWpM3SrJ7pY3>@kJH
zjqEyM&MLHAQy}4FKFbD`B$a0Bo^`mr>VgV;>oKHO!ZEIEO^vj&xwhl0r3<!0H0_~`
z1q$v{)u*1R1(R7yrt7QjQ<(5Wi8a3GxjnBLa|x;_oMY0k&SsVVn>l=7k8(sM8M0-H
zKk4INje6i<645JIHvxa&#AgS&ZytU|P|nyL|Gdut)3F8Z_&b#D)&#vmP+Nanlc82`
zw|y5)X6az}G&Zk|H1_;M**5sqjeU+BhF4c~9>0m6Z|Ady616ASWCm;C-Zw@@hPBH3
z)Eg_c<>M?Yx#u%Yc|@pRoj{NDB<gF^>Wpcn#S4q>;~Ef^pi!1A#tDoJ_OQQ-6!u`x
zxd`u4C?q~>d~iO^k+<NQ_7yi2@+5xFEo_2-H~Z7^zXr(0`S`0{_1zZ?y*E1sFI+dg
z&K;S#m1ZniRc*)wDG6O;Oz{fR;Vr;;FP3cbs+rhZwv^A>jO1(DgOTEVQ=;NHBuC&M
zggXMDb-BRY^P66bRoNO(p4>0c!Cv|)in{OET2~JCl56Y=^}LB%RX*s|6EgrO+t8H=
z$U~4%zIWyey?{5L{@WDU?gQNr$$lhQ*n^?i5NqVaHiReX^-NK^e9=tVE#oi0_?zrY
z!?FsNA-Vwg76i&hDy2)~v0lG^a#Da--93j(IGDBDVxtVNPYZ)9p<GMhfj?EbEOqoO
zbgRyD=3H`}Ag%ORnVW?x^g0UTOZfITY*6)h|9wT#IV_9I)l>=v<6nCwlH6H`Fzgtd
zjONP}wF|#`10B$s|8CMCAH}Y`V{k^y?#=lZJK3K~IPjX6-_wc5fzGXUlD1~%rX4Pz
zWXiknLax*=B<_|1{OB`6xwOL-Qa`ws#S|;i#;9<*G+zG_K|b?6U(Y@Le-GC^3&5Do
z7;mejZUcbD2p<fHLHKImqxUT;10l23LVXqyR8-hA(P9O<ckS+v!+F!J#i5HX<NmMm
zI7#LmcxWcqry-x^2|E<hkMN|?u9J@tZ5lfYG2gH~;&O>qxfZ%IQ9TSNyFFjI0jG{0
zg}AR%LsbH-?~Xg%BWva|1IrLfA{|em8v+@(6lUf_9ZY0=Jtz~)d{%9xluWjsv<V3w
z8X4g4J5wW_6NAh6?18oajd969kTY8T-i3j=wd8c?Ygfm2z4=KG^--0>GZaIN>jUD_
zD~$;BjVfLe#c!OjhW6a&X0~*<K$Fc=KLTaDg|2RjewmRIDy-JQu-dq{w-<X!oxDr{
zd#3*UOESx@w?Eva8LCzl?%Xta479h_u*Nz9L7r^m?r8M!C!B}P`ZiSe1!jXo;tgA^
zQ+!iJEUZvSO2_*hHoY+&5H}a63kp1{aIOc6W_<>rg((*(6eis_-hN|dsC+hO3>0NQ
zBS%(x&8B=#IRqj_r@k6$#=hLrspwB*7zU0KvCmWK2txIHt9%4?a(?68l$c|ed0u&F
z`4`T+%6h#&%)#H^ry4=N8)!9mlR(L*UF2Vu;Q;ga#M9yquOV7Zcl`1yx$t>P^H|vO
zYV>Kf{U8_A$U{_9*uJ;7`M`e-Ep}d%Ua9GpR}JX@2DOFU2U%!2P{IpeYsD<<ge@J;
z^xpz)+@2yB#%#Vsjy+D75Vk{k+~AQVDS(`@jsby@Z}>TryfrG(kLfg`h3tp7P>4&%
z4MMcixkWN=ehVugox|@_T*FySfuJX+wMo^Jm6+>ju|&(sBTqskso57Aks(pJ_bx~A
zuz=%t_MjPIO<OL1MaGYudzn%vZ!E`w3Ce;&KQTwAQ6(476S8RhR;<2@K|ejLX`mYO
zx6+!UThG92_O324d7=_$^lsWf7b8T<eekh>kVq1DDnoeDe$XLYYkVC_)-%21<E%}H
z#>Or~%tbbN*G1bq|MuT2Ft7;3!Fg|!JVxNms}SfFl1)E(ze6ZHsrW$Z0^<f2JaUbD
zhqG=WPN*dImbwFd*VY$}f7%u}DU^Mnb;T;tW`)M%9j*kztn@vu9CUBdyJEHMpdL8I
ze%cM<d2SG9w}2pgW0JrzMNCT1$Uy5_7fWZ1N;VUDn@@Plln>INByj&;{N=^gKJ28K
zwz%7{R^1nko;p4i5e^8BK~S&j+muike>bA3`nDb$8}d-l&<#h{_#{mPwS&)I9Ycnk
zl;~R9^<sA52NRCt^-gJsv+EYE&Q{q^=q$#aaf3QZ6J$RLXEjSWkhPUw^CH2!#Zx2?
zTi2SGUMYC-_gKX~uUo!O96y!;eXg5~`;nnF4!Y8%SvkM(&BU#?FFMX*BB(ek(D5UJ
z+TF4*&G34aDjy9Lp%{Z8U)Kup8SV`rY2vG{K!2@j*XfV)U4gD|5sFUAo{;-FR88U)
zEB2-IvS7^&3AI(jTW*cz9w~AMvt^!{;f-v9=}tO)G{{fQKh;xr+!}M4LYFT0m6}-X
zWyz+W){8F1#buP}B7J8Yh00wW;?x--k@vra)wBHRSf3q=TRZ}X8FAnCuB0TvCi^J)
z+1yqlzp`o!LcvMd19DG}Me`z`?;L-x+>UTFD^5f&`_49vr4jcU0uHHGcM{ZN@M%M@
z5ek}bUO!}?rXaEq7<2epo0^qnd{;&`QWBEAk{VA`GqxrP<Ajip@)N`y#MuFz2oyC#
zoku4Ikpy3RImxIXXkv|ms=lFpC(-?(e}6ygdkS1+EfAr-SqrjgIcbx{3Mdp`-1m+_
zC~2y=LbNOwjU*xT&KHZ*qT<qn_q)qDCseYe&)TmK2_bV^nE9|MuwZ?<AfM>}y|BI+
z1zw1n*|5(VwKE9*&!pC+QtS5KTq+u4dpH^}^J^ZoSDI1UsdjZJ>|%hz47do?(~o+y
zWV8z`{9@*?Sfhfsg=YJ)ea+jRrMqhPq}b}K1w1K`3DYm>8&Tp;V~gQT!FrM>yL&MK
z?dU!8+Nt2OzJ{6m;w8o|C<^=wgYD)_RaYpnwX3P?OBhMkQ7v_fTt~-g)h<UqgiMKA
zJA%Z+mN~k^-Gq?VrgO|feE?>XE?YoVLThtt+V=`CO!<a!2JI&qu2TE0?M$~NEaZ8K
z4%leR+JOGE5+=tl){}jS;f30VIXEO2fvV`PwQLFsn^%m$uiKN%q8-=Ki_(K}$!d=a
z{i+J4DsC>Z6EKZfp*x!VZ;X8*7Xg$O;W?Ydr-b13voNfrXSPss9*EMHzWoj<kReJv
z59H!LWA63|E!PTv&;MQua!wj)!%#RdQBhIBkG6Gp78ki5Yk1rZdD<ATVvGxz#{r{5
zQ$IzPZ0n`ug2?;0qs{=md+>2O?%K+ec1)Pk8?6Gw<cOGehYH)X9}ADM(r(48Zwz*h
z0@sF>LQgVCr`!v<%PR5ertrwCOG&?0V@LjjJO9W#{gva7LeN+`DlgtbA;tyZ#FE0x
zAeeCc4h;WxZD;mKaOhzr$rd@%S)%I|qDZm01J2_!54e2);Y6zl0wXsUrD9I^g>Ozh
zw^I<??PuVg_;Y8&Ynn<WIU)n>2CbW<oNBTHPE~qKO8*E%eF@Pj96B5~`i_=i)E|fE
z@P~UBJ-^2#f#i#@`o#9MZ-`FJUL!eRccE0{-K%TbT@Lu@KZ3%SipWk5VMC$c#&<ZX
z8k@~urrjJR8H8DB@rhP;DQLbDR>Cntd|(1WT+}2kfH9Ab3~=5$6SgMD41J-;gTP;V
z!w4%HO3qyx{SbxmUM+DoYbbIW6ImLnujF%_FEtx4w<)Mu`<aVs@LB8?HmNFUR{X9%
z;xu2XE{nT(izTwzb2L1?YvMN?^lY?7$Lsr-{L4TeszGnBr?@C^_6gx4_pT|Nr4WWx
zM;B-{4`9e3WbK7&aY19C+*ofaNS%=_GRgSEdvQqF8AA^7sp-d#TxKaG<F*A<0=&RH
zg%33TV48Plh8#-febz9NbQ#JFS8u9=kd!e$(?%Q^AUWfCAqa_&9#0zqDgby7l3*wN
z3RDSzR52NEz+OliOrhP}6jBI(;aZ0HPgD6B6g~cu4va0zRw!DS6u_#$J+?557DHw(
z3^p}T@2(ZJ9qv9Z<CGn#wAV%~PG}^Llq=C7ZoC?y>Y%Br^(X@tL9$_OqF&R@^;c1^
zNsYtsT(5DH4+@CN%9Iu#>1&nOoTvH>`YCC48{#$&*jp~_*}M}cckCVq4|>7KWnKj0
zCQ@9CFgE{L6WpL*YW`s?Rb=%2SenD2ljhyfKmRq=+Z7`&5AX=65a88dnIa8iDugrN
z)N`7H0|@)wYKuQVTyr#4slOk^YI1$HdKHH$bRJQE6L<NbjELGZ2fI}Ku|})E>7H(X
z%U#`!$)MrS=ly);XaTq=WL}yy%XP4@KBR;4=+S_$v<r~*(=*<8Dc8NvasTsq1}^>g
zj`NxMx9F?g2d!lf@w$fun*=AL;O7XQb&K~t3<`O2`H}kZT=Pw3<*QI$@YHs{yi<{g
zRA5m&yZJLz%8fFSkcWH3Su5G?^{Jni6eekvEb5;rK13rrzs8P0k4&X=AIO<n;=*h*
z?pZa;d3PTnhBi6{Q1K(zjF<0<i!YWJWDTzPP8I+1nm3x~{zG0iD6~|>X5@RN1Z=X3
z-4ovM*MT^I)ud?Ud-q)`An<b4>8ZuvYlYLeeen4sgxTqnU-+X&%ynirp68AUy49HL
zx_gK4>N!v?OFtErmTN2}wW|uA;FRuV1-)-x?=G%?E6Q_OAB}xjOEeD+TGs{%VFQ32
zPyI_Cu1XmK%>!;W#&2s^RRztF<PGM#=#+i*>R(KuDAag3PB>iEE*skt?FWd<ofize
znsg*S1srdw;(ArMw5w5|umIo`1R(a{qwf|2<!TQBp22u;W7!L{i^+CWX}eFj*#7%t
z<oHrE_MGrJv<aDJ@~|p!h;`v8Y^Qus1n0qDvzw>4JxN2C9}zz}$;p`k`-qi}F6%-+
zOyYnt4Q%@im|<`1)RqP>?Jb1LZ3(cjsxBo>O3}jL7ndrzGD=$}f}>Qu&Rri5-%2vc
z;E{y<fQI_&DCQ)D?w^%iKl*+5z4u!L<M_`7UdFLD->F>EAcfqXT%ld|a?umx7dwd5
z%B~qy2Za_>&6vHKj2-#xgNr6@XOguFc77#199wg1RBQ_Lls@O#cCheoR;HA-Mv0-{
z63`<SM8C9a`f-(7VM%|lX%F%4;m{r|vsi)Xuv_6yx_D~jt?=JByF}TV*PKr_YBEO4
z-10<zs&KIXY{_12Sb2)tnj9|CW%lS(S@VP2!3J~y>IT;t(jzb`CwOU%1qET64?6k<
zrjOZ<@^u6N1RCSn=lE1S;j$t@YB2$1zT-wXZUji)nKytOBJ`LSFitJ`TmA51IF)~w
zbU}ZJGlr4y+rWZ2$;9GGqgeHND+7JOSkTkrIeA1eh$4l7tPf`sLch!;_#zgm<`>!I
z_eGfe|9crB!cs5uP-WF@wKaV9Y&u$u`5b79Ik1<<YIJhhqr#)5z!ml+QHzjbaTuFk
znN(ja^vX{65Ad^7V*#eJNhKz!J`S+5<FvEnC(8}Q?=uOkfot8@m#$A8u#Uw}TdtVu
zn)UWWzr)h>++bnHbD6`4u@FuB?2sbFS)Qc{vHk(y=0iIIy|NO%`9yQ@3WT3EFqGtO
z2|m5RHUEPvYPrI3o^Zh+1VAnR{ov|PQ*ezyP;8Dx%x9vtgSXp2EFL2ovh)lUI0w*a
zu+!-l_j#ouzkXn@(=M6has_P_gEr^pGefq~WyQx`gIIwsUAcj?gh*^O9?Ic11tHe)
zFu>w*emJIsGi;uvN>TseHY*8!0MONE9<*5mj^SiJkN;$NO$t~ffWt=qdBx1S&)
zcBWeN`=kjM@hI-(f8OS`vBp*m*}?3Rko>0)-wcfGx#vD!@v4#wTI>Mx6O)^0XvO4;
z^T_nDrrFo@6gD7emg6qT-VeoW`EV}2^cnp5^HZg0(|={nyBNgT&9)4N>d_`NuQ6}3
z43wPrMf9^4vMD^A+>F@dWOO{}dX|oECj}^jx1eXgj#7p@0!xKv!I5$6*;;tx`$oA=
zbN-pNuosua0$$6tKZf}H*>Kf2DkzY;-k^ml$Yum@@=oMYNyz!`JQ2Kn7s&bB`$kp}
zpU|S7p*buGCKiW{U=$EvJpvf~(3srbB?3LWmz6|?(_>p2q#iTYamtG<QK9oe`X_<T
zD*%~#A@lcj%3XoBGVSv}*t|td9bkjTAPUj{kx{EMBuHSaY5^Kd>J~xGt#LR$FaN!X
zEZM0e`JGs;ky@3$Ee)iRJkQP@^D!%=NHw0_6-B0=EciWsr2G1M*P`uak}NUpy&R;x
zKSNigB}lvdZjr0J4?1lV=W2;HLsD+`sT^uL<k28+SOmvl-vx{dT$U?o2N4sK-3QgB
zyuZJw)Gv23B2l+yhS`i)+tSQ{huw%yFb{c$ARjkrcT;CuB(t3DvMRej?urZBy=^sN
z7zfjRiXv-7gY7J=SXL*{SFx43Fz~A0tm%`wn8XR)wNcggtSk{5kSnd*|Bpa94GW$=
z87{dnV4i6{UQ4ZFPz9E>c}hi~tZFis4P2<&djMuUIry$1s|2Jz4DEAdrJ5X2?w@$%
zp?w8<z?@utXYW2Hk8kQj!5x<z3rH>YZ*4kh<7q(7W>{1+c3Xme688_j1H81!;+Vvh
zs#-66LN*BHZ@!cE%1Ge}6jE($0GIV;u<;<YdPK5fB)t_IsF@QrKmHFds*rFDeYU{Y
zTDx27YTIeD44$J5(~@75#a?pIl;d>Lx=9VK1>r@%eF;MyW?d+`fMk^h7cTaUgOXfy
zXUBXwJCv#2Ig=1Z#C{0$eX{;LRQbmX51Y|^FMy?0B{@s5yc9)&MoUii!f93G7>at|
ziN`6g&suLtsDxj<c+tW7X(obt9GP|*d-HA65Qdk{daHX0pIzyyunH4%3`dX;2m+;Y
zKp`cj8w0VJs|MC$&<pbfV@dA4J2pb-3rPJ-7DnLoN=#oo#r^MbUhBvdzdwNyuQeuO
z_U}HR9uS;Wi`NH$ZHZW-iPg9`u?@~c>Z!y~DiD?DkA-!i01)VPn$UH>VED4^CM|&;
zkNqEKgx8bbHM_L@S9;jp!{|0JpwgYC^>f%WUHn{S;@9*kkb4`_8Dv~p(alM`ktWx<
zqh`EMMLV1!!(RoRFOzEweYqtW=a0G0`L?g?5Ge6|89N_Tl}}HJL=o_ARtUyuQyV}?
zl9BYB&M4?gGRSI_Tu2mKz1{ovTro|5hZB^@7sAL+2QI)Ox9mymr7u@6-E)W7D!wUS
zNqB3s*Y^Vx-V&2dp*op!N`cR;?1LCJ;3)l;wnJn4=>h-z6C3Qgde#OqOA^N;dCtUl
zdSH7I_Z?2w6*Jl7C)%md+f(l~W~hcYUZw{6xhYn{;qDNyvuuTe@RVz_?CYy?gcmi;
z-^solj5J{xr0&WFuII-1EsTCB061CF-YFR&lJq!-<e_MYV_^<W0CNU-i1c7NZ!JZ9
zoskfmo@jNv16}zQh)=`3e4;`l7T7KVCHM8S{-SE>UK?hb)-sS(8b<s0GIzz$Vikfs
z@j^sI1kf>AF3LYA0Mq5Kp->23DVg020hwOos3|R85gmca?}M}E=5oHSO$Zf}m>rp6
z=?y0cQV@zL$DO8G%RzS|3IjWCLr~6a5S+kS74M)~&I$d~GEFfwNI@rUwN^xoGJx&{
zuoOr>>4F9{2Luhyun?<x0mwXE;d>ukYGhFKD_kLWU;;wf<mCzU*?O{3*fXFDfG*7E
zv_eh)aa3Q`upWV=QR}V!I0Oa6&6C+M3eS}Vu(#n8$)igeFG&e+e>ab|Mrl!yBZyL_
zhm!f$Tx+#Jh#<M<{qp?g)<!!$B#?dBly{B52q{cHb3WTDgXSIA1i1bz-P6*v<80E5
z{d)-7gt@-pHu#4ozNa9w_2_<k1PnszuXoq&uVtSXhf0dj{ttAVxt1Zwp?a2qZtk8H
zY%mKvP(O>>)+EqE3LzVlIBtQeRR?Vi4oGi6Aoyh-8t@{J_x@C)a|Vc5&^5m+WO&DG
zPGTOB0rKkH+ETt-6-!I)!M7hwK-?O=BEe1j8r+b=v-|eJJPDB(q@j{hRDDmkyd@CU
z=ef1GBb!hrz9b-0|FUdDNeFV5W;1oeS@M@tC@7gdtWF922Bc@%A#W}p+Gk2QOG*y$
zb!*C3yF$=U+6&Fpd;yTdC$zueP%&v@KDCxlTzWrX&!wK1h154Ej|e5a5VX>3R`BB9
z&YD899ON+j>5s3vF^T@*AXz54y&6!9cv0QX1f}2R6`F6hM}hxynXhI994<a3FanlG
z^5uATpV~TB2uim6(=#r;XQYOw0NM}Q6Hr780{wCJ!1bkfSE9>rJ=nhOHR;h8s(U7I
zp+D>vv|Q^M(i=SK*o>gr{dGLYN>XEx=5|IQiMj^zEpyr+9~(|KP25*FH73w5S6clA
z9x(THY|Ga`>CpYM7&ju3lrdeJAG^o5W^dxFU|jtaO;Y&Av@O-0)4&!>d1@pPKf4Vh
z(5h>u3Xk=SI+chal`3q1d7_Qmj-<F=Y+4w!hVBvd=^?klM<!cr(!}=&n!M{-!F_Nt
zDL$g#p%H;{d26C*JiWz44bMRhBG#qhnar+<lh6>uNwLqo>8-a$o-+|g;S@RxZR=j7
z3^LiY%iWPtJ<U@lEKI`yM85%eE?|}*^d<>Qsm}jiB6hw%kl65#fw0=8Mi0?2W0np0
zRz9z)W3rEzu8B)X7~4a_6@mSnW}(Bv*h`5uG!d%=(6^u+EU~NxZPC(X-=ZiW4QOr2
zwxqao)M;#KJ7v-aMSkVtT9e$~j+^7|gg;hb{ZPNH@mb#~6B2L?wwLaCrmhY6z01qO
z3Tx~xIh7SVrS<jb=^GY6YfA>TxYFNi&0}zip$f~u+V8c_MoIgDr(CoYUu1q(*r|B!
zkx`6E*VC1lB(K(|*+1)_Y=Q^OP)g62cp-^1O=F_2r*R=Ze|xb4iTKQxijWOWx)Pbq
zOB7!ke%6Gnv;y{C_6^5R2+g71cQO_ve8NNRp-yK!47?uttI_VI+(t*)LKY2Yh1OR&
zZSfIoPzRi6>Na%cMHU5hZ`$@_ji8;oHMR$yIR7bnm#K24zdhfd7#dolUQqr9yc5@^
zEmyOX2;$%jzh;ToXKjf$4E?`1zhQO&7a^W%g&QzDTnC%4LjLa!`Z?i?`+vPR&vZJ0
z`|@ym$qw}Kv(B;7ytEYiS6jZ|dUl}sKiICd#bYRLr+PH|^J+9XHS)P$diS=2>(GB1
z&(S99yTFAT1x_wCSyjx#3`s)V!;I095nEI^#fC9QUWHoeTgimZ7*0xRB<qb*_RuiA
zJ-}I@(~Ld1ubr8A!jY`{1nnoP%g^Q~)|%>iXrRrwvEZAh!fnvNe~P(sS7TQAp)M|y
zV0hy<IM1_v76m#Nvj!ivJ+kLDa7Vz?gA_bHvz^~sw0&SRiFSgFJ4qatB=1ZFEyVXO
zCG=x+&75qqpSJ^@2ju`<j-bPpRMiQmgw8vcZLQi^Z@96<L&lCrhocz*yBv5wLuQEG
z<4`qNnC-NhRR)U6Y1Xdg3RtuQ22}mZnmt!4RvRt${*@ENIGA|*-5-$wQ$T4pfw-^w
zyilxpHrnW4W)h`*Nezl}_#Ij8w!19v_dKr>a0j_BO{f|4mHGEzc$R*WbtTPg{@|MR
z&IOjRJtH(Hw<c1pY91Pc@DKiWHjY@D0d7}~;18d-uCQ}#^pNz>BZIw<dtYah|0B`2
zatEZ%nxFmA%pj4woFGpyl=u$rh-qmqk3<ya*2<h<T5>9Ot6Yt~-~)M<w3VVQ((d*H
z^Jtxiaq`(C8ms+dk_j7C*t$(4%JWu8+&2o7){+*7KdljQ*sC5u_|*C%PeeB-gB2#S
z&O?k5zI_0zu<AWeNR*nnjFhNwX-n?_%)sTF^){sS$2N}nay)Z#ty+y10_fDEw&V0N
z^?(sD=KD_hW=}nV`BRr@vK*HPtD5!m8sE`M5chve8FUjJz&bj?&I9t0i}Pq}ngora
zdsZHfG#|1+{K<h+QL&_PR67h%@tzA4uqA}smhR8<yhy9e_FpR~D=GGG$MBv3?7?op
zUwjt0&85c&a-vkwW>Si)^QYp2EQFelJLN!pxC>5>f8L^M6+#~W&00yvdBq`AlGpEk
z@saQwX_s4UU4Cf}Zza9hKJ-mb%Kv{HK)y3F(2DcUd!vljNA^-Nl5j|!qmBE>oRzrS
z30YdshA-t$F5c4!1QPf%z{y5EUy?4lh9D$@OZwK(t(HT7>Cgzjpj$}W%|1fUAys9)
zNDplb`K>Cz>xlpx=IrBn9q=R-0zHS)mIy^|HLL1?erfX?TNYeuPgFeNfA|?tLw!7*
zlw5WUN_dmwd6D-9Y8A4R#%#bHHo(Z~xPV*vi#<1OkMy?z^k`{Zo*F;Q2{7GUzG{lD
zb+1a##zoNi8rC9V$j0)Im;eKe0;m-+0YjGh7aa6SW__K>mIvx6M)569rsT7<pRtyj
zF?bq(Q!8TeLaON7`bofajUwY_VtJy^$QX2Jq>&Hy#yvXqP|xr9pNkhYEf2DvQTn~v
zEpXImAm1qH{mWIu+1K!h4te=<ip0*?Z}GDCXm$JVUnUVog{!Wv!>v@zc6bQPjj}$_
z=FcMkEpw(Otv6NOK>Ws*FTGa1S&7Hrpu~Co(I;aEx9*vNdKy+piZ%m((!H}a|D&4m
ztIS_oZl2iAI2oN^(_G2aB!(?n36oOjQW2GSHTJ5X?=qyU+%{2PmWS7RT{Q$cp6t<N
zuu7DT?~sSHLp-+rE14aIpb#uG6>v?zv>2@ht#fh`IE1f|DL>Nq^@7YwFzA6!J>YV;
z2a0?57L=sRhgKtKjU!=5TH|u}UCb>LDV5275@Fs!yq5}|)ghk_x_$Ofg$vj9NVg{)
zQaPRFDV?^Fu36I{oD$u3PN5z4uYPD?gG?Ir4Dh6HbN(v+NH;cyU9DnNB$QJyd$`>+
zt@~i5w|Gel1H_@gGZA1IHrYAD6}$|!t9z)-i6w_zqn3f{=lOkI0Hf5+B#K)Skk6kw
zbnI_69ix)rKorq&P3Ei5g`DOmoM7P-fJ2>|99#6EaU#13fLMT?{+q90Py9z&2-w^D
z4v1M->6>k2gF@(=yFg9~%F@E!f{3}LgG%#CEx=LNzoJWWwHYcS)HGL(z&@PAAHR%K
z^t<~qG$_A>V!)@Ssr=cV6%Ia?BE0a^3{;44Lg)TW2;<WC8bkeER(uE-t6+e>|Gv)i
zm<E5+J2RsI(bRNy2Uj%v^tk|fcpwN$?^JmnORb$y{NqY@$p{5AXSR4wI=kM^Vg^se
zNGG%~d>J<(&HSaszewRBn=l%UvxJ53wuwV$Due$n;Y_rpcC$D$YWy@hUtVQazav_}
zCHd;+$#jOcY;_x)dgUb(krI@@z%@O6)k$2?9F)I{R~Yfp$#FUjTG#S98TX&aC$G5P
zy=hS!k^-w-X+Qd9^+{7-xWvdBMJ{uNjBg@!$4SBkCc9a~*^nW19-8Ub#N?XP?ak!V
zIZb^AWe{@98=^`=Vrv`bzUqP%Kr_6Al(>Q{pWv45OOz71>Mn4f=7)sr=q{4SZ48l2
zoafRhP@~iiE@C_k-~=<V8)=%SX~`I{g$~s#o2PUP2=Q9OC~o2{Xc~F{TfFd@wh^=B
zcu>JGU|k7}^=rBlcWzV4Xo~R)S6nLkgWBX^u&?fil=#%wHLm3Ar9R&mGZ>m0f^B~<
zr%4!ha)*{vAfH#PZM3dF?asEvkxwJD=o&wkeJVD$g@qq?LdaBiP){PyeL&#fsfj0B
z7d7DvZpf`Uq}Iqq(7z8OgM{^UB&-zPG7_^JKY~dPv>D~gH)DL}1PRQ_bQF-W3laUo
z?cyE-G@d<XJU>ye<TQWJ5*6++!=q3&4(zi9LHsjuMb5hkZ~BJ)M%ZrRP9JV0SNCvY
zR=jfFsFNGB|3y`c9(_f=FYv^4vlCbnkJ+Xyt*AftX`-1jQy8j2Fv-mE(3|UCBs-+=
zs^s;=7zc11tYG2$)3lIuH88AWv4^{-2%E%7zOoOqBVRFRqCcaOj}$+ne;*OpS`0za
zgr5}+<nq+vn%ID3gWLBbp^ndIK+V#znWDr_{=XW#c*T359;%mtL`9=(BXkHyggxKL
zY;5_MTN>$h?4S!uq9i*&5l3ppR2qqUj|}>oXS}w9`$uCHR;LDhk%pPgGuOXI$S3RB
z;mGk}RGXNZ+|t7k4Nxi8{S|_QO6%S~EAm~H=_>}6S`NPEWnJ}L;v;?aniI`z)a;O@
z@C}jvOwoch^~;VhimK<co2O9>A0nqdUJ|Ro-c}a@g=A5H_M#-=((prik6+^h$lFPL
ze&i|dN~-s|dE7+<YNjMh>w;P}Jpb79Jp=_`p3_v_#80_3b5<VT67ID^$i^pkd|H3r
z`Vpe#$)+Lrd6mpKM?)+0Y*PrPs3zbef_~ixlmzZYKD|Q`oYHWAGdp$UJBG%O<~Khs
zt8b~`U)f+MwDy7wO1f=5l(+h@A)_F0@j~)_P~jC?IU#dWq13sJ)_mGG@f1dJg`OHx
ze*DN(?vHXkDB5gL(78_F=F|C*2de4R+CB}@)PH7r+y)`lg+LW2Kg)v@uk6=)uDIvQ
zhBwQ8)0qyhXgWDbf(k@5sEhh}0~Gq)p#Av$NRuB?zN%QVoHT54k-GNoNN>o0ro9R5
zprmh^RMIzW|6DUTM-z{;-H=lwyht|R9O62A)_V_xooKA0g2=|#XhV+wk$boJAEtO$
z`St(BEjOsF(fj+VfigooP|IlYr;R!0w!@DXR~NbOY132q6`#BedCRb7T>bfJ41#fh
zb01RXVM)`z@r2p@Mw#PG;R_8s|7e#}q*vh)37x8CBp>IXfX=)9%dc3thiA%bdg^xc
zM$SBo$}5;Xu}BYIa&uz+VeSZ{G_(e8F|Nw7a7IP5^Zu%>G>YG`BJT^F4t@T9x==wU
z3v~jQKD(UmzY;MjJgFcRMql=6VGKW1aDUa?AH`kRg*mh4klz^iP2l)wvMSX1pBqC%
zb`T4K)v&97zcF|kr6qZFb`cCUF7UA!`||7Qo4;hKo1fhY&n$L*D<7DPxX8>x4{q=U
z3y=zZF2r8&sIzO)P@^;2iFUk)Q3F&RHa3PTHZLATVla~E56Dko(w>gvReBpvLaP3D
zol<Lp{F17hvon8GgsV+G=`!`$FrTqh&81fD(H#?RTwS0DG6(kh=^fehO4nC=&G*)e
z){nAGZmEJwS_vByzESDIJ{bc-MQLe{=4a&fB{COKYkW)3)O=D?u~YG^_@kawLHGRR
z`?F63%i3l~aF`9|l{Zz!o8R%_c4trSXnf~u`B88({>E#*w49ML87#cZhUDGcgnf+6
zzsR_odl*pAM!rU>P)<Hn2ery;e|P$c^=8hVL)S!WH8wN&NX5#H!>ax*gH+?}n2{oY
zx`!99haGo7P`lbK%ZWO<h~7n4t0Dmw+vmXzG+~6iTMIWd`VKfJkG{yH`N%C_)0m|C
zjoRz&s~-rgT?>;`tW(RK64tO1oM5%3tB6~iTLEwjE-$#Hj0U7;|6e~}au0}I3IY-g
T@mn`Z6VXu7QZD_^GW`DlMZW%v

literal 0
HcmV?d00001

diff --git a/examples/server/public/index-yx.html b/examples/server/public/index-yx.html
new file mode 100644
index 000000000..e58f57678
--- /dev/null
+++ b/examples/server/public/index-yx.html
@@ -0,0 +1,8594 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width,initial-scale=1,viewport-fit=cover">
+    <meta http-equiv="X-UA-Compatible" content="ie=edge">
+    <meta name="description" content="HTML for llama.cpp">
+    <meta name="theme-color" content="#edeff2">
+    <meta name="apple-mobile-web-app-status-bar-style" content="default"> <!-- only default | black | black-translucent are valid; a hex colour is not -->
+    <meta name="msapplication-TileColor" content="#edeff2">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="msapplication-TileImage" content="icon.png">
+    <link rel="manifest" crossorigin="use-credentials" href="manifest.json">
+    <link rel="icon" type="image/png" href="icon.png">
+    <link rel="apple-touch-icon" href="icon.png" sizes="512x512">
+    <title>Llama.cpp</title>
+    <style>
+        .requestBody,
+        .response .markdown-body { /* message text column: never wider than the row minus 84px */
+            max-width: calc(100% - 84px);
+        }
+
+        .bottom_wrapper { /* input bar may span the full width */
+            max-width: 100%;
+        }
+
+        * { /* global reset: border-box sizing and one system font stack for every element */
+            box-sizing: border-box;
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji";
+        }
+
+        html { /* fill the viewport so body/.chat_window percentages resolve */
+            width: 100%;
+            height: 100%;
+        }
+
+        :root { /* default (light) palette; every colour below is consumed through var(--name) */
+            --background: #edeff2;
+            --chat-back: #fff;
+            --main-back: #f6f6f6;
+            --active-btn: #e0e0e0;
+            --lighter-active: #eaeaea;
+            --sel-btn: #d0d0d0;
+            --btn-color: #404040;
+            --text-color: #909090;
+            --chat-text-color: #24292f;
+            --response-color: #f7f7f8;
+            --lighter-active-color: #e8e8e8;
+            --lighter-text-color: #555;
+            --svg-color: #808080;
+            --lighter-svg-color: #c0c0c0;
+            --code-color: #f0f0f0;
+            --black-color: #000;
+            --gpt-icon-color: #0d0d0d;
+        }
+
+        [data-theme="dark"] { /* dark palette: overrides the same variables on any element carrying data-theme="dark" and its descendants */
+            --background: #1f1f1f;
+            --chat-back: #3c3c3c;
+            --main-back: #333;
+            --active-btn: #1f1f1f;
+            --lighter-active: #151515;
+            --sel-btn: #1e1e1e;
+            --btn-color: #bfbfbf;
+            --text-color: #8f8f8f;
+            --chat-text-color: #c9d1d9;
+            --response-color: #2f2f2f;
+            --lighter-active-color: #171717;
+            --lighter-text-color: #aaa;
+            --svg-color: #7f7f7f;
+            --lighter-svg-color: #3f3f3f;
+            --code-color: #101010;
+            --black-color: #fff; /* name follows the light theme; inverted to white here */
+            --gpt-icon-color: #ececec;
+        }
+
+        body { /* full-viewport flex container that centres its content on both axes */
+            background-color: var(--background);
+            width: 100%;
+            height: 100%;
+            margin: 0;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+        }
+
+        .chat_window { /* application frame: fills the viewport up to 1188x888px (see .full_window for the uncapped variant) */
+            position: absolute;
+            width: 100%;
+            max-width: 1188px;
+            height: 100%;
+            max-height: 888px;
+            border-radius: 8px;
+            background-color: var(--chat-back);
+            overflow: hidden;
+            box-shadow: 0 10px 20px rgba(0, 0, 0, 0.15);
+        }
+
+        .chat_window>.overlay { /* dimming layer over the whole frame; invisible until ".show-nav .overlay" (narrow layout) reveals it */
+            width: 100%;
+            height: 100%;
+            position: absolute;
+            left: 0;
+            top: 0;
+            background-color: rgba(0, 0, 0, .3);
+            z-index: 90; /* below .chat_window>.nav (z-index: 99) */
+            cursor: pointer;
+            visibility: hidden;
+            opacity: 0;
+            -webkit-tap-highlight-color: transparent;
+            transition: all 250ms;
+            transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
+        }
+
+        @media screen and (max-width: 1188px) and (max-height: 888px) { /* viewport no larger than .chat_window's max size: the full-window toggle is hidden */
+            #toggleFull {
+                display: none;
+            }
+        }
+
+        @media screen and (min-width: 800px) { /* wide layout: nav is an in-flow flex column beside the content */
+            .chat_window {
+                display: flex;
+            }
+
+            .mainContent {
+                width: calc(100% - 250px); /* 250px = nav width */
+            }
+
+            .chat_window>.nav { /* hidden by pulling it left by its own width; margin change is animated */
+                position: relative;
+                margin-left: -250px;
+                transition: margin-left 250ms;
+                transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
+            }
+
+            .show-nav .nav { /* revealed state */
+                margin-left: 0;
+            }
+
+            #sysDialog {
+                max-width: 600px;
+            }
+
+            .sysContent { /* switch column and detail pane side by side */
+                display: flex;
+            }
+
+            .sysSwitch {
+                flex-shrink: 0;
+                width: 160px;
+            }
+
+            .sysSwitch>div {
+                padding-left: 6px;
+            }
+
+            .sysDetail {
+                margin-left: 12px;
+                margin-top: 5px;
+            }
+        }
+
+        @media screen and (max-width: 800px) { /* narrow layout: nav slides over the content. NOTE(review): at exactly 800px this and the min-width: 800px query both match - confirm that is intended */
+            .chat_window {
+                display: block;
+            }
+
+            .mainContent {
+                width: 100%;
+            }
+
+            .chat_window>.nav { /* off-canvas drawer: positioned out of view on the left, animated via left */
+                position: absolute !important;
+                left: -250px;
+                transition: left 250ms;
+                transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
+            }
+
+            .show-nav .nav { /* drawer open, with an elevation shadow */
+                left: 0;
+                box-shadow: rgba(0, 0, 0, 0.2) 0px 8px 10px -5px, rgba(0, 0, 0, 0.14) 0px 16px 24px 2px, rgba(0, 0, 0, 0.12) 0px 6px 30px 5px;
+            }
+
+            .show-nav .overlay { /* dim the content while the drawer is open */
+                visibility: visible;
+                opacity: 1;
+            }
+
+            #sysDialog {
+                max-width: 400px;
+            }
+
+            .sysSwitch { /* switch entries laid out in a row, each taking half the width */
+                display: flex;
+            }
+
+            .sysSwitch>div {
+                width: 50%;
+                justify-content: center;
+            }
+
+            .sysDetail {
+                margin-top: 8px;
+            }
+        }
+
+        .full_window { /* removes the 1188x888px cap set on .chat_window */
+            max-width: none;
+            max-height: none;
+        }
+
+        .chat_window>.nav { /* 250px side panel; its position/offset comes from the media queries */
+            width: 250px;
+            height: 100%;
+            border-right: 1px solid var(--active-btn);
+            background-color: var(--main-back);
+            top: 0;
+            z-index: 99; /* above .overlay (z-index: 90) */
+            flex-shrink: 0;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .mainContent { /* column: top menu, message list, input bar */
+            height: 100%;
+            position: relative;
+            display: flex;
+            flex-direction: column;
+            flex: 1;
+        }
+
+        .top_menu { /* 46px header bar: 40px content + 3px padding top and bottom */
+            background-color: var(--main-back);
+            width: 100%;
+            height: 46px;
+            padding: 3px 0;
+        }
+
+        .top_menu .toggler { /* 40x40 button floated to the left of the header */
+            margin-left: 8px;
+            width: 40px;
+            height: 40px;
+            float: left;
+            padding: 5px 7px;
+            border-radius: 4px;
+            cursor: pointer;
+            -webkit-tap-highlight-color: transparent;
+        }
+
+        .top_menu .toggler:hover {
+            background: var(--active-btn);
+        }
+
+        .top_menu .toggler .button { /* one coloured 26x4 bar; the three variants below stack them 10px apart */
+            width: 26px;
+            height: 4px;
+            border-radius: 4px;
+            position: absolute;
+            pointer-events: none; /* clicks fall through to .toggler */
+        }
+
+        .top_menu .toggler .button.close { /* top bar, green */
+            margin-top: 3px;
+            background-color: #99c959;
+        }
+
+        .top_menu .toggler .button.minimize { /* middle bar, orange */
+            margin-top: 13px;
+            background-color: #f8b26a;
+        }
+
+        .top_menu .toggler .button.maximize { /* bottom bar, red */
+            margin-top: 23px;
+            background-color: #e15b64;
+        }
+
+        .top_menu .title { /* non-interactive strip; #selector below re-enables pointer events for itself */
+            color: var(--text-color);
+            height: 40px;
+            position: relative;
+            pointer-events: none;
+        }
+
+        #selector { /* clickable label centred with top/left 50% + translate(-50%, -50%); presumably a child of .top_menu .title - confirm */
+            position: absolute;
+            top: 50%;
+            left: 50%;
+            padding: 0 4px 0 10px;
+            cursor: pointer;
+            font-size: 20px;
+            line-height: 40px;
+            transform: translate(-50%, -50%);
+            pointer-events: auto;
+            border-radius: 4px;
+            -webkit-user-select: none;
+            user-select: none;
+            display: flex;
+            align-items: center;
+        }
+
+        #selector:hover {
+            background-color: var(--active-btn);
+        }
+
+        #modelVer {
+            margin-left: 4px;
+            margin-right: 2px;
+        }
+
+        .messages { /* takes the remaining column height (flex: 1) and scrolls vertically */
+            position: relative;
+            flex: 1;
+            overflow-x: hidden;
+            overflow-y: auto;
+            font-size: 16px;
+            color: var(--chat-text-color);
+            text-align: center;
+        }
+
+        .messages::-webkit-scrollbar,
+        #chatlog .markdown-body pre>code::-webkit-scrollbar,
+        #setDialog::-webkit-scrollbar,
+        .allList::-webkit-scrollbar,
+        .sysDetail::-webkit-scrollbar,
+        #apiSelect::-webkit-scrollbar { /* 10px scrollbars on all scrollable panes (WebKit-prefixed pseudo-element) */
+            width: 10px;
+            height: 10px;
+        }
+
+        .messages::-webkit-scrollbar-track,
+        #chatlog .markdown-body pre>code::-webkit-scrollbar-track,
+        #setDialog::-webkit-scrollbar-track,
+        .allList::-webkit-scrollbar-track,
+        .sysDetail::-webkit-scrollbar-track,
+        #apiSelect::-webkit-scrollbar-track { /* invisible track behind the thumb */
+            background: transparent; /* shorthand goes first: it resets background-clip, so the longhand must follow it */
+            background-clip: padding-box;
+            border: none;
+        }
+
+        .messages::-webkit-scrollbar-corner,
+        #chatlog .markdown-body pre>code::-webkit-scrollbar-corner,
+        #setDialog::-webkit-scrollbar-corner,
+        .allList::-webkit-scrollbar-corner,
+        .sysDetail::-webkit-scrollbar-corner,
+        #apiSelect::-webkit-scrollbar-corner { /* corner where the two scrollbars meet stays transparent */
+            background-color: transparent;
+        }
+
+        .messages::-webkit-scrollbar-thumb,
+        #chatlog .markdown-body pre>code::-webkit-scrollbar-thumb,
+        #setDialog::-webkit-scrollbar-thumb,
+        .allList::-webkit-scrollbar-thumb,
+        .sysDetail::-webkit-scrollbar-thumb,
+        #apiSelect::-webkit-scrollbar-thumb { /* translucent rounded thumb; the transparent border plus padding-box clip insets the visible part */
+            background-color: rgba(0, 0, 0, 0.1);
+            background-clip: padding-box;
+            border: solid transparent;
+            border-radius: 10px;
+        }
+
+        .messages::-webkit-scrollbar-thumb:hover,
+        #chatlog .markdown-body pre>code::-webkit-scrollbar-thumb:hover,
+        #setDialog::-webkit-scrollbar-thumb:hover,
+        .allList::-webkit-scrollbar-thumb:hover,
+        .sysDetail::-webkit-scrollbar-thumb:hover,
+        #apiSelect::-webkit-scrollbar-thumb:hover { /* darker thumb on hover */
+            background-color: rgba(0, 0, 0, 0.4);
+        }
+
+        #chatlog { /* restores start alignment (the parent .messages centres text) and lets long words wrap */
+            word-wrap: break-word;
+            text-align: start;
+        }
+
+        .chatAvatar { /* 30x30 avatar box that never shrinks inside the flex row */
+            margin: 14px 12px 13px 12px;
+            width: 30px;
+            height: 30px;
+            flex-shrink: 0;
+            border-radius: 2px;
+        }
+
+        .response>.chatAvatar { /* slightly different vertical margins for the avatar in response rows */
+            margin-top: 15px;
+            margin-bottom: 15px;
+        }
+
+        .gptAvatar>svg { /* icon nudged 3px in and coloured by the theme */
+            margin-top: 3px;
+            margin-left: 3px;
+            fill: var(--gpt-icon-color);
+        }
+
+        .chatAvatar>img { /* image fills the avatar box */
+            display: block;
+            width: 100%;
+            height: 100%;
+            border-radius: 2px;
+        }
+
+        .modelAvatar>svg,
+        .chatAvatar>svg { /* block display removes the inline baseline gap under the svg */
+            display: block;
+        }
+
+        #chatlog .request { /* request row: flex row holding avatar and .requestBody */
+            position: relative;
+            display: flex;
+            justify-content: center;
+        }
+
+        .requestBody { /* request text keeps its own line breaks and spacing (pre-wrap) */
+            white-space: pre-wrap;
+            margin: 18px 0;
+            flex: 1;
+        }
+
+        #chatlog .response { /* response row: same layout as .request, on a tinted background */
+            background: var(--response-color);
+            position: relative;
+            display: flex;
+            justify-content: center;
+        }
+
+        .response .markdown-body { /* !important forces the row tint over whatever background .markdown-body otherwise gets */
+            margin: 18px 0;
+            flex: 1;
+            background: var(--response-color) !important;
+        }
+
+        #chatlog .markdown-body a {
+            padding: 0 1px 0 2px;
+        }
+
+        #chatlog .markdown-body pre { /* code block; position: relative anchors the absolutely positioned .m-mdic-copy-wrapper */
+            padding: 10px 10px 5px 10px;
+            position: relative;
+            background: var(--code-color);
+        }
+
+        #chatlog .markdown-body .mermaid { /* diagram container styled like a code block */
+            margin-bottom: 8px;
+            padding: 0 10px 0 10px;
+            position: relative;
+            background: var(--code-color);
+            border-radius: 6px;
+        }
+
+        .mermaid>svg { /* rendered diagram, capped at 600px tall; pointer cursor marks it as clickable */
+            cursor: pointer;
+            max-height: 600px;
+            display: block;
+        }
+
+        #chatlog .markdown-body pre>code { /* long code lines scroll horizontally instead of wrapping */
+            overflow-x: auto;
+            display: block;
+        }
+
+        #chatlog .markdown-body ul {
+            list-style-type: disc;
+        }
+
+        .m-mdic-copy-wrapper { /* copy controls pinned to the top-right corner of the positioned ancestor */
+            position: absolute;
+            top: 5px;
+            right: 16px;
+            z-index: 1;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+
+        .m-mdic-copy-wrapper span.u-mdic-copy-code_lang { /* language label, placed 4px to the left of the wrapper */
+            position: absolute;
+            top: 3px;
+            right: calc(100% + 4px);
+            font-family: system-ui;
+            font-size: 12px;
+            line-height: 18px;
+            color: #bbb;
+        }
+
+        .m-mdic-copy-wrapper div.u-mdic-copy-notify {
+            /* Status label placed directly left of the wrapper: right: 100% moves it past the wrapper's left edge. */
+            position: absolute;
+            top: 0;
+            right: 100%;
+            /* Same 3px/6px box as the copy button, but with a 4px gap on the right. */
+            padding: 3px 4px 3px 6px;
+            border: 0;
+            border-radius: 3px;
+            background: none;
+            font-family: system-ui;
+            font-size: 12px;
+            line-height: 18px;
+            color: var(--lighter-text-color);
+            outline: none;
+        }
+
+        .m-mdic-copy-wrapper button.u-mdic-copy-btn { /* borderless text button; only its colour animates */
+            position: relative;
+            top: 0;
+            right: 0;
+            padding: 3px 6px;
+            border: 0;
+            border-radius: 3px;
+            background: none;
+            font-family: system-ui;
+            font-size: 12px;
+            line-height: 18px;
+            color: #bbb;
+            outline: none;
+            cursor: pointer;
+            transition: color 250ms;
+        }
+
+        .m-mdic-copy-wrapper span.u-mdic-copy-code_lang::before,
+        .m-mdic-copy-wrapper div.u-mdic-copy-notify::before,
+        .m-mdic-copy-wrapper button.u-mdic-copy-btn::before { /* visible text is taken from each element's "text" attribute */
+            content: attr(text);
+        }
+
+        .m-mdic-copy-wrapper button.u-mdic-copy-btn:hover {
+            color: var(--lighter-text-color);
+        }
+
+        #stopChat { /* hidden by default; the flex alignment below only applies once display is switched to flex (presumably by script - confirm) */
+            display: none;
+            margin: 0 auto;
+            margin-top: 3px;
+            width: 80px;
+            height: 32px;
+            text-align: center;
+            line-height: 32px;
+            color: white;
+            background: #f8b26a;
+            cursor: pointer;
+            border-radius: 3px;
+            position: sticky; /* stays 2px above the bottom edge of its scroll container */
+            bottom: 2px;
+            justify-content: center;
+            align-items: center;
+        }
+
+        #stopChat>svg {
+            margin-right: 8px;
+        }
+
+        #stopChat:hover {
+            background: #f0aa60;
+        }
+
+        .bottom_wrapper { /* input bar; position: relative anchors #sendbutton and .clearConv */
+            position: relative;
+            width: 100%;
+            padding: 10px 10px;
+            margin: 0 auto;
+        }
+
+        .bottom_wrapper .message_input_wrapper { /* leaves 139px on the right free for the two absolutely positioned buttons */
+            border: none;
+            width: calc(100% - 139px);
+            position: relative;
+            text-align: left;
+        }
+
+        .bottom_wrapper .message_input_wrapper .message_input_text { /* message field: 47px tall initially, capped at 200px */
+            border-radius: 4px;
+            border: none;
+            outline: none;
+            resize: none;
+            background-color: var(--main-back);
+            color: var(--chat-text-color);
+            height: 47px;
+            font-size: 16px;
+            max-height: 200px;
+            padding: 13px 0 13px 13px;
+            width: 100%;
+            display: block;
+            transition: background-color 250ms;
+        }
+
+        .bottom_wrapper .message_input_wrapper .message_input_text:focus {
+            background-color: var(--code-color);
+        }
+
+        .bottom_wrapper .message_input_wrapper .message_input_text::-webkit-scrollbar { /* no visible scrollbar inside the message field (WebKit) */
+            display: none;
+            width: 0;
+            height: 0;
+        }
+
+        #sendbutton {
+            width: 80px;
+            height: 47px;
+            font-size: 18px;
+            font-weight: bold;
+            border-radius: 3px;
+            background-color: #b8da8b;
+            border: none;
+            padding: 0;
+            color: #fff;
+            /* Disabled look by default; .activeSendBtn switches the cursor to pointer. */
+            cursor: not-allowed;
+            transition: all 250ms;
+            text-align: center;
+            /* No float here: float always computes to none on an absolutely positioned box. */
+            position: absolute;
+            right: 63px;
+            bottom: 10px;
+        }
+
+        .activeSendBtn { /* enabled state of the send button; !important beats the #sendbutton id rule */
+            background-color: #99c959 !important;
+            cursor: pointer !important;
+        }
+
+        .activeSendBtn:hover {
+            background-color: #90c050 !important;
+        }
+
+        .clearConv { /* 47x47 square button in the bottom-right corner of the input bar */
+            position: absolute;
+            right: 10px;
+            bottom: 10px;
+            width: 47px;
+            height: 47px;
+            border-radius: 3px;
+            background: var(--text-color);
+            border: none;
+            color: #fff;
+            cursor: pointer;
+        }
+
+        .clearConv>svg {
+            margin: 0 auto;
+        }
+
+        .clearConv:hover {
+            background: var(--svg-color);
+        }
+
+        .clearConv svg:first-child { /* default state shows the second icon only */
+            display: none;
+        }
+
+        .clearConv svg:nth-child(2) {
+            display: block;
+        }
+
+        .closeConv { /* alternate state: different background and the icons below swapped */
+            background: var(--active-btn);
+        }
+
+        .closeConv:hover {
+            background: var(--lighter-active-color);
+        }
+
+        .closeConv svg:first-child { /* alternate state shows the first icon only */
+            display: block;
+        }
+
+        .closeConv svg:nth-child(2) {
+            display: none;
+        }
+
+        .loaded>span { /* loaded: show the text label, hide the svg */
+            display: inline-block;
+        }
+
+        .loaded>svg {
+            display: none;
+        }
+
+        .loading { /* loading: muted background, text label replaced by the svg */
+            background: var(--active-btn) !important;
+        }
+
+        .loading>span {
+            display: none;
+        }
+
+        .loading>svg {
+            display: block;
+        }
+
+        .switch-slide { /* wrapper of a checkbox-driven toggle switch */
+            display: inline-block;
+            vertical-align: middle;
+        }
+
+        .switch-slide-label { /* 38x18 pill-shaped track */
+            display: block;
+            width: 38px;
+            height: 18px;
+            background: var(--text-color);
+            border-radius: 30px;
+            cursor: pointer;
+            position: relative;
+            -webkit-transition: all 250ms;
+            transition: all 250ms;
+        }
+
+        .switch-slide-label:after { /* 16px round knob, resting 1px from the left edge */
+            content: "";
+            display: block;
+            width: 16px;
+            height: 16px;
+            border-radius: 100%;
+            background: #fff;
+            box-shadow: 0 1px 1px rgba(0, 0, 0, .1);
+            position: absolute;
+            left: 1px;
+            top: 1px;
+            -webkit-transform: translateZ(0);
+            transform: translateZ(0);
+            -webkit-transition: all 250ms;
+            transition: all 250ms;
+        }
+
+        .switch-slide input:checked+label { /* checked: green track */
+            background: #99c959;
+            -webkit-transition: all 250ms;
+            transition: all 250ms;
+        }
+
+        .switch-slide input:checked+label:after { /* checked: knob at the right end (38 - 16 - 1 = 21px) */
+            left: 21px;
+        }
+
+        .settings { /* row of icon buttons pinned to the top-right */
+            margin-right: 10px;
+            display: flex;
+            position: absolute;
+            right: 0;
+            top: 3px;
+        }
+
+        .setBtn { /* transparent icon button */
+            margin-left: 2px;
+            cursor: pointer;
+            padding: 5px;
+            border: none;
+            background-color: transparent;
+            border-radius: 4px;
+        }
+
+        .setBtn>svg {
+            display: block;
+            color: var(--text-color);
+        }
+
+        .setBtn:hover {
+            background: var(--active-btn);
+        }
+
+        #setting { /* NOTE(review): right only takes effect on a positioned element; no position is set for this id in the rules visible here - confirm */
+            right: 15px;
+        }
+
+        #toggleFull { /* NOTE(review): same caveat as #setting - confirm the element is positioned */
+            right: 56px;
+        }
+
+        #toggleLight *,
+        #toggleFull * { /* children ignore pointer events so the button itself is the event target */
+            pointer-events: none;
+        }
+
+        .showSetting { /* highlight applied while the settings panel is shown (per class name) */
+            background: var(--lighter-svg-color) !important;
+        }
+
+        .showModels { /* highlight applied while the model list is shown (per class name) */
+            background: var(--active-btn) !important;
+        }
+
+        #modelDialog { /* 240px drop-down, horizontally centred, starting 46px from the top (the .top_menu height) */
+            color: var(--btn-color);
+            position: absolute;
+            z-index: 2;
+            background: var(--main-back);
+            width: 240px;
+            left: 50%;
+            transform: translateX(-50%);
+            top: 46px;
+            overflow-y: auto;
+            max-height: calc(100% - 55px);
+            -webkit-user-select: none;
+            user-select: none;
+            border-radius: 4px;
+            padding: 8px 5px 8px 5px;
+            box-shadow: 0 0 6px rgba(0, 0, 0, 0.15);
+        }
+
+        .modelSingle { /* one 40px row of the model list */
+            height: 40px;
+            border-radius: 3px;
+            cursor: pointer;
+            line-height: 40px;
+            display: flex;
+            align-items: center;
+            position: relative;
+        }
+
+        .modelSingle * { /* children ignore pointer events so the row itself is the event target */
+            pointer-events: none;
+        }
+
+        .modelSingle:hover {
+            background-color: var(--active-btn);
+        }
+
+        .activeModel::before { /* green check mark: a box with only right and bottom borders, rotated 45 degrees */
+            content: "";
+            position: absolute;
+            right: 15px;
+            top: 7px;
+            width: 8px;
+            height: 16px;
+            border-color: #99c959;
+            border-style: solid;
+            border-width: 0 4px 4px 0;
+            transform: rotate(45deg);
+        }
+
+        .modelAvatar { /* 30x30 icon box at the start of a model row */
+            margin-left: 6px;
+            margin-right: 10px;
+            width: 30px;
+            flex-shrink: 0;
+            border-radius: 2px;
+            height: 30px;
+        }
+
+        #setDialog { /* settings panel: absolutely positioned, 320px wide, scrolls vertically beyond max-height */
+            color: var(--btn-color);
+            position: absolute;
+            z-index: 2;
+            background: var(--main-back);
+            width: 320px;
+            right: 6px;
+            top: 46px;
+            overflow-y: auto;
+            max-height: calc(100% - 55px);
+            -webkit-user-select: none;
+            user-select: none;
+            border-radius: 5px;
+            padding: 8px 12px 8px 12px;
+            box-shadow: 0 0 6px rgba(0, 0, 0, 0.15);
+        }
+        /* every input inside the panel spans the available width */
+        #setDialog input {
+            width: 100%;
+        }
+        /* fixed-width inline label */
+        #setDialog .inlineTitle {
+            display: inline-block;
+            width: 88px;
+            line-height: 16px;
+            vertical-align: middle;
+        }
+        /* spacing below each option group */
+        #convOption,
+        #speechOption,
+        #speechDetail,
+        #recOption {
+            margin-bottom: 6px;
+        }
+        /* NOTE(review): #recOption uses a descendant selector while the other three use the child combinator - confirm this is intended */
+        #convOption>div,
+        #speechOption>div,
+        #speechDetail>div,
+        #recOption div {
+            margin-top: 7px;
+        }
+        /* selects use the chat background and text colours */
+        #voiceRecSetting select,
+        #speechDetail select {
+            background: var(--chat-back);
+            color: var(--chat-text-color);
+        }
+        /* borderless, outline-free text input in chat colours */
+        .inputTextClass {
+            outline: none;
+            border-radius: 2px;
+            margin-top: 3px;
+            height: 32px;
+            font-size: 15px;
+            padding-left: 6px;
+            background: var(--chat-back);
+            color: var(--chat-text-color);
+            border: none;
+        }
+        /* full-width, 80px-high block that cannot be resized by the user */
+        .areaTextClass {
+            width: 100%;
+            height: 80px;
+            display: block;
+            resize: none;
+            padding: 6px;
+        }
+
+        input[type="range"] { /* custom slider: native appearance removed, track painted with backgrounds */
+            -webkit-appearance: none;
+            appearance: none;
+            display: block;
+            margin: 4px 0 3px 0;
+            height: 8px;
+            background: var(--text-color); /* shorthand resets image/size/repeat, so it must stay before the three longhands below */
+            border-radius: 5px;
+            background-image: linear-gradient(#99c959, #99c959); /* solid green layer over the base colour */
+            background-size: 100% 100%; /* presumably overridden from script to show the current value - TODO confirm */
+            background-repeat: no-repeat;
+        }
+        /* round 15px green thumb; only the -webkit- pseudo-element is styled in this section */
+        input[type="range"]::-webkit-slider-thumb {
+            -webkit-appearance: none;
+            height: 15px;
+            width: 15px;
+            border-radius: 50%;
+            background: #99c959;
+            cursor: ew-resize;
+        }
+        /* transparent track so the input's own background shows through */
+        input[type=range]::-webkit-slider-runnable-track {
+            -webkit-appearance: none;
+            box-shadow: none;
+            border: none;
+            background: transparent;
+        }
+
+        .justSetLine { /* flex row that spreads its children to both ends */
+            display: flex;
+            justify-content: space-between;
+        }
+        /* 32px-high button with centred content */
+        .justSetBtn {
+            height: 32px;
+            border-radius: 3px;
+            line-height: 32px;
+            background: var(--lighter-active, var(--lighter-active-color)); /* fallback keeps a background if --lighter-active is not defined; unchanged when it is */
+            text-align: center;
+            padding: 0px 8px;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            cursor: pointer;
+        }
+        /* hover state */
+        .justSetBtn:hover {
+            background-color: var(--sel-btn);
+        }
+        /* gap between a leading icon and what follows it */
+        .justSetBtn>svg {
+            margin-right: 3px;
+        }
+
+        .readyTestVoice>div:not(:first-child) { /* "ready" state: only the first child div stays visible */
+            display: none;
+        }
+        /* "pause" state: child divs 1 and 3 are hidden */
+        .pauseTestVoice>div:nth-child(1),
+        .pauseTestVoice>div:nth-child(3) {
+            display: none;
+        }
+        /* "resume" state: child divs 1 and 2 are hidden */
+        .resumeTestVoice>div:nth-child(1),
+        .resumeTestVoice>div:nth-child(2) {
+            display: none;
+        }
+        /* preset selector: child divs sit side by side */
+        .presetSelect>div {
+            display: inline-block;
+        }
+        /* 128px select in chat colours */
+        .presetSelect select {
+            outline: none;
+            border-radius: 3px;
+            width: 128px;
+            border-color: rgba(0, 0, 0, .3);
+            background: var(--chat-back);
+            color: var(--chat-text-color);
+        }
+        /* small-print row with its children pushed to both ends */
+        .selectDef {
+            display: flex;
+            justify-content: space-between;
+            font-size: 13px;
+            color: var(--text-color);
+        }
+        /* full-width 30px-high control */
+        #preSetSpeech {
+            width: 100%;
+            outline: none;
+            height: 30px;
+            font-size: 14px;
+            margin-top: 5px;
+            border-radius: 3px;
+            border-color: rgba(0, 0, 0, .3);
+        }
+
+        .mdOption { /* 30px column that itself ignores pointer events; its child divs re-enable them below */
+            flex-shrink: 0;
+            position: relative;
+            width: 30px;
+            pointer-events: none;
+        }
+        /* direct child divs are interactive again */
+        .mdOption>div {
+            pointer-events: auto;
+            cursor: pointer;
+        }
+        /* elements inside an svg never become the pointer-event target */
+        .mdOption svg * {
+            pointer-events: none;
+        }
+        /* .refreshReq shows only the first svg, .halfRefReq only the second */
+        .refreshReq svg:not(:first-child) {
+            display: none;
+        }
+
+        .halfRefReq svg:not(:nth-child(2)) {
+            display: none;
+        }
+        /* action strip: hidden by default, hangs 12px below the bottom edge of its positioned ancestor */
+        .optionItems {
+            position: absolute;
+            bottom: -12px;
+            display: flex;
+            justify-content: space-between;
+            visibility: hidden;
+            z-index: 1;
+            color: var(--svg-color);
+        }
+        /* stays visible while the strip itself is hovered */
+        .optionItems:hover {
+            visibility: visible;
+        }
+        /* hovering a .request or .response reveals its action strip and voice control */
+        .request:hover .optionItems,
+        .request:hover .voiceCls,
+        .response:hover .optionItems,
+        .response:hover .voiceCls {
+            visibility: visible;
+        }
+        /* single 32x24 pill button; display is forced to flex with !important */
+        .optionItem {
+            border-radius: 9px;
+            height: 24px;
+            width: 32px;
+            border: 1px solid var(--active-btn);
+            background-color: var(--response-color);
+            display: flex !important;
+            justify-content: center;
+            align-items: center;
+        }
+
+        .optionItem * {
+            pointer-events: none;
+        }
+
+        .optionItem:hover {
+            background: var(--active-btn);
+        }
+
+        .voiceCls { /* voice control: hidden by default, vertically centres its content */
+            position: relative;
+            height: 100%;
+            visibility: hidden;
+            display: flex;
+            align-items: center;
+        }
+        /* icon uses the lighter colour until the control is hovered */
+        .voiceCls>svg {
+            color: var(--lighter-svg-color);
+            display: block;
+            margin-left: 5px;
+            position: relative;
+        }
+
+        .voiceCls:hover>svg {
+            color: var(--svg-color);
+        }
+        /* state class: highlighted background, voice control forced visible */
+        .showVoiceCls,
+        .showVoiceCls .markdown-body {
+            background: var(--active-btn) !important;
+        }
+
+        .showVoiceCls .voiceCls {
+            visibility: visible !important;
+        }
+        /* state class: element sticks to the top and bottom edge of its scroll container */
+        .showEditReq {
+            position: sticky !important;
+            top: 0;
+            bottom: 0;
+            z-index: 1;
+        }
+
+        .showEditReq,
+        .showEditReq .markdown-body {
+            background: var(--active-btn) !important;
+        }
+        /* playback state classes: each one leaves exactly one svg visible (1st, 2nd or 3rd) */
+        .readyVoice svg:not(:first-child) {
+            display: none;
+        }
+
+        .pauseVoice svg:not(:nth-child(2)) {
+            display: none;
+        }
+
+        .resumeVoice svg:not(:nth-child(3)) {
+            display: none;
+        }
+        /* selectable chips directly inside #voiceTypes */
+        #voiceTypes>span {
+            border-radius: 3px;
+            margin-left: 4px;
+            height: 28px;
+            line-height: 26px;
+            display: inline-block;
+            cursor: pointer;
+            padding: 1px 5px;
+        }
+
+        #voiceTypes>span:hover {
+            background: var(--active-btn);
+        }
+        /* selected chip; !important beats the id-based hover rule above */
+        .selVoiceType {
+            background: var(--sel-btn) !important;
+        }
+
+        .navHeader { /* header row with its children pushed to both ends */
+            width: 100%;
+            padding: 5px 10px;
+            display: flex;
+            justify-content: space-between;
+        }
+        /* flex: 1 sets flex-basis to 0%, so in a row flex container the button grows into the free space and width: 80% is not the basis */
+        #newChat {
+            text-align: center;
+            width: 80%;
+            height: 40px;
+            border-radius: 3px;
+            background: var(--lighter-active-color);
+            color: var(--btn-color);
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            -webkit-user-select: none;
+            user-select: none;
+            cursor: pointer;
+            flex: 1;
+        }
+        /* !important is needed because the id rules for #newChat / #newFolder out-rank this class selector */
+        .navHeader>div:hover {
+            background: var(--active-btn) !important;
+        }
+
+        #newChat>svg {
+            margin-right: 2px;
+        }
+        /* square 40x40 button */
+        #newFolder {
+            height: 40px;
+            width: 40px;
+            margin-left: 10px;
+            border-radius: 3px;
+            color: var(--btn-color);
+            cursor: pointer;
+            position: relative;
+            background: var(--lighter-active-color);
+            -webkit-user-select: none;
+            user-select: none;
+        }
+        /* icon centred horizontally with auto margins */
+        #newFolder>svg {
+            display: block;
+            margin: 8px auto;
+        }
+
+        .extraChat { /* positioned container for the search field and its two overlay buttons */
+            padding: 2px 10px 6px 10px;
+            position: relative;
+        }
+        /* right padding of 45px leaves room for the absolutely positioned buttons below */
+        #searchChat {
+            width: 100%;
+            height: 36px;
+            padding-left: 10px;
+            padding-right: 45px;
+            font-size: 16px;
+            outline: none;
+            border: none;
+            color: var(--chat-text-color);
+            background: var(--lighter-active-color);
+            border-radius: 3px;
+            -webkit-user-select: none; /* NOTE(review): if this is a text input, user-select: none may hinder selection or editing in some WebKit versions - confirm */
+            user-select: none;
+        }
+
+        #searchChat:focus {
+            background: var(--active-btn)
+        }
+        /* the clear button is hidden while the field shows its placeholder; relies on #clearSearch being the next sibling */
+        #searchChat:placeholder-shown+#clearSearch {
+            display: none;
+        }
+
+        #clearSearch {
+            position: absolute;
+            right: 34px;
+            top: 8px;
+            cursor: pointer;
+            color: var(--btn-color);
+        }
+
+        #clearSearch:hover {
+            color: var(--black-color);
+        }
+
+        #clearSearch>svg {
+            display: block;
+        }
+        /* state class; !important beats the id-based backgrounds in this section */
+        .seledSearch {
+            background: var(--lighter-svg-color) !important;
+        }
+        /* second overlay button, placed to the right of #clearSearch */
+        #matchCaseSearch {
+            position: absolute;
+            right: 12px;
+            top: 8px;
+            cursor: pointer;
+            border-radius: 3px;
+            color: var(--btn-color);
+        }
+
+        #matchCaseSearch:hover {
+            background: var(--sel-btn);
+        }
+
+        #matchCaseSearch>svg {
+            display: block;
+        }
+
+        .navFooter { /* footer area */
+            padding-bottom: 8px;
+        }
+        /* 1px horizontal rule */
+        .navFooter .divider {
+            width: 100%;
+            border-top: 1px solid var(--active-btn);
+            margin: 4px 0;
+        }
+        /* evenly spaced row of round buttons */
+        .navFunc {
+            padding-top: 5px;
+            display: flex;
+            justify-content: space-around;
+        }
+
+        .navFunc svg {
+            display: block;
+        }
+        /* both div and label children are styled as buttons */
+        .navFunc>div,
+        .navFunc>label {
+            border-radius: 20px;
+            text-align: center;
+            padding: 8px 8px;
+            color: var(--btn-color);
+            font-size: 14px;
+            cursor: pointer;
+        }
+
+        .navFunc>div:hover,
+        .navFunc>label:hover {
+            background: var(--active-btn);
+        }
+
+        .navFooter .links {
+            text-align: center;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+
+        .navFooter .links a {
+            color: var(--btn-color);
+            text-decoration: none;
+        }
+        /* !important keeps the hover colour on visited links: the later :visited rule has equal specificity */
+        .navFooter .links a:hover {
+            color: var(--chat-text-color) !important;
+        }
+
+        .navFooter .links a:visited {
+            color: var(--btn-color);
+        }
+
+        .allList { /* list area: takes the remaining flex space and scrolls vertically */
+            width: 100%;
+            position: relative;
+            flex: 1;
+            overflow-y: auto;
+        }
+
+        #chatList {
+            min-height: 50px;
+        }
+        /* drag feedback classes (presumably toggled from script during drag and drop) */
+        .dragingLi {
+            filter: brightness(90%);
+        }
+
+        .dragingChat {
+            background: var(--lighter-active-color);
+        }
+        /* expanded folder: arrow rotated 90 degrees, chat list shown */
+        .expandFolder>.headLi>svg {
+            transform: rotate(90deg);
+        }
+        /* two classes out-rank the single-class display: none below, so source order does not matter here */
+        .expandFolder>.chatsInFolder {
+            display: block;
+        }
+        /* collapsed by default; indented with a 1px guide line on the left */
+        .chatsInFolder {
+            display: none;
+            margin-left: 22px;
+            padding-left: 2px;
+            border-left: 1px solid var(--text-color);
+        }
+
+        .headLi,
+        .chatLi { /* 50px-high row shared by folder headers and chat entries; position: relative anchors the absolute children */
+            cursor: pointer;
+            width: 100%;
+            height: 50px;
+            color: var(--text-color);
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            position: relative;
+        }
+        /* descendants ignore pointer events by default; exceptions are re-enabled further down */
+        .headLi *,
+        .chatLi * {
+            pointer-events: none;
+        }
+        /* leading icon */
+        .headLi>svg,
+        .chatLi>svg {
+            margin-left: 10px;
+            color: var(--btn-color);
+            pointer-events: none;
+        }
+        /* option icons are interactive again (class + type out-ranks the universal rule above) */
+        .folderOption svg,
+        .chatOption svg {
+            pointer-events: auto;
+        }
+        /* elements inside any svg stay non-interactive, so the svg itself is the event target */
+        .headLi svg *,
+        .chatLi svg * {
+            pointer-events: none;
+        }
+        /* text blocks start 40px from the left; max-width leaves room on the right */
+        .headLi .folderInfo {
+            position: absolute;
+            left: 40px;
+            height: 40px;
+            max-width: calc(100% - 115px);
+        }
+
+        .chatLi .chatInfo {
+            position: absolute;
+            left: 40px;
+            height: 40px;
+            max-width: calc(100% - 90px);
+        }
+
+        .folderInfo *,
+        .chatInfo * {
+            -webkit-user-select: none;
+            user-select: none;
+        }
+        /* spans inside chat info get an orange background (presumably search-match highlighting - TODO confirm) */
+        .chatInfo span {
+            background: #f8b26a;
+        }
+        /* title line: single line, truncated with an ellipsis */
+        .headLi .folderName,
+        .chatLi .chatName {
+            color: var(--btn-color);
+            text-overflow: ellipsis;
+            white-space: nowrap;
+            overflow: hidden;
+            line-height: 20px;
+            height: 20px;
+        }
+        /* secondary line: same truncation at 12px */
+        .headLi .folderNum,
+        .chatLi .chatPre {
+            color: var(--btn-color);
+            text-overflow: ellipsis;
+            white-space: nowrap;
+            overflow: hidden;
+            font-size: 12px;
+            line-height: 20px;
+            height: 20px;
+        }
+        /* option icons are hidden by default; later rules in this stylesheet reveal them on hover or when active */
+        .headLi .folderOption,
+        .chatLi .chatOption {
+            visibility: hidden;
+            display: flex;
+            color: #777;
+            margin-right: 2px;
+        }
+        /* chats nested in a folder use 5px smaller left offsets */
+        .folderLi .chatLi>svg {
+            margin-left: 5px;
+        }
+
+        .folderLi .chatLi .chatInfo {
+            left: 35px;
+            max-width: calc(100% - 85px);
+        }
+
+        .folderLi .chatLi #activeChatEdit {
+            left: 32px;
+            width: calc(100% - 60px)
+        }
+
+        .folderLi:hover { /* fallback keeps a hover background if --lighter-active is not defined; unchanged when it is */
+            background: var(--lighter-active, var(--lighter-active-color));
+        }
+        /* hover highlight for a chat row */
+        .chatLi:hover {
+            background: var(--active-btn);
+        }
+        /* hovering a row reveals its option icons */
+        .headLi:hover .folderOption,
+        .chatLi:hover .chatOption {
+            visibility: visible !important;
+        }
+        /* active row; !important wins over the hover backgrounds above */
+        .activeFolder,
+        .activeChatLi {
+            background: var(--sel-btn) !important;
+        }
+        /* the active chat always shows its option icons */
+        .activeChatLi .chatOption {
+            visibility: visible !important;
+        }
+        /* darker icon colour while an option icon is hovered */
+        .folderOption>svg:hover,
+        .chatOption>svg:hover {
+            color: #444;
+        }
+
+        #activeChatEdit { /* inline rename field laid over the row; pointer-events: auto re-enables it inside rows whose descendants are disabled */
+            position: absolute;
+            left: 37px;
+            font-size: 16px;
+            border-radius: 2px;
+            color: var(--chat-text-color);
+            background: var(--chat-back);
+            outline: none;
+            border: none;
+            pointer-events: auto;
+            height: 24px; /* NOTE(review): combined with 20px vertical padding the rendered height depends on box-sizing, which is set outside this section */
+            line-height: 24px;
+            width: calc(100% - 65px);
+            padding: 20px 3px;
+            z-index: 1;
+        }
+
+        #loadMask { /* full-viewport overlay at z-index 100 */
+            position: fixed;
+            top: 0;
+            left: 0;
+            right: 0;
+            bottom: 0;
+            z-index: 100;
+            background-color: var(--background);
+        }
+        /* content centred with the 50% / translate(-50%, -50%) technique */
+        #loadMask>div {
+            position: absolute;
+            left: 50%;
+            top: 50%;
+            transform: translate(-50%, -50%);
+            text-align: center;
+        }
+        /* dot pulse: scale 0 -> 1 -> 0 */
+        @keyframes loading {
+
+            0%,
+            100% {
+                transform: scale(0);
+            }
+
+            50% {
+                transform: scale(1);
+            }
+        }
+        /* row of evenly spaced dots */
+        .loadingCSSIcon {
+            position: relative;
+            display: flex;
+            align-items: center;
+            justify-content: space-evenly;
+        }
+        /* height 0 + padding-bottom 15% makes each dot as tall as it is wide (percent padding resolves against the container width) */
+        .loadingCSSIcon div {
+            width: 15%;
+            height: 0;
+            padding-bottom: 15%;
+            border-radius: 50%;
+            animation: loading 1s cubic-bezier(0.3, 0, 0.7, 1) infinite;
+        }
+        /* the three dots start 0.2s apart */
+        .loadingCSSIcon div:nth-child(1) {
+            background: #e15b64;
+            animation-delay: -0.4s
+        }
+
+        .loadingCSSIcon div:nth-child(2) {
+            background: #f8b26a;
+            animation-delay: -0.2s
+        }
+
+        .loadingCSSIcon div:nth-child(3) {
+            background: #99c959;
+            animation-delay: 0s;
+        }
+        /* first child of the centred box: 40px text */
+        #loadMask>div>:first-child {
+            font-size: 40px;
+            color: var(--text-color);
+        }
+        /* last child: 140x70 box, centred horizontally */
+        #loadMask>div>:last-child {
+            width: 140px;
+            height: 70px;
+            margin: 0 auto;
+        }
+
+        #voiceRec { /* 47px-wide column pinned to the top-right of its positioned ancestor */
+            position: absolute;
+            right: 0;
+            top: 0;
+            width: 47px;
+            height: 100%;
+        }
+        /* reserves the same 47px on the right of an element carrying this class */
+        .message_if_voice {
+            padding-right: 47px !important;
+        }
+
+        #voiceRecIcon {
+            width: 100%;
+            height: 100%;
+            text-align: center;
+            cursor: pointer;
+            position: relative;
+        }
+        /* the :hover rule out-ranks the base colour below through its extra pseudo-class */
+        #voiceRecIcon:hover>svg {
+            color: var(--svg-color);
+        }
+        /* 25px icon centred with top/left 50% and negative margins */
+        #voiceRecIcon>svg {
+            width: 25px;
+            height: 25px;
+            color: #b0b0b0;
+            position: absolute;
+            top: 50%;
+            left: 50%;
+            margin-top: -12px;
+            margin-left: -13px;
+        }
+        /* the animated part of the icon is hidden unless a state class below applies */
+        #voiceRecIcon>svg .animVoice {
+            display: none;
+        }
+        /* .voiceRecing state: green icon with a looping level animation */
+        .voiceRecing>svg {
+            color: #99c959 !important;
+        }
+
+        .voiceRecing .animVoice {
+            display: inline !important;
+            transform-origin: 0 64%;
+            animation-duration: 1.5s;
+            animation-name: scaleVoice;
+            animation-timing-function: ease;
+            animation-iteration-count: infinite;
+        }
+        /* .voiceLong state: one 0.3s grow-in */
+        .voiceLong .animVoice {
+            display: inline !important;
+            transform-origin: 0 64%;
+            animation-duration: 0.3s;
+            animation-name: longVoice;
+            animation-timing-function: ease-in-out;
+            animation-iteration-count: 1;
+        }
+
+        @keyframes longVoice {
+            0% {
+                transform: scaleY(0);
+            }
+
+            100% {
+                transform: scaleY(1);
+            }
+        }
+        /* irregular vertical scale between 0.28 and 0.80 */
+        @keyframes scaleVoice {
+            0% {
+                transform: scaleY(0.28);
+            }
+
+            20% {
+                transform: scaleY(0.60);
+            }
+
+            28% {
+                transform: scaleY(0.28);
+            }
+
+            36% {
+                transform: scaleY(0.45);
+            }
+
+            44% {
+                transform: scaleY(0.28);
+            }
+
+            52% {
+                transform: scaleY(0.45);
+            }
+
+            62% {
+                transform: scaleY(0.80);
+            }
+
+            72% {
+                transform: scaleY(0.80);
+            }
+
+            90% {
+                transform: scaleY(0.28);
+            }
+
+            100% {
+                transform: scaleY(0.28);
+            }
+        }
+
+        #voiceRecSetting { /* popup hidden by default; positioned 70px above and 26px left of its positioned ancestor's origin */
+            display: none;
+            position: absolute;
+            top: -70px;
+            left: -26px;
+            z-index: 1;
+            padding: 4px 4px;
+            -webkit-user-select: none;
+            user-select: none;
+            border-radius: 3px;
+            background-color: var(--main-back);
+            box-shadow: 0 0 6px rgba(0, 0, 0, 0.15);
+        }
+        /* size and border of the selects inside the popup */
+        #voiceRecSetting select {
+            width: 102px;
+            outline: none;
+            height: 28px;
+            border-radius: 3px;
+            border-color: rgba(0, 0, 0, .3);
+        }
+        /* preset-model row: label spacing and select sizing */
+        .presetModelCls label {
+            margin-right: 8px;
+        }
+
+        .presetModelCls select {
+            height: 30px;
+            margin-top: 2px;
+            font-size: 15px;
+        }
+
+        .modelSwitch,
+        .setSwitch { /* segmented switch laid out as a flex row */
+            display: flex;
+        }
+
+        .modelSwitch {
+            margin-bottom: 10px;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+        /* each segment is one third of the row width */
+        .modelSwitch>div,
+        .setSwitch>div {
+            border-radius: 3px;
+            width: calc(100% / 3);
+            height: 32px;
+            line-height: 32px;
+            text-align: center;
+            cursor: pointer;
+        }
+
+        .modelSwitch>div:hover,
+        .setSwitch>div:hover {
+            background-color: var(--active-btn);
+        }
+        /* selected segment; !important beats the more specific hover rule above */
+        .activeSwitch {
+            background-color: var(--sel-btn) !important;
+        }
+        /* 32px-high button with horizontally centred content */
+        .checkLoad {
+            height: 32px;
+            border-radius: 3px;
+            line-height: 32px;
+            background: var(--sel-btn);
+            text-align: center;
+            display: flex;
+            justify-content: center;
+            cursor: pointer;
+        }
+
+        .checkLoad:hover {
+            background: var(--lighter-svg-color);
+        }
+        /* busy state: hover colour forced, svg forced visible */
+        .voiceChecking {
+            background-color: var(--lighter-svg-color) !important;
+        }
+        /* !important is required: the later .checkLoad>svg rule has equal specificity and would otherwise hide the svg */
+        .voiceChecking>svg {
+            display: inline !important;
+        }
+        /* the svg inside the button is hidden unless .voiceChecking applies */
+        .checkLoad>svg {
+            display: none;
+            margin-right: 8px;
+            height: 32px;
+            width: 64px;
+        }
+
+        #preSetSystem {
+            height: 20px;
+            line-height: 20px;
+            vertical-align: top;
+        }
+
+        #sysMask { /* full-viewport dimmed backdrop, hidden by default; the centring properties only apply once display becomes flex or grid */
+            position: fixed;
+            top: 0;
+            right: 0;
+            bottom: 0;
+            left: 0;
+            z-index: 200;
+            display: none;
+            justify-content: center;
+            align-items: center;
+            background: rgba(0, 0, 0, 0.4);
+            cursor: pointer;
+            -webkit-tap-highlight-color: transparent;
+        }
+        /* dialog box: vertical flex column, 88% of the backdrop width */
+        #sysDialog {
+            position: relative;
+            display: flex;
+            flex-direction: column;
+            width: 88%;
+            max-height: 100%;
+            padding: 12px 20px;
+            border-radius: 4px;
+            color: var(--btn-color);
+            background: var(--chat-back);
+            cursor: auto;
+        }
+        /* bold 20px heading that cannot be selected */
+        .sysTitle {
+            margin-bottom: 8px;
+            font-size: 20px;
+            font-weight: bold;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+        /* the container and everything inside its entries ignore pointer events; the entries re-enable them below */
+        .sysSwitch,
+        .sysSwitch>div * {
+            pointer-events: none;
+        }
+        /* one entry: bold 32px-high flex row */
+        .sysSwitch>div {
+            display: flex;
+            align-items: center;
+            height: 32px;
+            line-height: 32px;
+            border-radius: 3px;
+            font-weight: bold;
+            text-align: center;
+            cursor: pointer;
+            pointer-events: auto;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+        /* gap after the entry icon */
+        .sysSwitch>div>svg {
+            margin-right: 4px;
+        }
+
+        .sysSwitch>div:hover {
+            background-color: var(--active-btn);
+        }
+        /* detail area takes the remaining space of the flex column */
+        .sysDetail {
+            flex: 1;
+        }
+        /* close control pinned to the dialog's top-right corner */
+        #closeSet {
+            position: absolute;
+            top: 0;
+            right: 0;
+            padding: 10px 14px;
+            cursor: pointer;
+        }
+
+        #closeSet:hover {
+            color: var(--black-color);
+        }
+
+        .setContent { /* one settings section */
+            margin-bottom: 10px;
+        }
+        /* positioning context for absolutely positioned children */
+        .setNotNormalFlow {
+            position: relative;
+        }
+
+        .setTitle {
+            margin-bottom: 6px;
+            font-weight: bold;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+
+        .setDetail {
+            margin: 0 10px;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+
+        .autoSelect>label,
+        .autoSelect>input {
+            cursor: pointer;
+        }
+        /* row of action buttons */
+        .dataDetail {
+            display: flex;
+        }
+
+        .dataDetail svg {
+            margin-right: 4px;
+        }
+        /* div and label children are both styled as buttons */
+        .dataDetail>div,
+        .dataDetail>label {
+            border-radius: 3px;
+            text-align: center;
+            padding: 6px 8px;
+            margin-right: 12px;
+            color: var(--btn-color);
+            font-size: 15px;
+            cursor: pointer;
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+        }
+
+        .dataDetail>div:hover,
+        .dataDetail>label:hover {
+            background: var(--lighter-active-color);
+        }
+
+        .keyOptionDetail {
+            display: flex;
+            margin-top: 10px;
+        }
+        /* same button look as the .dataDetail children, offset 90px from the left */
+        #resetHotKey {
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            cursor: pointer;
+            font-size: 15px;
+            color: var(--btn-color);
+            border-radius: 3px;
+            padding: 6px 8px;
+            margin-left: 90px;
+        }
+
+        #resetHotKey>svg {
+            margin-right: 4px;
+        }
+
+        #resetHotKey:hover {
+            background: var(--lighter-active-color);
+        }
+        /* each hotkey row is a 30px positioned box: label in flow, select absolutely placed at 110px */
+        .hotKeyDetail>div {
+            position: relative;
+            height: 30px;
+            margin-bottom: 2px;
+        }
+
+        .hotKeyDetail label {
+            line-height: 30px;
+        }
+
+        .hotKeyDetail select {
+            position: absolute;
+            left: 110px;
+            outline: none;
+            border-radius: 3px;
+            width: 120px;
+            border-color: rgba(0, 0, 0, .3);
+            background: var(--chat-back);
+            color: var(--chat-text-color);
+            height: 30px;
+            font-size: 15px;
+        }
+
+        .avatarDetail { /* avatar preview next to its input */
+            display: flex;
+            margin-top: 2px;
+        }
+        /* 32x32 image that may not shrink */
+        .avatarDetail img {
+            border-radius: 2px;
+            width: 32px;
+            height: 32px;
+            flex-shrink: 0;
+            margin-right: 8px;
+            margin-top: 2px;
+        }
+        /* full-width bordered text input in chat colours */
+        .inputDetail input {
+            outline: none;
+            border-radius: 3px;
+            padding-left: 8px;
+            font-size: 15px;
+            width: 100%;
+            height: 34px;
+            border: 1px solid rgba(0, 0, 0, .3);
+            background: var(--chat-back);
+            color: var(--chat-text-color);
+        }
+        /* 180px row; the row ignores pointer events and only its direct child divs accept them */
+        .themeDetail {
+            display: flex;
+            width: 180px;
+            justify-content: space-between;
+            pointer-events: none;
+        }
+
+        .themeDetail svg {
+            display: block;
+        }
+        /* one round option button */
+        .themeDetail>div {
+            pointer-events: auto;
+            border-radius: 20px;
+            text-align: center;
+            padding: 8px 8px;
+            color: var(--btn-color);
+            font-size: 14px;
+            cursor: pointer;
+        }
+
+        .themeDetail>div * {
+            pointer-events: none;
+        }
+
+        .themeDetail>div:hover {
+            background: var(--lighter-active-color);
+        }
+        /* a state class on the container highlights one child by position: dark = 1st, light = 2nd, auto = 3rd */
+        .darkTheme>div:first-child {
+            background: var(--sel-btn);
+        }
+
+        .lightTheme>div:nth-child(2) {
+            background: var(--sel-btn);
+        }
+
+        .autoTheme>div:nth-child(3) {
+            background: var(--sel-btn);
+        }
+        /* narrower 110px width for an element carrying .langDetail */
+        .langDetail {
+            width: 110px;
+        }
+        /* same positional highlighting: en = 1st child, zh = 2nd child */
+        .enLang>div:first-child {
+            background: var(--sel-btn);
+        }
+
+        .zhLang>div:nth-child(2) {
+            background: var(--sel-btn);
+        }
+        /* 100px centred inputs inside #customAutoSet */
+        #customAutoSet input {
+            width: 100px;
+            height: 30px;
+            line-height: 30px;
+            font-size: 15px;
+            outline: none;
+            border: 1px solid rgba(0, 0, 0, .3);
+            text-align: center;
+            border-radius: 3px;
+            background: var(--chat-back);
+            color: var(--chat-text-color);
+        }
+
+        #customAutoSet label {
+            margin-right: 8px;
+        }
+
+        .progressBar { /* 12px rounded track; overflow: hidden clips the fill to the rounded shape */
+            position: relative;
+            width: 100%;
+            height: 12px;
+            border-radius: 6px;
+            background: var(--active-btn);
+            overflow: hidden;
+        }
+        /* green fill anchored at the left; no width is set here (presumably set from script - TODO confirm) */
+        .nowProgress {
+            position: absolute;
+            left: 0;
+            top: 0;
+            height: 12px;
+            min-width: 1px;
+            border-radius: 6px;
+            background: #99c959;
+        }
+
+        .progressDetail {
+            display: flex;
+            justify-content: space-between;
+            font-size: 15px;
+        }
+        /* 4px-wide block that blinks once per second; step-start makes the opacity jump instead of fading */
+        .cursorCls {
+            background: var(--chat-text-color);
+            width: 4px;
+            animation: 1s cursor-blinker infinite step-start;
+        }
+
+        @keyframes cursor-blinker {
+            0% {
+                opacity: 0;
+            }
+
+            50% {
+                opacity: 1;
+            }
+
+            100% {
+                opacity: 0;
+            }
+        }
+
+        .apiSelsContainer {
+            position: relative;
+            z-index: 1;
+        }
+
+        #apiSelect {
+            position: absolute;
+            top: 64px;
+            padding: 4px 0;
+            background: var(--chat-back);
+            width: calc(100% - 20px);
+            left: 10px;
+            box-shadow: 0 0 6px rgba(0, 0, 0, 0.15);
+            max-height: 180px;
+            overflow-y: auto;
+            -webkit-user-select: none;
+            user-select: none;
+        }
+
+        #apiSelect>div {
+            pointer-events: auto;
+            cursor: pointer;
+            font-size: 15px;
+            padding: 6px 0 6px 8px;
+            height: 36px;
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+        }
+
+        #apiSelect>div:hover {
+            background: var(--lighter-active-color);
+        }
+
+        #apiSelect>div>span {
+            height: 100%;
+            line-height: 23px;
+            overflow: hidden;
+            white-space: nowrap;
+            text-overflow: ellipsis;
+        }
+
+        #apiSelect>div * {
+            pointer-events: none;
+        }
+
+        .delApiOption:hover {
+            background: var(--lighter-svg-color);
+        }
+
+        .delApiOption {
+            width: 36px;
+            height: 36px;
+            flex-shrink: 0;
+            pointer-events: auto !important;
+        }
+
+        .delApiOption>svg { /* icon inset 6px on every side of the 36px button */
+            margin: 6px;
+            display: block;
+        }
+
+        @keyframes textInOut { /* despite the name this only fades in: opacity 0 -> 1 */
+            from {
+                opacity: 0;
+            }
+
+            to {
+                opacity: 1;
+            }
+        }
+
+        .response[data-loading="true"]>.markdown-body>*:not(.cursorCls) { /* while a response is marked loading, every direct child of its markdown body except the cursor fades in */
+            animation: textInOut 0.48s ease-in-out forwards; /* forwards: keep the final (opaque) state */
+        }
+    </style>
+    <style>
+        /* for katex */
+        .katex { /* force KaTeX output to the surrounding font size */
+            font-size: 1em !important;
+        }
+
+        eq { /* <eq> elements are laid out inline */
+            display: inline-block;
+        }
+
+        eqn { /* <eqn> elements are laid out as blocks */
+            display: block
+        }
+
+        section.eqno { /* one flex row holding an <eqn> and a <span>, vertically centered */
+            display: flex;
+            flex-direction: row;
+            align-content: space-between; /* NOTE(review): no effect on a non-wrapping flex container; justify-content may have been intended - confirm */
+            align-items: center;
+        }
+
+        section.eqno>eqn { /* 3em left margin matches the 3em-wide <span> sibling, presumably to keep the equation centered */
+            width: 100%;
+            margin-left: 3em;
+        }
+
+        section.eqno>span { /* 3em-wide, right-aligned column; presumably the equation number - confirm */
+            width: 3em;
+            text-align: right;
+        }
+    </style>
+    <script>
+        let themeMode; // 2: auto, 1: light, 0: dark
+        let autoThemeMode; // 1: follow the system setting, 0: custom time range
+        let customDarkTime; // [start, end] of the dark period, each an "HH:MM" string
+        let isFull = false; // whether the window is in full-screen mode
+        const darkMedia = window.matchMedia("(prefers-color-scheme: dark)"); // system dark-mode media query
+        const justDarkTheme = (is) => { // truthy: set data-theme="dark" on <html>; falsy: remove the attribute
+            if (is) document.documentElement.setAttribute("data-theme", "dark");
+            else document.documentElement.removeAttribute("data-theme");
+            document.head.children[4].content = document.head.children[5].content = document.head.children[6].content = getComputedStyle(document.documentElement).getPropertyValue("--background"); // copy the resolved --background value into the 5th-7th children of <head>; NOTE(review): presumably color <meta> tags, selected by position - confirm against <head>
+        }
+        const checkDark = () => { // read the theme preferences from localStorage (with defaults) and apply the resulting theme
+            const checkCustomTheme = () => { // dark when the current time falls inside the customDarkTime window
+                let date = new Date();
+                let nowTime = date.getTime();
+                let start = customDarkTime[0].split(":");
+                let startTime = new Date().setHours(start[0], start[1], 0, 0); // today at the start HH:MM, as epoch milliseconds
+                let end = customDarkTime[1].split(":");
+                let endTime = new Date().setHours(end[0], end[1], 0, 0); // today at the end HH:MM, as epoch milliseconds
+                let order = endTime > startTime; // true when the window does not cross midnight
+                let isDark = order ? (nowTime > startTime && endTime > nowTime) : !(nowTime > endTime && startTime > nowTime); // otherwise: dark unless now lies strictly between end and start
+                justDarkTheme(isDark);
+            }
+            const setDarkMode = () => { // resolve themeMode to dark/light, apply it, then persist themeMode
+                if (themeMode === 2) {
+                    if (autoThemeMode) {
+                        justDarkTheme(darkMedia.matches); // follow the system preference
+                    } else {
+                        checkCustomTheme();
+                    }
+                } else if (themeMode === 1) {
+                    justDarkTheme(false);
+                } else { // 0, and any other value (including NaN from an unparsable stored value), means dark
+                    justDarkTheme(true);
+                }
+                localStorage.setItem("themeMode", themeMode);
+            }
+            let localTheme = localStorage.getItem("themeMode");
+            themeMode = parseInt(localTheme || "1"); // default when nothing is stored: 1 (light)
+            let localAutoTheme = localStorage.getItem("autoThemeMode");
+            autoThemeMode = parseInt(localAutoTheme || "1"); // default when nothing is stored: 1 (follow system)
+            let localCustomDark = localStorage.getItem("customDarkTime");
+            customDarkTime = JSON.parse(localCustomDark || '["21:00", "07:00"]'); // default dark window: 21:00 -> 07:00
+            setDarkMode();
+        }
+        checkDark(); // run immediately from this inline <head> script
+    </script>
+</head>
+
+<body>
+    <svg style="transform: translate(-1000px, -1000px);">
+        <defs>
+            <radialGradient id="paint0_radial_16771_53212" cx="0" cy="0" r="1" gradientUnits="userSpaceOnUse"
+                gradientTransform="translate(2.77876 11.3795) rotate(18.6832) scale(29.8025 238.737)">
+                <stop offset="0.0671246" stop-color="#9168C0" />
+                <stop offset="0.342551" stop-color="#5684D1" />
+                <stop offset="0.672076" stop-color="#1BA1E3" />
+            </radialGradient>
+        </defs>
+    </svg>
+    <div style="display: none">
+        <svg>
+            <symbol viewBox="0 0 28 28" id="geminiIcon">
+                <path
+                    d="M14 28C14 26.0633 13.6267 24.2433 12.88 22.54C12.1567 20.8367 11.165 19.355 9.905 18.095C8.645 16.835 7.16333 15.8433 5.46 15.12C3.75667 14.3733 1.93667 14 0 14C1.93667 14 3.75667 13.6383 5.46 12.915C7.16333 12.1683 8.645 11.165 9.905 9.905C11.165 8.645 12.1567 7.16333 12.88 5.46C13.6267 3.75667 14 1.93667 14 0C14 1.93667 14.3617 3.75667 15.085 5.46C15.8317 7.16333 16.835 8.645 18.095 9.905C19.355 11.165 20.8367 12.1683 22.54 12.915C24.2433 13.6383 26.0633 14 28 14C26.0633 14 24.2433 14.3733 22.54 15.12C20.8367 15.8433 19.355 16.835 18.095 18.095C16.835 19.355 15.8317 20.8367 15.085 22.54C14.3617 24.2433 14 26.0633 14 28Z"
+                    fill="url(#paint0_radial_16771_53212)" />
+            </symbol>
+            <symbol viewBox="0 0 40 40" id="claudeIcon">
+                <path shape-rendering="optimizeQuality" fill="#D97757"
+                    d="m7.75 26.27 7.77-4.36.13-.38-.13-.21h-.38l-1.3-.08-4.44-.12-3.85-.16-3.73-.2-.94-.2L0 19.4l.09-.58.79-.53 1.13.1 2.5.17 3.75.26 2.72.16 4.03.42h.64l.09-.26-.22-.16-.17-.16-3.88-2.63-4.2-2.78-2.2-1.6L3.88 11l-.6-.76-.26-1.66L4.1 7.39l1.45.1.37.1 1.47 1.13 3.14 2.43 4.1 3.02.6.5.24-.17.03-.12-.27-.45L13 9.9l-2.38-4.1-1.06-1.7-.28-1.02c-.1-.42-.17-.77-.17-1.2L10.34.21l.68-.22 1.64.22.69.6 1.02 2.33 1.65 3.67 2.56 4.99.75 1.48.4 1.37.15.42h.26v-.24l.21-2.81.39-3.45.38-4.44.13-1.25.62-1.5L23.1.57l.96.46.79 1.13-.11.73-.47 3.05-.92 4.78-.6 3.2h.35l.4-.4 1.62-2.15 2.72-3.4 1.2-1.35 1.4-1.49.9-.71h1.7l1.25 1.86-.56 1.92-1.75 2.22-1.45 1.88-2.08 2.8-1.3 2.24.12.18.31-.03 4.7-1 2.54-.46 3.03-.52 1.37.64.15.65-.54 1.33-3.24.8-3.8.76-5.66 1.34-.07.05.08.1 2.55.24 1.09.06h2.67l4.97.37 1.3.86.78 1.05-.13.8-2 1.02-2.7-.64-6.3-1.5-2.16-.54h-.3v.18l1.8 1.76 3.3 2.98 4.13 3.84.21.95-.53.75-.56-.08-3.63-2.73-1.4-1.23-3.17-2.67h-.21v.28l.73 1.07 3.86 5.8.2 1.78-.28.58-1 .35-1.1-.2L26 33.14l-2.33-3.57-1.88-3.2-.23.13-1.11 11.95-.52.61-1.2.46-1-.76-.53-1.23.53-2.43.64-3.17.52-2.52.47-3.13.28-1.04-.02-.07-.23.03-2.36 3.24-3.59 4.85-2.84 3.04-.68.27-1.18-.61.11-1.09.66-.97 3.93-5 2.37-3.1 1.53-1.79-.01-.26h-.09L6.8 30.56l-1.86.24-.8-.75.1-1.23.38-.4 3.14-2.16Z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 16 16" id="expandIcon">
+                <path fill="currentColor"
+                    d="M12.78 5.22a.749.749 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.06 0L3.22 6.28a.749.749 0 1 1 1.06-1.06L8 8.939l3.72-3.719a.749.749 0 0 1 1.06 0Z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="refreshIcon">
+                <path fill="currentColor"
+                    d="M3.07 10.876C3.623 6.436 7.41 3 12 3a9.15 9.15 0 0 1 6.012 2.254V4a1 1 0 1 1 2 0v4a1 1 0 0 1-1 1H15a1 1 0 1 1 0-2h1.957A7.15 7.15 0 0 0 12 5a7 7 0 0 0-6.946 6.124 1 1 0 1 1-1.984-.248m16.992 1.132a1 1 0 0 1 .868 1.116C20.377 17.564 16.59 21 12 21a9.15 9.15 0 0 1-6-2.244V20a1 1 0 1 1-2 0v-4a1 1 0 0 1 1-1h4a1 1 0 1 1 0 2H7.043A7.15 7.15 0 0 0 12 19a7 7 0 0 0 6.946-6.124 1 1 0 0 1 1.116-.868">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="halfRefIcon">
+                <path fill="currentColor"
+                    d="M3.07 10.876C3.623 6.436 7.41 3 12 3a9.15 9.15 0 0 1 6.012 2.254V4a1 1 0 1 1 2 0v4a1 1 0 0 1-1 1H15a1 1 0 1 1 0-2h1.957A7.15 7.15 0 0 0 12 5a7 7 0 0 0-6.946 6.124 1 1 0 1 1-1.984-.248" />
+            </symbol>
+            <symbol viewBox="-2 -2 20 20" id="copyIcon">
+                <path fill="currentColor"
+                    d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z">
+                </path>
+                <path fill="currentColor"
+                    d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="delIcon">
+                <path fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
+                    d="M9 7v0a3 3 0 0 1 3-3v0a3 3 0 0 1 3 3v0M9 7h6M9 7H6m9 0h3m2 0h-2M4 7h2m0 0v11a2 2 0 0 0 2 2h8a2 2 0 0 0 2-2V7">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="readyVoiceIcon">
+                <path fill="currentColor"
+                    d="M3 9v6h4l5 5V4L7 9H3zm13.5 3c0-1.77-1.02-3.29-2.5-4.03v8.05c1.48-.73 2.5-2.25 2.5-4.02zM14 3.23v2.06c2.89.86 5 3.54 5 6.71s-2.11 5.85-5 6.71v2.06c4.01-.91 7-4.49 7-8.77s-2.99-7.86-7-8.77z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 20 20" id="pauseVoiceIcon">
+                <path stroke="currentColor" stroke-width="2.4" d="M6 3v14M14 3v14"></path>
+            </symbol>
+            <symbol viewBox="0 0 16 16" id="resumeVoiceIcon">
+                <path fill="currentColor" d="M4 3L4 13L12 8Z"></path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="stopResIcon">
+                <path fill="currentColor"
+                    d="M12 22C6.477 22 2 17.523 2 12S6.477 2 12 2s10 4.477 10 10s-4.477 10-10 10zm0-2a8 8 0 1 0 0-16a8 8 0 0 0 0 16zM9 9h6v6H9V9z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 128 128" id="downAudioIcon">
+                <path
+                    d="M 64.662 1.549 C 56.549 4.524, 46.998 14.179, 45.523 20.895 C 45.041 23.089, 44.073 23.833, 40.433 24.807 C 34.752 26.326, 27.956 32.929, 25.527 39.289 C 24.273 42.574, 23.884 45.715, 24.196 50.034 C 24.620 55.897, 24.528 56.193, 21.836 57.585 C 17.142 60.012, 16 63.617, 16 76 C 16 88.463, 17.137 91.985, 21.967 94.483 C 28.244 97.729, 36.120 95.350, 38.579 89.466 C 39.387 87.532, 40 82.764, 40 78.415 C 40 70.971, 40.060 70.783, 42.250 71.370 C 43.487 71.701, 48.888 71.979, 54.250 71.986 L 64 72 64 76 L 64 80 57.122 80 C 49.420 80, 48.614 80.543, 47.547 86.453 C 46.552 91.964, 43.550 97.473, 40.273 99.803 C 33 104.974, 23.120 105.042, 16.118 99.971 C 11.407 96.558, 9.048 92.484, 8.145 86.205 C 6.963 77.979, 0.794 77.729, 0.191 85.883 C -0.196 91.111, 3.323 99.170, 8.062 103.908 C 11.290 107.136, 20.073 111.969, 22.750 111.990 C 23.540 111.996, 24 113.472, 24 116 C 24 119.740, 23.813 120, 21.122 120 C 17.674 120, 15.727 122.044, 16.173 125.195 C 16.492 127.441, 16.781 127.500, 27.391 127.500 C 36.676 127.500, 38.445 127.242, 39.386 125.750 C 40.993 123.203, 38.986 120.568, 35.149 120.187 C 32.206 119.894, 32 119.617, 32 115.956 C 32 112.509, 32.330 111.959, 34.750 111.377 C 42.181 109.591, 52.157 101.208, 53.575 95.559 C 53.928 94.152, 54.514 93, 54.878 93 C 55.242 93, 59.797 97.275, 65 102.500 C 70.762 108.286, 75.256 112, 76.495 112 C 77.769 112, 83.287 107.231, 91.264 99.236 C 101.113 89.366, 104 85.876, 104 83.843 C 104 80.580, 102.553 80, 94.418 80 L 88 80 88 76.105 L 88 72.211 99.750 71.815 C 113.117 71.364, 117.595 69.741, 122.762 63.473 C 128.159 56.925, 129.673 45.269, 126.134 37.500 C 123.787 32.346, 117.218 26.445, 112.132 24.921 C 108.617 23.868, 107.767 22.968, 105.028 17.405 C 99.364 5.901, 89.280 -0.062, 75.712 0.070 C 71.746 0.109, 66.773 0.774, 64.662 1.549 M 67.885 9.380 C 60.093 12.164, 55.057 17.704, 52.527 26.276 C 51.174 30.856, 50.220 31.617, 44.729 32.496 C 37.017 33.729, 30.917 42.446, 32.374 50.154 C 34.239 60.026, 
40.582 63.944, 54.750 63.978 L 64 64 64 57.122 C 64 52.457, 64.449 49.872, 65.396 49.086 C 66.310 48.328, 70.370 48.027, 77.146 48.214 L 87.500 48.500 87.794 56.359 L 88.088 64.218 98.989 63.845 C 108.043 63.535, 110.356 63.125, 112.634 61.424 C 119.736 56.122, 121.911 47.667, 118.097 40.190 C 115.870 35.824, 110.154 32.014, 105.790 31.985 C 102.250 31.961, 101.126 30.787, 99.532 25.443 C 95.580 12.197, 80.880 4.736, 67.885 9.380 M 72 70.800 C 72 80.978, 71.625 85.975, 70.800 86.800 C 70.140 87.460, 67.781 88, 65.559 88 L 61.517 88 68.759 95.241 L 76 102.483 83.241 95.241 L 90.483 88 86.441 88 C 84.219 88, 81.860 87.460, 81.200 86.800 C 80.375 85.975, 80 80.978, 80 70.800 L 80 56 76 56 L 72 56 72 70.800 M 25.200 65.200 C 23.566 66.834, 23.566 85.166, 25.200 86.800 C 27.002 88.602, 29.798 88.246, 30.965 86.066 C 31.534 85.002, 32 80.472, 32 76 C 32 71.528, 31.534 66.998, 30.965 65.934 C 29.798 63.754, 27.002 63.398, 25.200 65.200"
+                    stroke="none" fill="currentColor" fill-rule="evenodd" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="chatIcon">
+                <path fill="currentColor"
+                    d="m18 21l-1.4-1.4l1.575-1.6H14v-2h4.175L16.6 14.4L18 13l4 4l-4 4ZM3 21V6q0-.825.588-1.413T5 4h12q.825 0 1.413.588T19 6v5.075q-.25-.05-.5-.063T18 11q-.25 0-.5.013t-.5.062V6H5v10h7.075q-.05.25-.063.5T12 17q0 .25.013.5t.062.5H6l-3 3Zm4-11h8V8H7v2Zm0 4h5v-2H7v2Zm-2 2V6v10Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="chatEditIcon">
+                <path fill="currentColor"
+                    d="M5 19h1.4l8.625-8.625l-1.4-1.4L5 17.6V19ZM19.3 8.925l-4.25-4.2l1.4-1.4q.575-.575 1.413-.575t1.412.575l1.4 1.4q.575.575.6 1.388t-.55 1.387L19.3 8.925ZM17.85 10.4L7.25 21H3v-4.25l10.6-10.6l4.25 4.25Zm-3.525-.725l-.7-.7l1.4 1.4l-.7-.7Z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="deleteIcon">
+                <path fill="currentColor"
+                    d="M8 20v-5h2v5h9v-7H5v7h3zm-4-9h16V8h-6V4h-4v4H4v3zM3 21v-8H2V7a1 1 0 0 1 1-1h5V3a1 1 0 0 1 1-1h6a1 1 0 0 1 1 1v3h5a1 1 0 0 1 1 1v6h-1v8a1 1 0 0 1-1 1H4a1 1 0 0 1-1-1z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="addIcon" stroke="currentColor" fill="none" stroke-width="2"
+                stroke-linecap="round" stroke-linejoin="round">
+                <line x1="12" y1="5" x2="12" y2="19"></line>
+                <line x1="5" y1="12" x2="19" y2="12"></line>
+            </symbol>
+            <symbol viewBox="0 0 200 100" preserveAspectRatio="xMidYMid" id="loadingIcon">
+                <g transform="translate(50 50)">
+                    <circle cx="0" cy="0" r="15" fill="#e15b64">
+                        <animateTransform attributeName="transform" type="scale" begin="-0.4s" calcMode="spline"
+                            keySplines="0.3 0 0.7 1;0.3 0 0.7 1" values="0;1;0" keyTimes="0;0.5;1" dur="1s"
+                            repeatCount="indefinite"></animateTransform>
+                    </circle>
+                </g>
+                <g transform="translate(100 50)">
+                    <circle cx="0" cy="0" r="15" fill="#f8b26a">
+                        <animateTransform attributeName="transform" type="scale" begin="-0.2s" calcMode="spline"
+                            keySplines="0.3 0 0.7 1;0.3 0 0.7 1" values="0;1;0" keyTimes="0;0.5;1" dur="1s"
+                            repeatCount="indefinite"></animateTransform>
+                    </circle>
+                </g>
+                <g transform="translate(150 50)">
+                    <circle cx="0" cy="0" r="15" fill="#99c959">
+                        <animateTransform attributeName="transform" type="scale" begin="0s" calcMode="spline"
+                            keySplines="0.3 0 0.7 1;0.3 0 0.7 1" values="0;1;0" keyTimes="0;0.5;1" dur="1s"
+                            repeatCount="indefinite"></animateTransform>
+                    </circle>
+                </g>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="exportIcon">
+                <path fill="currentColor"
+                    d="m17.86 18l1.04 1c-1.4 1.2-3.96 2-6.9 2c-4.41 0-8-1.79-8-4V7c0-2.21 3.58-4 8-4c2.95 0 5.5.8 6.9 2l-1.04 1l-.36.4C16.65 5.77 14.78 5 12 5C8.13 5 6 6.5 6 7s2.13 2 6 2c1.37 0 2.5-.19 3.42-.46l.96.96H13.5v1.42c-.5.05-1 .08-1.5.08c-2.39 0-4.53-.53-6-1.36v2.81C7.3 13.4 9.58 14 12 14c.5 0 1-.03 1.5-.08v.58h2.88l-1 1l.12.11c-1.09.25-2.26.39-3.5.39c-2.28 0-4.39-.45-6-1.23V17c0 .5 2.13 2 6 2c2.78 0 4.65-.77 5.5-1.39l.36.39m1.06-10.92L17.5 8.5L20 11h-5v2h5l-2.5 2.5l1.42 1.42L23.84 12l-4.92-4.92Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="importIcon">
+                <path fill="currentColor"
+                    d="m8.84 12l-4.92 4.92L2.5 15.5L5 13H0v-2h5L2.5 8.5l1.42-1.42L8.84 12M12 3C8.59 3 5.68 4.07 4.53 5.57L5 6l1.03 1.07C6 7.05 6 7 6 7c0-.5 2.13-2 6-2s6 1.5 6 2s-2.13 2-6 2c-2.62 0-4.42-.69-5.32-1.28l3.12 3.12c.7.1 1.44.16 2.2.16c2.39 0 4.53-.53 6-1.36v2.81c-1.3.95-3.58 1.55-6 1.55c-.96 0-1.9-.1-2.76-.27l-1.65 1.64c1.32.4 2.82.63 4.41.63c2.28 0 4.39-.45 6-1.23V17c0 .5-2.13 2-6 2s-6-1.5-6-2v-.04L5 18l-.46.43C5.69 19.93 8.6 21 12 21c4.41 0 8-1.79 8-4V7c0-2.21-3.58-4-8-4Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="clearAllIcon">
+                <path fill="currentColor"
+                    d="M12 22C6.477 22 2 17.523 2 12S6.477 2 12 2s10 4.477 10 10s-4.477 10-10 10zm0-2a8 8 0 1 0 0-16a8 8 0 0 0 0 16zm0-9.414l2.828-2.829l1.415 1.415L13.414 12l2.829 2.828l-1.415 1.415L12 13.414l-2.828 2.829l-1.415-1.415L10.586 12L7.757 9.172l1.415-1.415L12 10.586z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="collapseFullIcon">
+                <path fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
+                    stroke-width="1.5"
+                    d="m20 20l-5-5m0 0v4m0-4h4M4 20l5-5m0 0v4m0-4H5M20 4l-5 5m0 0V5m0 4h4M4 4l5 5m0 0V5m0 4H5" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="expandFullIcon">
+                <path fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
+                    stroke-width="1.5"
+                    d="M9 9L4 4m0 0v4m0-4h4m7 5l5-5m0 0v4m0-4h-4M9 15l-5 5m0 0v-4m0 4h4m7-5l5 5m0 0v-4m0 4h-4" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="darkThemeIcon">
+                <path fill="currentColor"
+                    d="M20.742 13.045a8.088 8.088 0 0 1-2.077.271c-2.135 0-4.14-.83-5.646-2.336a8.025 8.025 0 0 1-2.064-7.723A1 1 0 0 0 9.73 2.034a10.014 10.014 0 0 0-4.489 2.582c-3.898 3.898-3.898 10.243 0 14.143a9.937 9.937 0 0 0 7.072 2.93 9.93 9.93 0 0 0 7.07-2.929 10.007 10.007 0 0 0 2.583-4.491 1.001 1.001 0 0 0-1.224-1.224zm-2.772 4.301a7.947 7.947 0 0 1-5.656 2.343 7.953 7.953 0 0 1-5.658-2.344c-3.118-3.119-3.118-8.195 0-11.314a7.923 7.923 0 0 1 2.06-1.483 10.027 10.027 0 0 0 2.89 7.848 9.972 9.972 0 0 0 7.848 2.891 8.036 8.036 0 0 1-1.484 2.059z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="lightThemeIcon">
+                <path fill="currentColor"
+                    d="M6.993 12c0 2.761 2.246 5.007 5.007 5.007s5.007-2.246 5.007-5.007S14.761 6.993 12 6.993 6.993 9.239 6.993 12zM12 8.993c1.658 0 3.007 1.349 3.007 3.007S13.658 15.007 12 15.007 8.993 13.658 8.993 12 10.342 8.993 12 8.993zM10.998 19h2v3h-2zm0-17h2v3h-2zm-9 9h3v2h-3zm17 0h3v2h-3zM4.219 18.363l2.12-2.122 1.415 1.414-2.12 2.122zM16.24 6.344l2.122-2.122 1.414 1.414-2.122 2.122zM6.342 7.759 4.22 5.637l1.415-1.414 2.12 2.122zm13.434 10.605-1.414 1.414-2.122-2.122 1.414-1.414z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="autoThemeIcon">
+                <g fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2">
+                    <path d="M9.173 14.83a4 4 0 1 1 5.657-5.657" />
+                    <path
+                        d="m11.294 12.707l.174.247a7.5 7.5 0 0 0 8.845 2.492A9 9 0 0 1 5.642 18.36M3 12h1m8-9v1M5.6 5.6l.7.7M3 21L21 3" />
+                </g>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="newFolderIcon">
+                <path fill="currentColor"
+                    d="M14 16h2v-2h2v-2h-2v-2h-2v2h-2v2h2v2ZM2 20V4h8l2 2h10v14H2Zm2-2h16V8h-8.825l-2-2H4v12Zm0 0V6v12Z" />
+            </symbol>
+            <symbol viewBox="0 0 20 20" id="expandFolderIcon">
+                <path fill="currentColor"
+                    d="M7.293 14.707a1 1 0 010-1.414L10.586 10 7.293 6.707a1 1 0 011.414-1.414l4 4a1 1 0 010 1.414l-4 4a1 1 0 01-1.414 0z">
+                </path>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="closeIcon">
+                <path fill="currentColor"
+                    d="M6.4 19L5 17.6l5.6-5.6L5 6.4L6.4 5l5.6 5.6L17.6 5L19 6.4L13.4 12l5.6 5.6l-1.4 1.4l-5.6-5.6L6.4 19Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="settingIcon">
+                <path fill="currentColor"
+                    d="M13.85 22.25h-3.7c-.74 0-1.36-.54-1.45-1.27l-.27-1.89c-.27-.14-.53-.29-.79-.46l-1.8.72c-.7.26-1.47-.03-1.81-.65L2.2 15.53c-.35-.66-.2-1.44.36-1.88l1.53-1.19c-.01-.15-.02-.3-.02-.46 0-.15.01-.31.02-.46l-1.52-1.19c-.59-.45-.74-1.26-.37-1.88l1.85-3.19c.34-.62 1.11-.9 1.79-.63l1.81.73c.26-.17.52-.32.78-.46l.27-1.91c.09-.7.71-1.25 1.44-1.25h3.7c.74 0 1.36.54 1.45 1.27l.27 1.89c.27.14.53.29.79.46l1.8-.72c.71-.26 1.48.03 1.82.65l1.84 3.18c.36.66.2 1.44-.36 1.88l-1.52 1.19c.01.15.02.3.02.46s-.01.31-.02.46l1.52 1.19c.56.45.72 1.23.37 1.86l-1.86 3.22c-.34.62-1.11.9-1.8.63l-1.8-.72c-.26.17-.52.32-.78.46l-.27 1.91c-.1.68-.72 1.22-1.46 1.22zm-3.23-2h2.76l.37-2.55.53-.22c.44-.18.88-.44 1.34-.78l.45-.34 2.38.96 1.38-2.4-2.03-1.58.07-.56c.03-.26.06-.51.06-.78s-.03-.53-.06-.78l-.07-.56 2.03-1.58-1.39-2.4-2.39.96-.45-.35c-.42-.32-.87-.58-1.33-.77l-.52-.22-.37-2.55h-2.76l-.37 2.55-.53.21c-.44.19-.88.44-1.34.79l-.45.33-2.38-.95-1.39 2.39 2.03 1.58-.07.56a7 7 0 0 0-.06.79c0 .26.02.53.06.78l.07.56-2.03 1.58 1.38 2.4 2.39-.96.45.35c.43.33.86.58 1.33.77l.53.22.38 2.55z">
+                </path>
+                <circle fill="currentColor" cx="12" cy="12" r="3.5"></circle>
+            </symbol>
+            <symbol viewBox="299 299 1808 1808" id="aiIcon">
+                <path fill="currentColor"
+                    d="M1107.3 299.1c-198 0-373.9 127.3-435.2 315.3C544.8 640.6 434.9 720.2 370.5 833c-99.3 171.4-76.6 386.9 56.4 533.8-41.1 123.1-27 257.7 38.6 369.2 98.7 172 297.3 260.2 491.6 219.2 86.1 97 209.8 152.3 339.6 151.8 198 0 373.9-127.3 435.3-315.3 127.5-26.3 237.2-105.9 301-218.5 99.9-171.4 77.2-386.9-55.8-533.9v-.6c41.1-123.1 27-257.8-38.6-369.8-98.7-171.4-297.3-259.6-491-218.6-86.6-96.8-210.5-151.8-340.3-151.2zm0 117.5-.6.6c79.7 0 156.3 27.5 217.6 78.4-2.5 1.2-7.4 4.3-11 6.1L952.8 709.3c-18.4 10.4-29.4 30-29.4 51.4V1248l-155.1-89.4V755.8c-.1-187.1 151.6-338.9 339-339.2zm434.2 141.9c121.6-.2 234 64.5 294.7 169.8 39.2 68.6 53.9 148.8 40.4 226.5-2.5-1.8-7.3-4.3-10.4-6.1l-360.4-208.2c-18.4-10.4-41-10.4-59.4 0L1024 984.2V805.4L1372.7 604c51.3-29.7 109.5-45.4 168.8-45.5zM650 743.5v427.9c0 21.4 11 40.4 29.4 51.4l421.7 243-155.7 90L597.2 1355c-162-93.8-217.4-300.9-123.8-462.8C513.1 823.6 575.5 771 650 743.5zm807.9 106 348.8 200.8c162.5 93.7 217.6 300.6 123.8 462.8l.6.6c-39.8 68.6-102.4 121.2-176.5 148.2v-428c0-21.4-11-41-29.4-51.4l-422.3-243.7 155-89.3zM1201.7 997l177.8 102.8v205.1l-177.8 102.8-177.8-102.8v-205.1L1201.7 997zm279.5 161.6 155.1 89.4v402.2c0 187.3-152 339.2-339 339.2v-.6c-79.1 0-156.3-27.6-217-78.4 2.5-1.2 8-4.3 11-6.1l360.4-207.5c18.4-10.4 30-30 29.4-51.4l.1-486.8zM1380 1421.9v178.8l-348.8 200.8c-162.5 93.1-369.6 38-463.4-123.7h.6c-39.8-68-54-148.8-40.5-226.5 2.5 1.8 7.4 4.3 10.4 6.1l360.4 208.2c18.4 10.4 41 10.4 59.4 0l421.9-243.7z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="importSetIcon">
+                <path fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
+                    d="m12 21l-8-4.5v-9L12 3l8 4.5V12m-8 0l8-4.5M12 12v9m0-9L4 7.5M22 18h-7m3-3l-3 3l3 3" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="exportSetIcon">
+                <path fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
+                    d="m12 21l-8-4.5v-9L12 3l8 4.5V12m-8 0l8-4.5M12 12v9m0-9L4 7.5M15 18h7m-3-3l3 3l-3 3" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="databaseIcon">
+                <path fill="currentColor"
+                    d="M12 3C7.58 3 4 4.79 4 7v10c0 2.21 3.59 4 8 4s8-1.79 8-4V7c0-2.21-3.58-4-8-4m6 14c0 .5-2.13 2-6 2s-6-1.5-6-2v-2.23c1.61.78 3.72 1.23 6 1.23s4.39-.45 6-1.23V17m0-4.55c-1.3.95-3.58 1.55-6 1.55s-4.7-.6-6-1.55V9.64c1.47.83 3.61 1.36 6 1.36s4.53-.53 6-1.36v2.81M12 9C8.13 9 6 7.5 6 7s2.13-2 6-2s6 1.5 6 2s-2.13 2-6 2Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="stopIcon">
+                <path fill="currentColor" d="M6 5h12a1 1 0 0 1 1 1v12a1 1 0 0 1-1 1H6a1 1 0 0 1-1-1V6a1 1 0 0 1 1-1Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="forceRefreshIcon">
+                <path fill="currentColor"
+                    d="M13.82 14H9.66c-.1-.66-.16-1.32-.16-2s.06-1.35.16-2h4.68c.09.65.16 1.32.16 2c0 .5-.04 1-.1 1.46c.6-.5 1.32-.89 2.1-1.14V12c0-.68-.06-1.34-.14-2h3.38c.16.64.26 1.31.26 2v.18c.7.17 1.35.45 1.95.82c.05-.32.05-.66.05-1c0-5.5-4.5-10-10-10C6.47 2 2 6.5 2 12s4.5 10 10 10c.34 0 .68 0 1-.05c-.41-.66-.71-1.4-.87-2.2c-.04.07-.08.14-.13.21c-.83-1.2-1.5-2.53-1.91-3.96h2.41c.31-.75.76-1.42 1.32-2m5.1-6h-2.95a15.65 15.65 0 0 0-1.38-3.56c1.84.63 3.37 1.9 4.33 3.56M12 4.03c.83 1.2 1.5 2.54 1.91 3.97h-3.82c.41-1.43 1.08-2.77 1.91-3.97M4.26 14C4.1 13.36 4 12.69 4 12s.1-1.36.26-2h3.38c-.08.66-.14 1.32-.14 2s.06 1.34.14 2H4.26m.82 2H8c.35 1.25.8 2.45 1.4 3.56A8.008 8.008 0 0 1 5.08 16M8 8H5.08A7.923 7.923 0 0 1 9.4 4.44C8.8 5.55 8.35 6.75 8 8m12.83 7.67L22 14.5v4h-4l1.77-1.77A2.5 2.5 0 1 0 20 20h1.71A3.991 3.991 0 0 1 18 22.5c-2.21 0-4-1.79-4-4s1.79-4 4-4c1.11 0 2.11.45 2.83 1.17Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="hotkeyIcon">
+                <g fill="none">
+                    <path
+                        d="M24 0v24H0V0h24ZM12.593 23.258l-.011.002l-.071.035l-.02.004l-.014-.004l-.071-.035c-.01-.004-.019-.001-.024.005l-.004.01l-.017.428l.005.02l.01.013l.104.074l.015.004l.012-.004l.104-.074l.012-.016l.004-.017l-.017-.427c-.002-.01-.009-.017-.017-.018Zm.265-.113l-.013.002l-.185.093l-.01.01l-.003.011l.018.43l.005.012l.008.007l.201.093c.012.004.023 0 .029-.008l.004-.014l-.034-.614c-.003-.012-.01-.02-.02-.022Zm-.715.002a.023.023 0 0 0-.027.006l-.006.014l-.034.614c0 .012.007.02.017.024l.015-.002l.201-.093l.01-.008l.004-.011l.017-.43l-.003-.012l-.01-.01l-.184-.092Z" />
+                    <path fill="currentColor"
+                        d="M18 3a3 3 0 0 1 2.995 2.824L21 6v12a3 3 0 0 1-2.824 2.995L18 21H6a3 3 0 0 1-2.995-2.824L3 18V6a3 3 0 0 1 2.824-2.995L6 3h12Zm-2.707 13.708A2.99 2.99 0 0 1 14 17H5v1a1 1 0 0 0 1 1h11.586l-2.293-2.292ZM18 5h-1v9c0 .386-.073.755-.206 1.094l-.086.2L19 17.585V6a1 1 0 0 0-.883-.993L18 5Zm-3 0H6a1 1 0 0 0-.993.883L5 6v9h9a1 1 0 0 0 .993-.883L15 14V5ZM9 7a1 1 0 0 1 .993.883L10 8v.631l1.445-.963a1 1 0 0 1 1.203 1.594l-.093.07l-1.377.918l1.377.918a1 1 0 0 1-1.009 1.723l-.1-.059L10 11.868V12a1 1 0 0 1-1.993.117L8 12V8a1 1 0 0 1 1-1Z" />
+                </g>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="zhIcon">
+                <g fill="none" stroke="currentColor" stroke-linecap="round" stroke-width="2">
+                    <path stroke-linejoin="round" d="M5 8h14v7H5z" />
+                    <path d="M12 4v17" />
+                </g>
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="enIcon">
+                <path fill="currentColor"
+                    d="M14 10h2v.757a4.5 4.5 0 0 1 7 3.743V20h-2v-5.5c0-1.43-1.174-2.5-2.5-2.5S16 13.07 16 14.5V20h-2V10Zm-2-6v2H4v5h8v2H4v5h8v2H2V4h10Z" />
+            </symbol>
+            <symbol viewBox="0 0 24 24" id="caseIcon">
+                <path fill="currentColor"
+                    d="m3.975 17l3.75-10h1.8l3.75 10H11.55l-.9-2.55H6.6L5.7 17H3.975Zm3.15-4h3l-1.45-4.15h-.1L7.125 13Zm9.225 4.275q-1.225 0-1.925-.638t-.7-1.737q0-1.05.813-1.713t2.087-.662q.575 0 1.063.088t.837.287v-.35q0-.675-.462-1.075t-1.263-.4q-.525 0-.988.225t-.787.65l-1.075-.8q.475-.675 1.2-1.025t1.675-.35q1.55 0 2.375.738t.825 2.137v4.4H18.55v-.85h-.075q-.325.5-.875.788t-1.25.287Zm.25-1.25q.8 0 1.363-.563t.562-1.362q-.35-.2-.8-.3t-.825-.1q-.8 0-1.225.313t-.425.887q0 .5.375.813t.975.312Z" />
+            </symbol>
+        </svg>
+    </div>
+    <div id="loadMask">
+        <div>
+            <div>ChatGPT</div>
+            <div class="loadingCSSIcon">
+                <div></div>
+                <div></div>
+                <div></div>
+            </div>
+        </div>
+    </div>
+    <div class="chat_window">
+        <div class="overlay"></div>
+        <nav class="nav">
+            <div class="navHeader">
+                <div id="newChat">
+                    <svg width="24" height="24">
+                        <use xlink:href="#addIcon" />
+                    </svg>
+                    <span data-i18n-key="newChat"></span>
+                </div>
+                <div id="newFolder" data-i18n-title="newFolder" title>
+                    <svg width="24" height="24">
+                        <use xlink:href="#newFolderIcon" />
+                    </svg>
+                </div>
+            </div>
+            <div class="extraChat">
+                <input type="text" id="searchChat" autocomplete="off" readonly
+                    onfocus="this.removeAttribute('readonly')" data-i18n-place="search" placeholder />
+                <div id="clearSearch">
+                    <svg width="24" height="24">
+                        <use xlink:href="#closeIcon" />
+                    </svg>
+                </div>
+                <div id="matchCaseSearch" data-i18n-title="matchCaseTip" title>
+                    <svg width="24" height="24">
+                        <use xlink:href="#caseIcon" />
+                    </svg>
+                </div>
+            </div>
+            <div class="allList">
+                <div id="folderList"></div>
+                <div id="chatList"></div>
+            </div>
+            <div class="navFooter">
+                <div class="navFunc">
+                    <div id="refreshPage" data-i18n-title="forceRe" title>
+                        <svg width="24" height="24">
+                            <use xlink:href="#forceRefreshIcon" />
+                        </svg>
+                    </div>
+                    <div id="clearChat" data-i18n-title="clearAll" title>
+                        <svg width="24" height="24">
+                            <use xlink:href="#clearAllIcon" />
+                        </svg>
+                    </div>
+                    <div id="toggleLight" data-i18n-theme title>
+                        <svg width="24" height="24">
+                            <use xlink:href="#lightThemeIcon" />
+                        </svg>
+                    </div>
+                    <div id="sysSetting" data-i18n-title="setting" title>
+                        <svg width="24" height="24">
+                            <use xlink:href="#settingIcon" />
+                        </svg>
+                    </div>
+                </div>
+                <div class="divider"></div>
+                <div class="links">
+                    <a href="https://github.com/Aliebc/chatgpt-web" target="_blank"
+                        rel="noopener noreferrer">Github</a>
+                </div>
+            </div>
+        </nav>
+        <div class="mainContent">
+            <div class="top_menu">
+                <div class="toggler" data-i18n-title="nav" title>
+                    <div class="button close"></div>
+                    <div class="button minimize"></div>
+                    <div class="button maximize"></div>
+                </div>
+                <div class="title">
+                    <div id="selector"><span id="modelName">llama.cpp</span><span id="modelVer"></span></div>
+                </div>
+                <div class="settings">
+                    <button class="setBtn" id="toggleFull" data-i18n-window title>
+                        <svg width="30" height="30">
+                            <use xlink:href="#expandFullIcon" />
+                        </svg>
+                    </button>
+                    <button class="setBtn" id="setting">
+                        <svg viewBox="0 0 100 100" width="30" height="30">
+                            <title data-i18n-key="quickSet"></title>
+                            <circle cx="50" cy="20" r="10" fill="#e15b64" />
+                            <circle cx="50" cy="50" r="10" fill="#f8b26a" />
+                            <circle cx="50" cy="80" r="10" fill="#99c959" />
+                        </svg>
+                    </button>
+                </div>
+                <div id="modelDialog" style="display:none;">
+                    <div class="modelSingle" data-value="gpt-3.5-turbo" data-ver="3.5">
+                        <div class="modelAvatar gptAvatar">
+                            <svg width="24" height="24">
+                                <use xlink:href="#aiIcon"></use>
+                            </svg>
+                        </div>
+                        <div>llama.cpp</div>
+                    </div>
+                </div>
+                <div id="setDialog" style="display:none;">
+                    <div class="setSwitch" style="display:none;">
+                        <div data-id="convOption" data-i18n-key="chat" class="activeSwitch"></div>
+                        <div data-id="speechOption" data-i18n-key="tts"></div>
+                        <div data-id="recOption" data-i18n-key="stt"></div>
+                    </div>
+                    <div id="convOption">
+                        <div>
+                            <div data-i18n-key="avatar"></div>
+                            <div class="avatarDetail">
+                                <img id="setAvatarPre" src="" />
+                                <input class="inputTextClass" autocomplete="off" type="text" id="setAvatar" />
+                            </div>
+                        </div>
+                        <div>
+                            <div class="justSetLine presetSelect">
+                                <div data-i18n-key="systemRole"></div>
+                                <div>
+                                    <label for="preSetSystem" data-i18n-key="presetRole"></label>
+                                    <select id="preSetSystem">
+                                        <option value="default" data-i18n-key="default"></option>
+                                        <option value="normal" data-i18n-key="assistant"></option>
+                                        <option value="cat" data-i18n-key="cat"></option>
+                                        <option value="emoji" data-i18n-key="emoji"></option>
+                                        <option value="image" data-i18n-key="withImg"></option>
+                                    </select>
+                                </div>
+                            </div>
+                            <textarea class="inputTextClass areaTextClass" autocomplete="off"
+                                data-i18n-place="assistantText" placeholder id="systemInput"></textarea>
+                        </div>
+                        <div>
+                            <span data-i18n-key="nature"></span>
+                            <input type="range" id="top_p" min="0" max="1" value="1" step="0.05" />
+                            <div class="selectDef">
+                                <span data-i18n-key="natureNeg"></span>
+                                <span data-i18n-key="naturePos"></span>
+                            </div>
+                        </div>
+                        <div>
+                            <span data-i18n-key="quality"></span>
+                            <input type="range" id="temp" min="0" max="2" value="1" step="0.05" />
+                            <div class="selectDef">
+                                <span data-i18n-key="qualityNeg"></span>
+                                <span data-i18n-key="qualityPos"></span>
+                            </div>
+                        </div>
+                        <div>
+                            <span data-i18n-key="chatsWidth"></span>
+                            <input type="range" id="convWidth" min="30" max="100" value="100" step="1" />
+                            <div class="selectDef">
+                                <span>30%</span>
+                                <span>100%</span>
+                            </div>
+                        </div>
+                        <div>
+                            <span data-i18n-key="typeSpeed"></span>
+                            <input type="range" id="textSpeed" min="0" max="100" value="100" step="1" />
+                            <div class="selectDef">
+                                <span data-i18n-key="slow"></span>
+                                <span data-i18n-key="fast"></span>
+                            </div>
+                        </div>
+                        <div>
+                            <span><span data-i18n-key="continuousLen"></span>: <span id="contLenWrap"></span><span
+                                    data-i18n-key="msgAbbr"></span></span>
+                            <input type="range" id="contLength" min="0" max="50" value="25" step="1" />
+                            <div class="selectDef">
+                                <span>0</span>
+                                <span>50</span>
+                            </div>
+                        </div>
+                        <div>
+                            <span class="inlineTitle" data-i18n-key="longReply"></span>
+                            <label class="switch-slide">
+                                <input type="checkbox" id="enableLongReply" hidden />
+                                <label for="enableLongReply" class="switch-slide-label"></label>
+                            </label>
+                        </div>
+                    </div>
+                    <div id="speechOption" style="display: none;">
+                        <div class="presetSelect presetModelCls">
+                            <label for="preSetService" data-i18n-key="ttsService"></label>
+                            <select id="preSetService">
+                                <option value="4" data-i18n-key="openaiTTS"></option>
+                                <option value="3" data-i18n-key="azureTTS"></option>
+                                <option selected value="2" data-i18n-key="edgeTTS"></option>
+                                <option value="1" data-i18n-key="systemTTS"></option>
+                            </select>
+                        </div>
+                        <div class="presetSelect presetModelCls">
+                            <label for="preSetAzureRegion" data-i18n-key="azureRegion"></label>
+                            <select id="preSetAzureRegion">
+                            </select>
+                        </div>
+                        <div>
+                            <div data-i18n-key="azureKey"></div>
+                            <input class="inputTextClass" type="password" data-i18n-place="azureKey" id="azureKeyInput"
+                                autocomplete="off" />
+                        </div>
+                        <div id="checkVoiceLoad" class="checkLoad" style="display: none;">
+                            <svg>
+                                <use xlink:href="#loadingIcon" />
+                            </svg>
+                            <span data-i18n-key="loadVoice"></span>
+                        </div>
+                        <div id="speechDetail">
+                            <div>
+                                <div class="justSetLine">
+                                    <div data-i18n-key="voiceName" style="line-height: 28px;"></div>
+                                    <div id="voiceTypes">
+                                        <span data-type="0" data-i18n-key="userVoice"></span>
+                                        <span data-type="1" class="selVoiceType" data-i18n-key="replyVoice"></span>
+                                    </div>
+                                </div>
+                                <select id="preSetSpeech">
+                                </select>
+                            </div>
+                            <div>
+                                <div class="justSetLine">
+                                    <input class="inputTextClass" id="testVoiceText" data-i18n-value="TTSTest" value />
+                                </div>
+                                <div class="justSetLine readyTestVoice" id="testVoiceBtn" style="margin-top: 6px;">
+                                    <div class="justSetBtn" onclick="startTestVoice()">
+                                        <svg width="18" height="18">
+                                            <use xlink:href="#readyVoiceIcon" />
+                                        </svg>
+                                        <span data-i18n-key="play"></span>
+                                    </div>
+                                    <div class="justSetBtn" onclick="pauseTestVoice()">
+                                        <svg width="18" height="18">
+                                            <use xlink:href="#pauseVoiceIcon" />
+                                        </svg>
+                                        <span data-i18n-key="pause"></span>
+                                    </div>
+                                    <div class="justSetBtn" onclick="resumeTestVoice()">
+                                        <svg width="18" height="18">
+                                            <use xlink:href="#resumeVoiceIcon" />
+                                        </svg>
+                                        <span data-i18n-key="resume"></span>
+                                    </div>
+                                    <div class="justSetBtn" style="margin-right: 130px" onclick="stopTestVoice()">
+                                        <svg width="18" height="18">
+                                            <use xlink:href="#stopIcon" />
+                                        </svg>
+                                        <span data-i18n-key="stop"></span>
+                                    </div>
+                                </div>
+                            </div>
+                            <div class="justSetLine presetSelect" id="azureExtra" style="display:none;">
+                                <div class="presetModelCls">
+                                    <label for="preSetVoiceStyle" data-i18n-key="style"></label>
+                                    <select id="preSetVoiceStyle">
+                                    </select>
+                                </div>
+                                <div class="presetModelCls">
+                                    <label for="preSetVoiceRole" data-i18n-key="role"></label>
+                                    <select id="preSetVoiceRole">
+                                    </select>
+                                </div>
+                            </div>
+                            <div>
+                                <span data-i18n-key="volume"></span>
+                                <input type="range" id="voiceVolume" min="0.1" max="1.9" value="1" step="0.1" />
+                                <div class="selectDef">
+                                    <span data-i18n-key="low"></span>
+                                    <span data-i18n-key="high"></span>
+                                </div>
+                            </div>
+                            <div>
+                                <span data-i18n-key="rate"></span>
+                                <input type="range" id="voiceRate" min="0.1" max="1.9" value="1" step="0.1" />
+                                <div class="selectDef">
+                                    <span data-i18n-key="slow"></span>
+                                    <span data-i18n-key="fast"></span>
+                                </div>
+                            </div>
+                            <div>
+                                <span data-i18n-key="pitch"></span>
+                                <input type="range" id="voicePitch" min="0.1" max="1.9" value="1" step="0.1" />
+                                <div class="selectDef">
+                                    <span data-i18n-key="neutral"></span>
+                                    <span data-i18n-key="intense"></span>
+                                </div>
+                            </div>
+                            <div>
+                                <span class="inlineTitle" data-i18n-key="contSpeech"></span>
+                                <label class="switch-slide">
+                                    <input type="checkbox" id="enableContVoice" checked="true" hidden />
+                                    <label for="enableContVoice" class="switch-slide-label"></label>
+                                </label>
+                            </div>
+                            <div>
+                                <span class="inlineTitle" data-i18n-key="autoSpeech"></span>
+                                <label class="switch-slide">
+                                    <input type="checkbox" id="enableAutoVoice" hidden />
+                                    <label for="enableAutoVoice" class="switch-slide-label"></label>
+                                </label>
+                            </div>
+                        </div>
+                    </div>
+                    <div id="recOption" style="display: none;">
+                        <div class="presetSelect presetModelCls">
+                            <label for="preRecService" data-i18n-key="sttService"></label>
+                            <select id="preRecService">
+                                <option value="3" data-i18n-key="openaiTTS"></option>
+                                <option value="2" data-i18n-key="azureTTS"></option>
+                                <option value="1" data-i18n-key="systemTTS"></option>
+                            </select>
+                        </div>
+                        <div>
+                            <div class="presetSelect presetModelCls">
+                                <label for="preRecAzureRegion" data-i18n-key="azureRegion"></label>
+                                <select id="preRecAzureRegion">
+                                </select>
+                            </div>
+                            <div data-i18n-key="azureKey"></div>
+                            <input class="inputTextClass" type="password" data-i18n-place="azureKey"
+                                id="azureRecKeyInput" autocomplete="off" />
+                        </div>
+                        <div id="checkRecLoad" class="checkLoad" style="display: none;">
+                            <svg>
+                                <use xlink:href="#loadingIcon" />
+                            </svg>
+                            <span data-i18n-key="loadRecVoice"></span>
+                        </div>
+                        <div id="noRecTip" style="display: none; margin-top: 15px" data-i18n-key="unsupportRecTip">
+                        </div>
+                        <div id="recDetail" style="display: none;">
+                            <div class="presetSelect presetModelCls">
+                                <label for="selectLangOption" data-i18n-key="lang"></label>
+                                <select id="selectLangOption">
+                                </select>
+                            </div>
+                            <div class="presetSelect presetModelCls">
+                                <label for="selectDiaOption" data-i18n-key="dialect"></label>
+                                <select id="selectDiaOption">
+                                </select>
+                            </div>
+                            <div data-feat="forStream">
+                                <div data-i18n-key="autoSendKey"></div>
+                                <input class="inputTextClass" id="autoSendText" autocomplete="off"
+                                    data-i18n-place="send" placeholder />
+                            </div>
+                            <div data-feat="forStream">
+                                <div data-i18n-key="autoStopKey"></div>
+                                <input class="inputTextClass" id="autoStopText" autocomplete="off"
+                                    data-i18n-place="stop" placeholder />
+                            </div>
+                            <div data-feat="forStream">
+                                <span data-i18n-key="autoSendDelay"></span>
+                                <input type="range" id="autoSendTimeout" min="0" max="10" value="0" step="1" />
+                                <div class="selectDef">
+                                    <span>0<span data-i18n-key="second"></span></span>
+                                    <span>10<span data-i18n-key="second"></span></span>
+                                </div>
+                            </div>
+                            <div data-i18n-key="noStreamTip" data-feat="forNoStream"></div>
+                            <div>
+                                <span class="inlineTitle" data-i18n-key="keepListenMic"></span>
+                                <label class="switch-slide">
+                                    <input type="checkbox" id="keepListenMic" hidden /><!-- off by default: "checked" is a boolean attribute, so even checked="false" would tick the box -->
+                                    <label for="keepListenMic" class="switch-slide-label"></label>
+                                </label>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div class="messages">
+                <div id="chatlog"></div>
+                <div id="stopChat"><svg width="24" height="24">
+                        <use xlink:href="#stopResIcon" />
+                    </svg><span data-i18n-key="stop"></span></div>
+            </div>
+            <div class="bottom_wrapper clearfix">
+                <div class="message_input_wrapper">
+                    <textarea class="message_input_text" autocomplete="off" spellcheck="false" data-i18n-place="askTip"
+                        placeholder id="chatinput"></textarea>
+                    <div id="voiceRec" style="display:none;">
+                        <div id="voiceRecIcon" style="display:none;">
+                            <svg viewBox="0 0 48 48" id="voiceInputIcon">
+                                <g fill="none" stroke="currentColor" stroke-linejoin="round" stroke-width="4">
+                                    <rect fill="none" width="14" height="27" x="17" y="4" rx="7" />
+                                    <rect class="animVoice" x="18" y="4" width="12" height="27" stroke="none"
+                                        fill="currentColor"></rect>
+                                    <path stroke-linecap="round"
+                                        d="M9 23c0 8.284 6.716 15 15 15c8.284 0 15-6.716 15-15M24 38v6" />
+                                </g>
+                            </svg>
+                        </div>
+                        <div id="voiceRecSetting">
+                            <select id="select_language" style="margin-bottom: 4px;"></select>
+                            <select id="select_dialect"></select>
+                        </div>
+                    </div>
+                </div>
+                <button class="loaded" id="sendbutton">
+                    <span data-i18n-key="send"></span>
+                    <svg style="margin:0 auto;height:40px;width:100%;">
+                        <use xlink:href="#loadingIcon" />
+                    </svg>
+                </button>
+                <button class="clearConv" data-i18n-title="clearChat" title>
+                    <svg style="color: #e15b64;" width="29" height="29">
+                        <use xlink:href="#closeIcon" />
+                    </svg>
+                    <svg width="21" height="21">
+                        <use xlink:href="#deleteIcon" />
+                    </svg>
+                </button>
+            </div>
+        </div>
+    </div>
+    <div id="sysMask">
+        <div id="sysDialog">
+            <div id="closeSet">
+                <svg width="24" height="24">
+                    <use xlink:href="#closeIcon" />
+                </svg>
+            </div>
+            <div class="sysTitle" data-i18n-key="setting"></div>
+            <div class="sysContent">
+                <div class="sysSwitch">
+                    <div data-id="generalOption" class="activeSwitch">
+                        <svg width="24" height="24">
+                            <use xlink:href="#settingIcon" />
+                        </svg><span data-i18n-key="general"></span>
+                    </div>
+                    <div data-id="hotkeyOption">
+                        <svg width="24" height="24">
+                            <use xlink:href="#hotkeyIcon" />
+                        </svg><span data-i18n-key="hotkey"></span>
+                    </div>
+                    <div data-id="dataOption">
+                        <svg width="24" height="24">
+                            <use xlink:href="#databaseIcon" />
+                        </svg><span data-i18n-key="data"></span>
+                    </div>
+                </div>
+                <div class="sysDetail">
+                    <div id="generalOption">
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="theme"></div>
+                            <div class="setDetail themeDetail lightTheme" id="setLight">
+                                <div data-i18n-title="darkTheme" title>
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#darkThemeIcon"></use>
+                                    </svg>
+                                </div>
+                                <div data-i18n-title="lightTheme" title>
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#lightThemeIcon"></use>
+                                    </svg>
+                                </div>
+                                <div data-i18n-title="autoWord" title>
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#autoThemeIcon"></use>
+                                    </svg>
+                                </div>
+                            </div>
+                        </div>
+                        <div class="setContent" id="autoDetail" style="display: none;font-size: 15px;">
+                            <div class="setDetail">
+                                <div class="autoSelect">
+                                    <input type="radio" id="autoTheme1" name="autoLight" value="1" checked />
+                                    <label for="autoTheme1" data-i18n-key="systemTheme"></label>
+                                </div>
+                                <div class="autoSelect" style="margin-top: 8px;">
+                                    <input type="radio" id="autoTheme0" name="autoLight" value="0" />
+                                    <label for="autoTheme0" data-i18n-key="customDarkTheme"></label>
+                                </div>
+                                <div id="customAutoSet" style="display: none; margin-top: 10px;">
+                                    <div>
+                                        <label for="customStart" data-i18n-key="startDark"></label>
+                                        <input type="time" id="customStart" required>
+                                    </div>
+                                    <div style="margin-top: 10px;">
+                                        <label for="customEnd" data-i18n-key="endDark"></label>
+                                        <input type="time" id="customEnd" required>
+                                    </div>
+                                </div>
+                            </div>
+                        </div>
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="lang"></div>
+                            <div class="setDetail themeDetail langDetail" id="setLang">
+                                <div title="English">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#enIcon"></use>
+                                    </svg>
+                                </div>
+                                <div title="中文">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#zhIcon"></use>
+                                    </svg>
+                                </div>
+                            </div>
+                        </div>
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="about"></div>
+                            <div class="setDetail aboutDetail">
+                                <div data-i18n-key="about-d"></div>
+                            </div>
+                        </div>
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="contact"></div>
+                            <div class="setDetail aboutDetail">
+                                <a href="mailto:i@axgln.net" target="_blank" rel="noopener noreferrer">E-mail: i@axgln.net</a>
+                            </div>
+                        </div>
+                        <div id="hide-settings" style="display: none;">
+                            <div class="modelSwitch">
+                                <div data-id="gptOption" class="activeSwitch">ChatGPT</div>
+                                <div data-id="geminiOption">Gemini</div>
+                                <div data-id="claudeOption">Claude</div>
+                            </div>
+                            <div class="apiSelsContainer">
+                                <div id="apiSelect" tabindex="-1" style="display: none;"></div>
+                            </div>
+                            <div id="gptOption">
+                                <div class="setContent setNotNormalFlow">
+                                    <div class="setTitle"><span>ChatGPT</span><span data-i18n-key="aiEndpoint"></span></div>
+                                    <div class="setDetail inputDetail" style="position: relative;">
+                                        <input class="inputTextClass" placeholder="https://api.openai.com/"
+                                            autocomplete="off" id="apiHostInput" value="/" />
+                                    </div>
+                                </div>
+                                <div class="setContent">
+                                    <div class="setTitle"><span data-i18n-key="aiKey"></span></div>
+                                    <div class="setDetail inputDetail">
+                                        <input class="inputTextClass" type="password" placeholder="sk-xxxxxx" id="keyInput"
+                                            autocomplete="off" value="test-key" />
+                                    </div>
+                                </div>
+                                <div class="setContent">
+                                    <div class="setTitle" data-i18n-key="aiModel"></div>
+                                    <div class="setDetail inputDetail">
+                                        <input class="inputTextClass" type="text" placeholder="gpt-x-xxx" id="modelInput"
+                                            autocomplete="off" />
+                                    </div>
+                                </div>
+                            </div>
+                            <div id="geminiOption" style="display: none;">
+                                <div class="setContent setNotNormalFlow">
+                                    <div class="setTitle"><span>Gemini</span><span data-i18n-key="aiEndpoint"></span></div>
+                                    <div class="setDetail inputDetail" style="position: relative;">
+                                        <input class="inputTextClass"
+                                            placeholder="https://generativelanguage.googleapis.com/" autocomplete="off"
+                                            id="geminiApiHostInput" />
+                                    </div>
+                                </div>
+                                <div class="setContent">
+                                    <div class="setTitle" data-i18n-key="aiKey"></div>
+                                    <div class="setDetail inputDetail">
+                                        <input class="inputTextClass" type="password" placeholder="xxxx-xxxx"
+                                            id="geminiKeyInput" autocomplete="off" />
+                                    </div>
+                                </div>
+                                <div class="setContent">
+                                    <div class="setTitle" data-i18n-key="aiModel"></div>
+                                    <div class="setDetail inputDetail">
+                                        <input class="inputTextClass" type="text" placeholder="gemini-xxx-xxx"
+                                            id="geminiModelInput" autocomplete="off" />
+                                    </div>
+                                </div>
+                            </div>
+                            <!-- Claude provider settings: API endpoint, API key and model name.
+                                 Hidden by default through the inline display: none. -->
+                            <div id="claudeOption" style="display: none;">
+                                <!-- Endpoint row: fixed "Claude" prefix followed by a title span that carries
+                                     data-i18n-key="aiEndpoint" (presumably filled by the i18n script; confirm). -->
+                                <div class="setContent setNotNormalFlow">
+                                    <div class="setTitle"><span>Claude</span><span data-i18n-key="aiEndpoint"></span></div>
+                                    <div class="setDetail inputDetail" style="position: relative;">
+                                        <input class="inputTextClass" placeholder="https://api.anthropic.com/"
+                                            autocomplete="off" id="claudeApiHostInput" />
+                                    </div>
+                                </div>
+                                <!-- API key row: type="password" masks the typed value. -->
+                                <div class="setContent">
+                                    <div class="setTitle" data-i18n-key="aiKey"></div>
+                                    <div class="setDetail inputDetail">
+                                        <input class="inputTextClass" type="password" placeholder="sk-xxxxxx"
+                                            id="claudeKeyInput" autocomplete="off" />
+                                    </div>
+                                </div>
+                                <!-- Model row: free-text model name. -->
+                                <div class="setContent">
+                                    <div class="setTitle" data-i18n-key="aiModel"></div>
+                                    <div class="setDetail inputDetail">
+                                        <input class="inputTextClass" type="text" placeholder="claude-x-xxx-xxxx"
+                                            id="claudeModelInput" autocomplete="off" />
+                                    </div>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <!-- Hotkey settings panel, hidden by default through the inline display: none.
+                         Every <select> below is declared without options; they are presumably
+                         populated by script (TODO confirm). Labels carry data-i18n-key attributes
+                         and no literal text. -->
+                    <div id="hotkeyOption" style="display: none;">
+                        <div class="setContent">
+                            <!-- Group 1: UI hotkeys (navigation, full window, theme, language). -->
+                            <div class="setTitle">UI</div>
+                            <div class="setDetail hotKeyDetail">
+                                <div>
+                                    <label for="hotKeyNav" data-i18n-key="navKey"></label>
+                                    <select id="hotKeyNav">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyWindow" data-i18n-key="fullKey"></label>
+                                    <select id="hotKeyWindow">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyTheme" data-i18n-key="themeKey"></label>
+                                    <select id="hotKeyTheme">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyLang" data-i18n-key="langKey"></label>
+                                    <select id="hotKeyLang">
+                                    </select>
+                                </div>
+                            </div>
+                            <!-- Group 2: chat hotkeys (search, input, new chat, clear chat). -->
+                            <div class="setTitle" data-i18n-key="chat"></div>
+                            <div class="setDetail hotKeyDetail">
+                                <div>
+                                    <label for="hotKeySearch" data-i18n-key="search"></label>
+                                    <select id="hotKeySearch">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyInput" data-i18n-key="inputKey"></label>
+                                    <select id="hotKeyInput">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyNewChat" data-i18n-key="newChat"></label>
+                                    <select id="hotKeyNewChat">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyClearChat" data-i18n-key="clearChat"></label>
+                                    <select id="hotKeyClearChat">
+                                    </select>
+                                </div>
+                            </div>
+                            <!-- Group 3: voice hotkeys. -->
+                            <div class="setTitle" data-i18n-key="voiceKey"></div>
+                            <div class="setDetail hotKeyDetail">
+                                <!-- The voice-record row stays in the DOM but is hidden with inline display: none. -->
+                                <div style="display: none;">
+                                    <label for="hotKeyVoiceRec" data-i18n-key="recKey"></label>
+                                    <select id="hotKeyVoiceRec">
+                                    </select>
+                                </div>
+                                <div>
+                                    <label for="hotKeyVoiceSpeak" data-i18n-key="speechKey"></label>
+                                    <select id="hotKeyVoiceSpeak">
+                                    </select>
+                                </div>
+                            </div>
+                            <!-- Reset control: the shared refresh icon, mirrored horizontally with scaleX(-1). -->
+                            <div class="setDetail keyOptionDetail">
+                                <div id="resetHotKey">
+                                    <svg width="24" height="24" style="transform: scaleX(-1)">
+                                        <use xlink:href="#refreshIcon" />
+                                    </svg>
+                                    <span data-i18n-key="resetTip"></span>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <!-- Data management panel, hidden by default through the inline display: none.
+                         Three sections: chat data, settings data, and a local-storage usage meter. -->
+                    <div id="dataOption" style="display: none;">
+                        <!-- Chat data: export, import and clear controls. -->
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="chat"></div>
+                            <div class="setDetail dataDetail">
+                                <div id="exportChat">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#exportIcon" />
+                                    </svg>
+                                    <span data-i18n-key="export"></span>
+                                </div>
+                                <!-- The label is the visible control for the hidden file input that follows it
+                                     (linked through for="importChatInput"); the input accepts application/json. -->
+                                <label id="importChat" for="importChatInput">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#importIcon" />
+                                    </svg>
+                                    <span data-i18n-key="import"></span>
+                                </label>
+                                <input type="file" style="display: none;" id="importChatInput"
+                                    accept="application/json" />
+                                <div id="clearChatSet">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#clearAllIcon" />
+                                    </svg>
+                                    <span data-i18n-key="clear"></span>
+                                </div>
+                            </div>
+                        </div>
+                        <!-- Settings data: export, import and reset controls, laid out like the chat section. -->
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="setting"></div>
+                            <div class="setDetail dataDetail">
+                                <div id="exportSet">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#exportSetIcon" />
+                                    </svg>
+                                    <span data-i18n-key="export"></span>
+                                </div>
+                                <!-- Label / hidden file input pair, same pattern as the chat import above. -->
+                                <label id="importSet" for="importSetInput">
+                                    <svg width="24" height="24">
+                                        <use xlink:href="#importSetIcon" />
+                                    </svg>
+                                    <span data-i18n-key="import"></span>
+                                </label>
+                                <input type="file" style="display: none;" id="importSetInput"
+                                    accept="application/json" />
+                                <div id="resetSet">
+                                    <svg width="24" height="24" style="transform: scaleX(-1)">
+                                        <use xlink:href="#refreshIcon" />
+                                    </svg>
+                                    <span data-i18n-key="reset"></span>
+                                </div>
+                            </div>
+                        </div>
+                        <!-- Local storage meter: a progress bar plus "used" / "available" figures. The bar and
+                             both value spans are empty in the markup; presumably script fills them (confirm). -->
+                        <div class="setContent">
+                            <div class="setTitle" data-i18n-key="localStore"></div>
+                            <div class="setDetail">
+                                <div class="progressBar">
+                                    <div class="nowProgress" id="usedStorageBar"></div>
+                                </div>
+                                <div class="progressDetail">
+                                    <div><span data-i18n-key="used"></span><span id="usedStorage"></span></div>
+                                    <div><span data-i18n-key="available"></span><span id="availableStorage"></span>
+                                    </div>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <div>
+
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    <link crossorigin="anonymous"
+        href="https://fastly.jsdelivr.net/npm/github-markdown-css@5.5.1/github-markdown-light.min.css" rel="stylesheet">
+    <link crossorigin="anonymous"
+        href="https://fastly.jsdelivr.net/gh/highlightjs/cdn-release@11.9.0/build/styles/github.min.css"
+        rel="stylesheet">
+    <link crossorigin="anonymous" href="https://fastly.jsdelivr.net/npm/notyf@3.10.0/notyf.min.css" rel="stylesheet">
+    <script crossorigin="anonymous" src="https://fastly.jsdelivr.net/npm/notyf@3.10.0/notyf.min.js"></script>
+    <script>
+        const notyf = new Notyf({
+            position: { x: "center", y: "top" },
+            types: [
+                {
+                    type: "success",
+                    background: "#99c959",
+                    duration: 2000,
+                },
+                {
+                    type: "warning",
+                    background: "#f8b26a",
+                    duration: 3000
+                },
+                {
+                    type: "error",
+                    background: "#e15b64",
+                    duration: 3000,
+                }
+            ]
+        });
+        const registerSW = () => {
+            if ("serviceWorker" in navigator) {
+                navigator.serviceWorker.register("sw.js" + location.search).then(reg => console.log("Service worker register succeeded"),
+                    error => console.error(`Service worker register failed: ${error}`))
+            }
+        };
+        window.addEventListener("load", () => registerSW());
+        const isMobile = navigator.userAgent.match(/iPhone|iPad|iPod|Android|BlackBerry|webOS/);
+        if (isMobile) {
+            const script = document.createElement("script");
+            script.src = "https://fastly.jsdelivr.net/gh/timruffles/mobile-drag-drop@3.0.0-rc.0/release/index.min.js";
+            script.crossOrigin = "anonymous";
+            script.defer = true;
+            script.onload = () => {
+                MobileDragDrop.polyfill();
+            }
+            document.body.appendChild(script);
+            const link = document.createElement("link");
+            link.crossOrigin = "anonymous";
+            link.rel = "stylesheet";
+            link.href = "https://fastly.jsdelivr.net/gh/timruffles/mobile-drag-drop@3.0.0-rc.0/release/default.css";
+            document.body.appendChild(link);
+        }
+        let envAPIEndpoint, envAPIKey, envAPIModel, envGeminiAPIEndpoint, envGeminiAPIKey, envGeminiAPIModel, envClaudeAPIEndpoint, envClaudeAPIKey, envClaudeAPIModel;
+    </script>
+    <script src="env.js"></script>
+    <script>
+        // Vendored, minified excerpt from fflate@0.8.2; exposes deflateSync() and inflateSync() to the rest of the page.
+        {const t=Uint8Array,e=Uint16Array,n=Int32Array,r=new t([0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0,0]),o=new t([0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0]),l=new t([16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15]),s=(t,r)=>{const o=new e(31);for(let e=0;e<31;++e)o[e]=r+=1<<t[e-1];const l=new n(o[30]);for(let t=1;t<30;++t)for(let e=o[t];e<o[t+1];++e)l[e]=e-o[t]<<5|t;return{b:o,r:l}},{b:f,r:c}=s(r,2);f[28]=258,c[258]=28;const{b:i,r:a}=s(o,0),h=new e(32768);for(let t=0;t<32768;++t){let e=(43690&t)>>1|(21845&t)<<1;e=(52428&e)>>2|(13107&e)<<2,e=(61680&e)>>4|(3855&e)<<4,h[t]=((65280&e)>>8|(255&e)<<8)>>1}const u=(t,n,r)=>{const o=t.length;let l=0;const s=new e(n);for(;l<o;++l)t[l]&&++s[t[l]-1];const f=new e(n);for(l=1;l<n;++l)f[l]=f[l-1]+s[l-1]<<1;let c;if(r){c=new e(1<<n);const r=15-n;for(l=0;l<o;++l)if(t[l]){const e=l<<4|t[l],o=n-t[l];let s=f[t[l]-1]++<<o;for(const t=s|(1<<o)-1;s<=t;++s)c[h[s]>>r]=e}}else for(c=new e(o),l=0;l<o;++l)t[l]&&(c[l]=h[f[t[l]-1]++]>>15-t[l]);return c},w=new t(288);for(let t=0;t<144;++t)w[t]=8;for(let t=144;t<256;++t)w[t]=9;for(let t=256;t<280;++t)w[t]=7;for(let t=280;t<288;++t)w[t]=8;const g=new t(32);for(let t=0;t<32;++t)g[t]=5;const b=u(w,9,0),d=u(w,9,1),m=u(g,5,0),y=u(g,5,1),M=t=>{let e=t[0];for(let n=1;n<t.length;++n)t[n]>e&&(e=t[n]);return e},p=(t,e,n)=>{const r=e/8|0;return(t[r]|t[r+1]<<8)>>(7&e)&n},k=(t,e)=>{const n=e/8|0;return(t[n]|t[n+1]<<8|t[n+2]<<16)>>(7&e)},v=t=>(t+7)/8|0,x=(e,n,r)=>((null==n||n<0)&&(n=0),(null==r||r>e.length)&&(r=e.length),new t(e.subarray(n,r))),E=["unexpected EOF","invalid block type","invalid length/literal","invalid distance","stream finished","no stream handler",,"no callback","invalid UTF-8 data","extra field too long","date not in range 1980-2099","filename too long","stream finishing","invalid zip data"],A=(t,e,n)=>{const r=new Error(e||E[t]);if(r.code=t,Error.captureStackTrace&&Error.captureStackTrace(r,A),!n)throw r;return r},T=(e,n,s,c)=>{const 
a=e.length,h=c?c.length:0;if(!a||n.f&&!n.l)return s||new t(0);const w=!s,g=w||2!=n.i,b=n.i;w&&(s=new t(3*a));const m=e=>{let n=s.length;if(e>n){const r=new t(Math.max(2*n,e));r.set(s),s=r}};let E=n.f||0,T=n.p||0,U=n.b||0,z=n.l,F=n.d,S=n.m,I=n.n;const O=8*a;do{if(!z){E=p(e,T,1);const r=p(e,T+1,3);if(T+=3,!r){const t=v(T)+4,r=e[t-4]|e[t-3]<<8,o=t+r;if(o>a){b&&A(0);break}g&&m(U+r),s.set(e.subarray(t,o),U),n.b=U+=r,n.p=T=8*o,n.f=E;continue}if(1==r)z=d,F=y,S=9,I=5;else if(2==r){const n=p(e,T,31)+257,r=p(e,T+10,15)+4,o=n+p(e,T+5,31)+1;T+=14;const s=new t(o),f=new t(19);for(let t=0;t<r;++t)f[l[t]]=p(e,T+3*t,7);T+=3*r;const c=M(f),i=(1<<c)-1,a=u(f,c,1);for(let t=0;t<o;){const n=a[p(e,T,i)];T+=15&n;const r=n>>4;if(r<16)s[t++]=r;else{let n=0,o=0;for(16==r?(o=3+p(e,T,3),T+=2,n=s[t-1]):17==r?(o=3+p(e,T,7),T+=3):18==r&&(o=11+p(e,T,127),T+=7);o--;)s[t++]=n}}const h=s.subarray(0,n),w=s.subarray(n);S=M(h),I=M(w),z=u(h,S,1),F=u(w,I,1)}else A(1);if(T>O){b&&A(0);break}}g&&m(U+131072);const w=(1<<S)-1,x=(1<<I)-1;let j=T;for(;;j=T){const t=z[k(e,T)&w],n=t>>4;if(T+=15&t,T>O){b&&A(0);break}if(t||A(2),n<256)s[U++]=n;else{if(256==n){j=T,z=null;break}{let t=n-254;if(n>264){const o=n-257,l=r[o];t=p(e,T,(1<<l)-1)+f[o],T+=l}const l=F[k(e,T)&x],a=l>>4;l||A(3),T+=15&l;let u=i[a];if(a>3){const t=o[a];u+=k(e,T)&(1<<t)-1,T+=t}if(T>O){b&&A(0);break}g&&m(U+131072);const w=U+t;if(U<u){const t=h-u,e=Math.min(u,w);for(t+U<0&&A(3);U<e;++U)s[U]=c[t+U]}for(;U<w;++U)s[U]=s[U-u]}}}n.l=z,n.p=j,n.b=U,n.f=E,z&&(E=1,n.m=S,n.d=F,n.n=I)}while(!E);return U!=s.length&&w?x(s,0,U):s.subarray(0,U)},U=(t,e,n)=>{n<<=7&e;const r=e/8|0;t[r]|=n,t[r+1]|=n>>8},z=(t,e,n)=>{n<<=7&e;const r=e/8|0;t[r]|=n,t[r+1]|=n>>8,t[r+2]|=n>>16},F=(n,r)=>{const o=[];for(let t=0;t<n.length;++t)n[t]&&o.push({s:t,f:n[t]});const l=o.length,s=o.slice();if(!l)return{t:C,l:0};if(1==l){const e=new t(o[0].s+1);return e[o[0].s]=1,{t:e,l:1}}o.sort(((t,e)=>t.f-e.f)),o.push({s:-1,f:25001});let 
f=o[0],c=o[1],i=0,a=1,h=2;for(o[0]={s:-1,f:f.f+c.f,l:f,r:c};a!=l-1;)f=o[o[i].f<o[h].f?i++:h++],c=o[i!=a&&o[i].f<o[h].f?i++:h++],o[a++]={s:-1,f:f.f+c.f,l:f,r:c};let u=s[0].s;for(let t=1;t<l;++t)s[t].s>u&&(u=s[t].s);const w=new e(u+1);let g=S(o[a-1],w,0);if(g>r){let t=0,e=0;const n=g-r,o=1<<n;for(s.sort(((t,e)=>w[e.s]-w[t.s]||t.f-e.f));t<l;++t){const n=s[t].s;if(!(w[n]>r))break;e+=o-(1<<g-w[n]),w[n]=r}for(e>>=n;e>0;){const n=s[t].s;w[n]<r?e-=1<<r-w[n]++-1:++t}for(;t>=0&&e;--t){const n=s[t].s;w[n]==r&&(--w[n],++e)}g=r}return{t:new t(w),l:g}},S=(t,e,n)=>-1==t.s?Math.max(S(t.l,e,n+1),S(t.r,e,n+1)):e[t.s]=n,I=t=>{let n=t.length;for(;n&&!t[--n];);const r=new e(++n);let o=0,l=t[0],s=1;const f=t=>{r[o++]=t};for(let e=1;e<=n;++e)if(t[e]==l&&e!=n)++s;else{if(!l&&s>2){for(;s>138;s-=138)f(32754);s>2&&(f(s>10?s-11<<5|28690:s-3<<5|12305),s=0)}else if(s>3){for(f(l),--s;s>6;s-=6)f(8304);s>2&&(f(s-3<<5|8208),s=0)}for(;s--;)f(l);s=1,l=t[e]}return{c:r.subarray(0,o),n:n}},O=(t,e)=>{let n=0;for(let r=0;r<e.length;++r)n+=t[r]*e[r];return n},j=(t,e,n)=>{const r=n.length,o=v(e+2);t[o]=255&r,t[o+1]=r>>8,t[o+2]=255^t[o],t[o+3]=255^t[o+1];for(let e=0;e<r;++e)t[o+e+4]=n[e];return 8*(o+4+r)},q=(t,n,s,f,c,i,a,h,d,y,M)=>{U(n,M++,s),++c[256];const{t:p,l:k}=F(c,15),{t:v,l:x}=F(i,15),{c:E,n:A}=I(p),{c:T,n:S}=I(v),q=new e(19);for(let t=0;t<E.length;++t)++q[31&E[t]];for(let t=0;t<T.length;++t)++q[31&T[t]];const{t:B,l:C}=F(q,7);let D=19;for(;D>4&&!B[l[D-1]];--D);const G=y+5<<3,H=O(c,w)+O(i,g)+a,J=O(c,p)+O(i,v)+a+14+3*D+O(q,B)+2*q[16]+3*q[17]+7*q[18];if(d>=0&&G<=H&&G<=J)return j(n,M,t.subarray(d,d+y));let K,L,N,P;if(U(n,M,1+(J<H)),M+=2,J<H){K=u(p,k,0),L=p,N=u(v,x,0),P=v;const t=u(B,C,0);U(n,M,A-257),U(n,M+5,S-1),U(n,M+10,D-4),M+=14;for(let t=0;t<D;++t)U(n,M+3*t,B[l[t]]);M+=3*D;const e=[E,T];for(let r=0;r<2;++r){const o=e[r];for(let e=0;e<o.length;++e){const r=31&o[e];U(n,M,t[r]),M+=B[r],r>15&&(U(n,M,o[e]>>5&127),M+=o[e]>>12)}}}else K=b,L=w,N=m,P=g;for(let t=0;t<h;++t){const e=f[t];if(e>255){const 
t=e>>18&31;z(n,M,K[t+257]),M+=L[t+257],t>7&&(U(n,M,e>>23&31),M+=r[t]);const l=31&e;z(n,M,N[l]),M+=P[l],l>3&&(z(n,M,e>>5&8191),M+=o[l])}else z(n,M,K[e]),M+=L[e]}return z(n,M,K[256]),M+L[256]},B=new n([65540,131080,131088,131104,262176,1048704,1048832,2114560,2117632]),C=new t(0),D=(l,s,f,i,h,u)=>{const w=u.z||l.length,g=new t(i+w+5*(1+Math.ceil(w/7e3))+h),b=g.subarray(i,g.length-h),d=u.l;let m=7&(u.r||0);if(s){m&&(b[0]=u.r>>3);const t=B[s-1],i=t>>13,h=8191&t,g=(1<<f)-1,y=u.p||new e(32768),M=u.h||new e(g+1),p=Math.ceil(f/3),k=2*p,v=t=>(l[t]^l[t+1]<<p^l[t+2]<<k)&g,x=new n(25e3),E=new e(288),A=new e(32);let T=0,U=0,z=u.i||0,F=0,S=u.w||0,I=0;for(;z+2<w;++z){const t=v(z);let e=32767&z,n=M[t];if(y[e]=n,M[t]=e,S<=z){const s=w-z;if((T>7e3||F>24576)&&(s>423||!d)){m=q(l,b,0,x,E,A,U,F,I,z-I,m),F=T=U=0,I=z;for(let t=0;t<286;++t)E[t]=0;for(let t=0;t<30;++t)A[t]=0}let f=2,u=0,g=h,M=e-n&32767;if(s>2&&t==v(z-M)){const t=Math.min(i,s)-1,r=Math.min(32767,z),o=Math.min(258,s);for(;M<=r&&--g&&e!=n;){if(l[z+f]==l[z+f-M]){let e=0;for(;e<o&&l[z+e]==l[z+e-M];++e);if(e>f){if(f=e,u=M,e>t)break;const r=Math.min(M,e-2);let o=0;for(let t=0;t<r;++t){const e=z-M+t&32767,r=e-y[e]&32767;r>o&&(o=r,n=e)}}}e=n,n=y[e],M+=e-n&32767}}if(u){x[F++]=268435456|c[f]<<18|a[u];const t=31&c[f],e=31&a[u];U+=r[t]+o[e],++E[257+t],++A[e],S=z+f,++T}else x[F++]=l[z],++E[l[z]]}}for(z=Math.max(z,S);z<w;++z)x[F++]=l[z],++E[l[z]];m=q(l,b,d,x,E,A,U,F,I,z-I,m),d||(u.r=7&m|b[m/8|0]<<3,m-=7,u.h=M,u.p=y,u.i=z,u.w=S)}else{for(let t=u.w||0;t<w+d;t+=65535){let e=t+65535;e>=w&&(b[m/8|0]=d,e=w),m=j(b,m+1,l.subarray(t,e))}u.i=w}return x(g,0,i+v(m)+h)},G=(e,n,r,o,l)=>{if(!l&&(l={l:1},n.dictionary)){const r=n.dictionary.subarray(-32768),o=new t(r.length+e.length);o.set(r),o.set(e,r.length),e=o,l.w=r.length}return D(e,null==n.level?6:n.level,null==n.mem?l.l?Math.ceil(1.5*Math.max(8,Math.min(13,Math.log(e.length)))):20:12+n.mem,r,o,l)};function deflateSync(t,e){return G(t,e||{},0,0)}function inflateSync(t,e){return 
T(t,{i:2},e&&e.out,e&&e.dictionary)}};
+        // Vendored, minified copy from markdown-it-texmath@1.0.1; defines the global functions escapeHTML() and texmath().
+        function escapeHTML(e){return e.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;").replace(/'/g,"&apos;").replace(/\//g,"&sol;")}function texmath(e,t){const n=texmath.mergeDelimiters(t&&t.delimiters),a=t&&t.outerSpace||!1,o=t&&t.katexOptions||{};o.throwOnError=o.throwOnError||!1,o.macros=o.macros||t&&t.macros,texmath.katex||(t&&"object"==typeof t.engine?texmath.katex=t.engine:"object"==typeof module?texmath.katex=require("katex"):texmath.katex={renderToString:()=>"No math renderer found."});for(const t of n.inline)a&&"outerSpace"in t&&(t.outerSpace=!0),e.inline.ruler.before("escape",t.name,texmath.inline(t)),e.renderer.rules[t.name]=(e,n)=>t.tmpl.replace(/\$1/,texmath.render(e[n].content,!!t.displayMode,o));for(const t of n.block)e.block.ruler.before("fence",t.name,texmath.block(t)),e.renderer.rules[t.name]=(e,n)=>t.tmpl.replace(/\$2/,escapeHTML(e[n].info)).replace(/\$1/,texmath.render(e[n].content,!0,o))}texmath.mergeDelimiters=function(e){const t=Array.isArray(e)?e:"string"==typeof e?[e]:["dollars"],n={inline:[],block:[]};for(const e of t)e in texmath.rules&&(n.inline.push(...texmath.rules[e].inline),n.block.push(...texmath.rules[e].block));return n},texmath.inline=e=>function(t,n){const a=t.pos,o=t.src,r=o.startsWith(e.tag,e.rex.lastIndex=a)&&(!e.pre||e.pre(o,e.outerSpace,a))&&e.rex.exec(o),s=!!r&&a<e.rex.lastIndex&&(!e.post||e.post(o,e.outerSpace,e.rex.lastIndex-1));if(s){if(!n){const n=t.push(e.name,"math",0);n.content=r[1],n.markup=e.tag}t.pos=e.rex.lastIndex}return s},texmath.block=e=>function(t,n,a,o){const r=t.bMarks[n]+t.tShift[n],s=t.src,m=s.startsWith(e.tag,e.rex.lastIndex=r)&&(!e.pre||e.pre(s,!1,r))&&e.rex.exec(s),l=!!m&&r<e.rex.lastIndex&&(!e.post||e.post(s,!1,e.rex.lastIndex-1));if(l&&!o){const o=e.rex.lastIndex-1;let r;for(r=n;r<a&&!(o>=t.bMarks[r]+t.tShift[r]&&o<=t.eMarks[r]);r++);const 
s=t.lineMax,l=t.parentType;t.lineMax=r,t.parentType="math","blockquote"===l&&(m[1]=m[1].replace(/(\n*?^(?:\s*>)+)/gm,""));let c=t.push(e.name,"math",0);c.block=!0,c.tag=e.tag,c.markup="",c.content=m[1],c.info=m[m.length-1],c.map=[n,r+1],t.parentType=l,t.lineMax=s,t.line=r+1}return l},texmath.render=function(e,t,n){let a;n.displayMode=t;try{a=texmath.katex.renderToString(e,n)}catch(t){a=escapeHTML(`${e}:${t.message}`)}return a},texmath.inlineRuleNames=["math_inline","math_inline_double"],texmath.blockRuleNames=["math_block","math_block_eqno"],texmath.$_pre=(e,t,n)=>{const a=n>0&&e[n-1].charCodeAt(0);return t?!a||32===a:!a||92!==a&&(a<48||a>57)},texmath.$_post=(e,t,n)=>{const a=e[n+1]&&e[n+1].charCodeAt(0);return t?!a||32===a||46===a||44===a||59===a:!a||a<48||a>57},texmath.rules={brackets:{inline:[{name:"math_inline",rex:/\\\((.+?)\\\)/gy,tmpl:"<eq>$1</eq>",tag:"\\("}],block:[{name:"math_block_eqno",rex:/\\\[(((?!\\\]|\\\[)[\s\S])+?)\\\]\s*?\(([^)$\r\n]+?)\)/gmy,tmpl:'<section class="eqno"><eqn>$1</eqn><span>($2)</span></section>',tag:"\\["},{name:"math_block",rex:/\\\[([\s\S]+?)\\\]/gmy,tmpl:"<section><eqn>$1</eqn></section>",tag:"\\["}]},doxygen:{inline:[{name:"math_inline",rex:/\\f\$(.+?)\\f\$/gy,tmpl:"<eq>$1</eq>",tag:"\\f$"}],block:[{name:"math_block_eqno",rex:/\\f\[([^]+?)\\f\]\s*?\(([^)\s]+?)\)/gmy,tmpl:'<section class="eqno"><eqn>$1</eqn><span>($2)</span></section>',tag:"\\f["},{name:"math_block",rex:/\\f\[([^]+?)\\f\]/gmy,tmpl:"<section><eqn>$1</eqn></section>",tag:"\\f["}]},gitlab:{inline:[{name:"math_inline",rex:/\$`(.+?)`\$/gy,tmpl:"<eq>$1</eq>",tag:"$`"}],block:[{name:"math_block_eqno",rex:/`{3}math\s*([^`]+?)\s*?`{3}\s*\(([^)\r\n]+?)\)/gm,tmpl:'<section 
class="eqno"><eqn>$1</eqn><span>($2)</span></section>',tag:"```math"},{name:"math_block",rex:/`{3}math\s*([^`]*?)\s*`{3}/gm,tmpl:"<section><eqn>$1</eqn></section>",tag:"```math"}]},julia:{inline:[{name:"math_inline",rex:/`{2}([^`]+?)`{2}/gy,tmpl:"<eq>$1</eq>",tag:"``"},{name:"math_inline",rex:/\$((?:\S?)|(?:\S.*?\S))\$/gy,tmpl:"<eq>$1</eq>",tag:"$",spaceEnclosed:!1,pre:texmath.$_pre,post:texmath.$_post}],block:[{name:"math_block_eqno",rex:/`{3}math\s+?([^`]+?)\s+?`{3}\s*?\(([^)$\r\n]+?)\)/gmy,tmpl:'<section class="eqno"><eqn>$1</eqn><span>($2)</span></section>',tag:"```math"},{name:"math_block",rex:/`{3}math\s+?([^`]+?)\s+?`{3}/gmy,tmpl:"<section><eqn>$1</eqn></section>",tag:"```math"}]},kramdown:{inline:[{name:"math_inline",rex:/\${2}(.+?)\${2}/gy,tmpl:"<eq>$1</eq>",tag:"$$"}],block:[{name:"math_block_eqno",rex:/\${2}([^$]+?)\${2}\s*?\(([^)\s]+?)\)/gmy,tmpl:'<section class="eqno"><eqn>$1</eqn><span>($2)</span></section>',tag:"$$"},{name:"math_block",rex:/\${2}([^$]+?)\${2}/gmy,tmpl:"<section><eqn>$1</eqn></section>",tag:"$$"}]},beg_end:{inline:[],block:[{name:"math_block",rex:/(\\(?:begin)\{([a-z]+)\}[\s\S]+?\\(?:end)\{\2\})/gmy,tmpl:"<section><eqn>$1</eqn></section>",tag:"\\"}]},dollars:{inline:[{name:"math_inline_double",rex:/\${2}([^$]*?[^\\])\${2}/gy,tmpl:"<section><eqn>$1</eqn></section>",tag:"$$",displayMode:!0,pre:texmath.$_pre,post:texmath.$_post},{name:"math_inline",rex:/\$((?:[^\s\\])|(?:\S.*?[^\s\\]))\$/gy,tmpl:"<eq>$1</eq>",tag:"$",outerSpace:!1,pre:texmath.$_pre,post:texmath.$_post}],block:[{name:"math_block_eqno",rex:/\${2}([^$]*?[^\\])\${2}\s*?\(([^)\s]+?)\)/gmy,tmpl:'<section class="eqno"><eqn>$1</eqn><span>($2)</span></section>',tag:"$$"},{name:"math_block",rex:/\${2}([^$]*?[^\\])\${2}/gmy,tmpl:"<section><eqn>$1</eqn></section>",tag:"$$"}]}};
+        // Vendored, minified build from morphdom@2.7.2; published as window.morphdom.
+        {let e,t=11,l=(e,l)=>{let n,i,r,d,o,a=l.attributes;if(l.nodeType===t||e.nodeType===t)return;for(let t=a.length-1;t>=0;t--)n=a[t],i=n.name,r=n.namespaceURI,d=n.value,r?(i=n.localName||i,o=e.getAttributeNS(r,i),o!==d&&("xmlns"===n.prefix&&(i=n.name),e.setAttributeNS(r,i,d))):(o=e.getAttribute(i),o!==d&&e.setAttribute(i,d));let u=e.attributes;for(let t=u.length-1;t>=0;t--)n=u[t],i=n.name,r=n.namespaceURI,r?(i=n.localName||i,l.hasAttributeNS(r,i)||e.removeAttributeNS(r,i)):l.hasAttribute(i)||e.removeAttribute(i)},n="http://www.w3.org/1999/xhtml",i="undefined"==typeof document?void 0:document,r=!!i&&"content"in i.createElement("template"),d=!!i&&i.createRange&&"createContextualFragment"in i.createRange(),o=e=>{let t=i.createElement("template");return t.innerHTML=e,t.content.childNodes[0]},a=t=>(e||(e=i.createRange(),e.selectNode(i.body)),e.createContextualFragment(t).childNodes[0]),u=e=>{let t=i.createElement("body");return t.innerHTML=e,t.childNodes[0]},s=e=>(e=e.trim(),r?o(e):d?a(e):u(e)),f=(e,t)=>{let l,n,i=e.nodeName,r=t.nodeName;return i===r||(l=i.charCodeAt(0),n=r.charCodeAt(0),l<=90&&n>=97?i===r.toUpperCase():n<=90&&l>=97&&r===i.toUpperCase())},c=(e,t)=>t&&t!==n?i.createElementNS(t,e):i.createElement(e),m=(e,t)=>{let l=e.firstChild;for(;l;){let e=l.nextSibling;t.appendChild(l),l=e}return t},p=(e,t,l)=>{e[l]!==t[l]&&(e[l]=t[l],e[l]?e.setAttribute(l,""):e.removeAttribute(l))},h={OPTION:(e,t)=>{let l=e.parentNode;if(l){let n=l.nodeName.toUpperCase();"OPTGROUP"===n&&(l=l.parentNode,n=l&&l.nodeName.toUpperCase()),"SELECT"!==n||l.hasAttribute("multiple")||(e.hasAttribute("selected")&&!t.selected&&(e.setAttribute("selected","selected"),e.removeAttribute("selected")),l.selectedIndex=-1)}p(e,t,"selected")},INPUT:(e,t)=>{p(e,t,"checked"),p(e,t,"disabled"),e.value!==t.value&&(e.value=t.value),t.hasAttribute("value")||e.removeAttribute("value")},TEXTAREA:(e,t)=>{let l=t.value;e.value!==l&&(e.value=l);let n=e.firstChild;if(n){let 
t=n.nodeValue;if(t==l||!l&&t==e.placeholder)return;n.nodeValue=l}},SELECT:(e,t)=>{if(!t.hasAttribute("multiple")){let t,l,n=-1,i=0,r=e.firstChild;for(;r;)if(l=r.nodeName&&r.nodeName.toUpperCase(),"OPTGROUP"===l)t=r,r=t.firstChild;else{if("OPTION"===l){if(r.hasAttribute("selected")){n=i;break}i++}r=r.nextSibling,!r&&t&&(r=t.nextSibling,t=null)}e.selectedIndex=n}}},N=1,b=11,A=3,C=8,T=()=>{},g=e=>{if(e)return e.getAttribute&&e.getAttribute("id")||e.id},E=e=>function(t,l,n){if(n||(n={}),"string"==typeof l)if("#document"===t.nodeName||"HTML"===t.nodeName||"BODY"===t.nodeName){let e=l;(l=i.createElement("html")).innerHTML=e}else l=s(l);else l.nodeType===b&&(l=l.firstElementChild);let r=n.getNodeKey||g,d=n.onBeforeNodeAdded||T,o=n.onNodeAdded||T,a=n.onBeforeElUpdated||T,u=n.onElUpdated||T,p=n.onBeforeNodeDiscarded||T,E=n.onNodeDiscarded||T,v=n.onBeforeElChildrenUpdated||T,x=n.skipFromChildren||T,y=n.addChild||function(e,t){return e.appendChild(t)},S=!0===n.childrenOnly,U=Object.create(null),O=[],R=e=>{O.push(e)},V=(e,t)=>{if(e.nodeType===N){let l=e.firstChild;for(;l;){let e;t&&(e=r(l))?R(e):(E(l),l.firstChild&&V(l,t)),l=l.nextSibling}}},w=(e,t,l)=>{!1!==p(e)&&(t&&t.removeChild(e),E(e),V(e,l))},I=e=>{if(e.nodeType===N||e.nodeType===b){let t=e.firstChild;for(;t;){let e=r(t);e&&(U[e]=t),I(t),t=t.nextSibling}}};I(t);let P=e=>{o(e);let t=e.firstChild;for(;t;){let e=t.nextSibling,l=r(t);if(l){let e=U[l];e&&f(t,e)?(t.parentNode.replaceChild(e,t),B(e,t)):P(t)}else P(t);t=e}},B=(t,l,n)=>{let i=r(l);if(i&&delete U[i],!n){if(!1===a(t,l))return;if(e(t,l),u(t),!1===v(t,l))return}"TEXTAREA"!==t.nodeName?L(t,l):h.TEXTAREA(t,l)},L=(e,t)=>{let l,n,o,a,u,s=x(e,t),c=t.firstChild,m=e.firstChild;e:for(;c;){for(a=c.nextSibling,l=r(c);!s&&m;){if(o=m.nextSibling,c.isEqualNode&&c.isEqualNode(m)){c=a,m=o;continue e}n=r(m);let 
t,i=m.nodeType;if(i===c.nodeType&&(i===N?(l?l!==n&&((u=U[l])?o===u?t=!1:(e.insertBefore(u,m),n?R(n):w(m,e,!0),m=u,n=r(m)):t=!1):n&&(t=!1),t=!1!==t&&f(m,c),t&&B(m,c)):i!==A&&i!=C||(t=!0,m.nodeValue!==c.nodeValue&&(m.nodeValue=c.nodeValue))),t){c=a,m=o;continue e}n?R(n):w(m,e,!0),m=o}if(l&&(u=U[l])&&f(u,c))s||y(e,u),B(u,c);else{let t=d(c);!1!==t&&(t&&(c=t),c.actualize&&(c=c.actualize(e.ownerDocument||i)),y(e,c),P(c))}c=a,m=o}((e,t,l)=>{for(;t;){let n=t.nextSibling;(l=r(t))?R(l):w(t,e,!0),t=n}})(e,m,n);let p=h[e.nodeName];p&&p(e,t)},D=t,z=D.nodeType,H=l.nodeType;if(!S)if(z===N)H===N?f(t,l)||(E(t),D=m(t,c(l.nodeName,l.namespaceURI))):D=l;else if(z===A||z===C){if(H===z)return D.nodeValue!==l.nodeValue&&(D.nodeValue=l.nodeValue),D;D=l}if(B(D,l,S),O)for(let e=0,t=O.length;e<t;e++){let t=U[O[e]];t&&w(t,t.parentNode,!1)}return!S&&D!==t&&t.parentNode&&(D.actualize&&(D=D.actualize(t.ownerDocument||i)),t.parentNode.replaceChild(D,t)),D};window.morphdom=E(l)};
+        // Vendored, minified build from sha1-uint8array@0.10.7; published as window.createHash(), which returns a new Hash object.
+        {const K=[1518500249,1859775393,-1894007588,-899497514];window.createHash=()=>new Hash;class Hash{constructor(){this.A=1732584193,this.B=-271733879,this.C=-1732584194,this.D=271733878,this.E=-1009589776,this._size=0,this._sp=0,(!sharedBuffer||sharedOffset>=8e3)&&(sharedBuffer=new ArrayBuffer(8e3),sharedOffset=0),this._byte=new Uint8Array(sharedBuffer,sharedOffset,80),this._word=new Int32Array(sharedBuffer,sharedOffset,20),sharedOffset+=80};update(t){if("string"==typeof t)return this._utf8(t);if(null==t)throw new TypeError("Invalid type: "+typeof t);const s=t.byteOffset,e=t.byteLength;let r=e/64|0,i=0;if(r&&!(3&s)&&!(this._size%64)){const e=new Int32Array(t.buffer,s,16*r);for(;r--;)this._int32(e,i>>2),i+=64;this._size+=i}if(1!==t.BYTES_PER_ELEMENT&&t.buffer){const r=new Uint8Array(t.buffer,s+i,e-i);return this._uint8(r)}return i===e?this:this._uint8(t,i)};_uint8(t,s){const{_byte:e,_word:r}=this,i=t.length;for(s|=0;s<i;){const h=this._size%64;let n=h;for(;s<i&&n<64;)e[n++]=t[s++];n>=64&&this._int32(r),this._size+=n-h}return this};_utf8(t){const{_byte:s,_word:e}=this,r=t.length;let i=this._sp;for(let h=0;h<r;){const n=this._size%64;let f=n;for(;h<r&&f<64;){let e=0|t.charCodeAt(h++);e<128?s[f++]=e:e<2048?(s[f++]=192|e>>>6,s[f++]=128|63&e):e<55296||e>57343?(s[f++]=224|e>>>12,s[f++]=128|e>>>6&63,s[f++]=128|63&e):i?(e=((1023&i)<<10)+(1023&e)+65536,s[f++]=240|e>>>18,s[f++]=128|e>>>12&63,s[f++]=128|e>>>6&63,s[f++]=128|63&e,i=0):i=e}f>=64&&(this._int32(e),e[0]=e[16]),this._size+=f-n}return this._sp=i,this};_int32(t,s){let{A:e,B:r,C:i,D:h,E:n}=this,f=0;for(s|=0;f<16;)W[f++]=swap32(t[s++]);for(f=16;f<80;f++)W[f]=rotate1(W[f-3]^W[f-8]^W[f-14]^W[f-16]);for(f=0;f<80;f++){const t=f/20|0,s=rotate5(e)+ft(t,r,i,h)+n+W[f]+K[t]|0;n=h,h=i,i=rotate30(r),r=e,e=s}this.A=e+this.A|0,this.B=r+this.B|0,this.C=i+this.C|0,this.D=h+this.D|0,this.E=n+this.E|0};digest(t){const{_byte:s,_word:e}=this;let 
r=this._size%64|0;for(s[r++]=128;3&r;)s[r++]=0;if(r>>=2,r>14){for(;r<16;)e[r++]=0;r=0,this._int32(e)}for(;r<16;)e[r++]=0;const i=8*this._size,h=(4294967295&i)>>>0,n=(i-h)/4294967296;return n&&(e[14]=swap32(n)),h&&(e[15]=swap32(h)),this._int32(e),"hex"===t?this._hex():this._bin()};_hex(){const{A:t,B:s,C:e,D:r,E:i}=this;return hex32(t)+hex32(s)+hex32(e)+hex32(r)+hex32(i)};_bin(){const{A:t,B:s,C:e,D:r,E:i,_byte:h,_word:n}=this;return n[0]=swap32(t),n[1]=swap32(s),n[2]=swap32(e),n[3]=swap32(r),n[4]=swap32(i),h.slice(0,20)}};const W=new Int32Array(80);let sharedBuffer,sharedOffset=0;const hex32=t=>(t+4294967296).toString(16).substr(-8),swapLE=t=>t<<24&4278190080|t<<8&16711680|t>>8&65280|t>>24&255,swapBE=t=>t,swap32=isBE()?swapBE:swapLE,rotate1=t=>t<<1|t>>>31,rotate5=t=>t<<5|t>>>27,rotate30=t=>t<<30|t>>>2;function ft(t,s,e,r){return 0===t?s&e|~s&r:2===t?s&e|s&r|e&r:s^e^r}function isBE(){return 254===new Uint8Array(new Uint16Array([65279]).buffer)[0]}};
+    </script>
+    <script>
+        // Return the UTF-16 code units of `string` as a plain array of numbers.
+        const stringToArray = string => Array.from(
+            { length: string.length },
+            (_, idx) => string.charCodeAt(idx)
+        );
+        // Convert a string into a Uint8Array holding one element per UTF-16 code unit
+        // (values above 255 wrap, as with any Uint8Array element).
+        const stringToUint = string => Uint8Array.from(stringToArray(string));
+        // Build a string whose char codes are the elements of `uintArray`.
+        // The array is converted in slices of at most 32767 elements so that every
+        // String.fromCharCode.apply call receives a bounded number of arguments.
+        const uintToString = uintArray => {
+            const CHUNK = 32767;
+            const total = uintArray.byteLength;
+            const pieces = [];
+            for (let start = 0; start < total; start += CHUNK) {
+                const end = Math.min(start + CHUNK, total);
+                pieces.push(String.fromCharCode.apply(null, uintArray.subarray(start, end)));
+            }
+            return pieces.join("");
+        }
+        // True when the "chats" entry in localStorage is stored deflate-compressed.
+        let isCompressedChats = localStorage.getItem("compressedChats") === "true";
+        // Keep the native implementation so the wrapper below can delegate to it.
+        const originSetItem = localStorage.setItem;
+        /**
+         * Replacement for localStorage.setItem that falls back to compressing the
+         * "chats" entry when a write fails (presumably because the quota is exceeded).
+         *
+         * - Once compression is active, every write to "chats" is deflated (level 1) first.
+         * - On the first failed write, "chats" is rewritten in compressed form, the
+         *   "compressedChats" flag is persisted, and the original write is retried.
+         * - If a write still fails, the quota tip is shown and the write is dropped
+         *   instead of letting the exception escape to the caller.
+         *
+         * @param {string} key   storage key
+         * @param {string} value value to store
+         */
+        localStorage.setItem = (key, value) => {
+            const deflateText = text => uintToString(deflateSync(new TextEncoder().encode(text), { level: 1 }));
+            const showQuotaTip = () => notyf.error(translations[locale]["localQuotaExceedTip"]);
+            try {
+                if (isCompressedChats && key === "chats") value = deflateText(value);
+                originSetItem.call(localStorage, key, value);
+                return;
+            } catch (e) {
+                // Chats are already compressed, so there is nothing left to reclaim.
+                if (isCompressedChats) {
+                    showQuotaTip();
+                    return;
+                }
+            }
+            const isKeyChats = key === "chats";
+            const chatsText = isKeyChats ? value : localStorage.getItem("chats");
+            // No stored chats: compressing frees nothing, and encoding null would
+            // persist the literal text "null" as the chats entry.
+            if (!isKeyChats && chatsText === null) {
+                showQuotaTip();
+                return;
+            }
+            try {
+                originSetItem.call(localStorage, "chats", deflateText(chatsText));
+                originSetItem.call(localStorage, "compressedChats", true);
+                isCompressedChats = true;
+                if (!isKeyChats) originSetItem.call(localStorage, key, value);
+            } catch (e) {
+                // Still failing after compression: tell the user rather than throwing.
+                showQuotaTip();
+            }
+        }
+    </script>
+    <script>
+        // Supported locales; setLangEle.onclick maps the clicked child's index onto this list.
+        const localeList = ["en", "zh"];
+        let locale; // current UI language
+        const setLangEle = document.getElementById("setLang");
+        // Persist the active locale as "UILang" and mirror it on <html lang> and on
+        // the language switcher's class list ("<locale>Lang").
+        const setLang = () => {
+            localStorage.setItem("UILang", locale);
+            document.documentElement.lang = (locale === "zh") ? "zh-CN" : "en";
+            setLangEle.classList = `setDetail themeDetail langDetail ${locale}Lang`;
+        }
+        // Switch the UI language when one of the switcher's direct children is clicked.
+        // The child's position selects the entry of localeList with the same index.
+        setLangEle.onclick = (ev) => {
+            const idx = Array.prototype.indexOf.call(setLangEle.children, ev.target);
+            // indexOf yields -1 when the click landed on the container itself or on a
+            // nested descendant; localeList[-1] is undefined and must not become the
+            // locale (it would be persisted and break every translations[locale] lookup).
+            const nextLocale = localeList[idx];
+            if (nextLocale === void 0 || nextLocale === locale) return;
+            locale = nextLocale;
+            setLang();
+            changeLocale();
+        }
+        /**
+         * Resolve the UI locale and apply it.
+         * Preference order: the "UILang" value saved in localStorage when it is one of
+         * localeList, otherwise the browser language ("zh" for "zh" or any "zh-*" tag,
+         * else "en"). changeLocale() is only invoked when a locale was already active,
+         * i.e. not on the very first call.
+         */
+        const initLang = () => {
+            const savedLang = localStorage.getItem("UILang");
+            // Accept the bare "zh" tag as well as regional variants such as "zh-CN".
+            const browserLang = /^zh(-|$)/i.test(navigator.language || "") ? "zh" : "en";
+            // Ignore missing, stale or corrupted stored values instead of indexing translations with them.
+            const localLang = localeList.includes(savedLang) ? savedLang : browserLang;
+            const isInit = locale === void 0;
+            if (locale !== localLang) {
+                locale = localLang;
+                if (!isInit) changeLocale();
+            }
+            setLang();
+        }
+        initLang();
+        // UI strings, keyed first by locale ("en" / "zh") and then by message id.
+        // Both locales are expected to define the same ids; each id appears once per locale.
+        const translations = {
+            "en": {
+                "about": "About",
+                "about-d": "ChatGPT style web app for llama.cpp",
+                "contact": "Contact Author",
+                "description": "ChatGPT style web app for llama.cpp",
+                "newChat": "New chat",
+                "newChatName": "New chat",
+                "newFolder": "New folder",
+                "newFolderName": "New folder",
+                "search": "Search",
+                "matchCaseTip": "Match case",
+                "forceRe": "Force refresh",
+                "clearAll": "Clear all chats",
+                "setting": "Setting",
+                "nav": "Navigate",
+                "winedWin": "Window",
+                "fullWin": "Full screen",
+                "quickSet": "Quick setting",
+                "chat": "Chat",
+                "tts": "TTS",
+                "stt": "STT",
+                "avatar": "Avatar",
+                "systemRole": "System role",
+                "presetRole": "Preset",
+                "default": "Default",
+                "assistant": "Assistant",
+                "cat": "Cat girl",
+                "emoji": "Emoji",
+                "withImg": "Image",
+                "defaultText": "",
+                "assistantText": "You are a helpful assistant, answer as concisely as possible.",
+                "catText": "You are a cute cat girl, you must end every sentence with 'meow'",
+                "emojiText": "Your personality is very lively, there must be at least one emoji icon in every sentence",
+                "imageText": "When you need to send pictures, please generate them in markdown language, without backslashes or code boxes. When you need to use the unsplash API, follow the format, https://source.unsplash.com/960x640/?<English keywords>",
+                "nature": "Nature",
+                "natureNeg": "Accurate",
+                "naturePos": "Creativity",
+                "quality": "Quality",
+                "qualityNeg": "Repetitive",
+                "qualityPos": "Nonsense",
+                "chatsWidth": "Chats width",
+                "typeSpeed": "Typing speed",
+                "continuousLen": "Context messages",
+                "msgAbbr": " msgs.",
+                "slow": "Slow",
+                "fast": "Fast",
+                "longReply": "Long reply",
+                "ttsService": "TTS API",
+                "sttService": "STT API",
+                "openaiTTS": "OpenAI",
+                "azureTTS": "Azure",
+                "edgeTTS": "Edge",
+                "systemTTS": "System",
+                "azureRegion": "Azure region",
+                "azureKey": "Azure key",
+                "loadVoice": "Load voice",
+                "voiceName": "Switch",
+                "userVoice": "User voice",
+                "replyVoice": "Reply voice",
+                "TTSTest": "Hello, nice to meet you.",
+                "play": "Play",
+                "pause": "Pause",
+                "resume": "Resume",
+                "stop": "Stop",
+                "style": "Style",
+                "role": "Role",
+                "volume": "Volume",
+                "low": "Low",
+                "high": "High",
+                "rate": "Rate",
+                "pitch": "Pitch",
+                "neutral": "Neutral",
+                "intense": "Intense",
+                "contSpeech": "Continuous speech",
+                "autoSpeech": "Auto speech",
+                "unsupportRecTip": "Voice recognition is not supported in the current environment. Please refer to the documentation.",
+                "loadRecVoice": "Load language",
+                "lang": "Language",
+                "dialect": "Dialect",
+                "autoSendKey": "Auto send keyword",
+                "autoStopKey": "Auto stop keyword",
+                "autoSendDelay": "Auto send delay time",
+                "second": "s",
+                "keepListenMic": "Keep listen",
+                "send": "Send",
+                "askTip": "Type message here",
+                "clearChat": "Clear chat",
+                "general": "General",
+                "hotkey": "Hotkey",
+                "data": "Data",
+                "theme": "Theme",
+                "darkTheme": "Dark",
+                "lightTheme": "Light",
+                "autoWord": "Auto",
+                "systemTheme": "System",
+                "customDarkTheme": "Custom dark theme",
+                "startDark": "Start",
+                "endDark": "End",
+                "aiEndpoint": " endpoint",
+                "aiKey": " API key",
+                "aiModel": "Custom model name",
+                "used": "Used ",
+                "available": "Avail ",
+                "navKey": "Toggle nav",
+                "fullKey": "Window size",
+                "themeKey": "Toggle theme",
+                "langKey": "Toggle lang",
+                "inputKey": "Message",
+                "voiceKey": "Voice",
+                "resetTip": "Restore default",
+                "recKey": "Recognition",
+                "speechKey": "Start speech",
+                "export": "Export",
+                "import": "Import",
+                "clear": "Clear",
+                "reset": "Reset",
+                "localStore": "Local storage",
+                "forceReTip": "Force refresh page?",
+                "noSpeechTip": "No speech was detected. You may need to adjust your microphone settings.",
+                "noMicTip": "No microphone was found. Ensure that a microphone is installed and microphone settings are configured correctly.",
+                "noMicPerTip": "Permission to use microphone is blocked.",
+                "azureInvalidTip": "Invalid access key or wrong Azure region endpoint, please check!",
+                "errorAiKeyTip": "Invalid or incorrect API key, please check API key!",
+                "copyCode": "Copy code",
+                "copySuccess": "Success",
+                "update": "Update",
+                "cancel": "Cancel",
+                "delMsgTip": "Delete this message?",
+                "edit": "Edit",
+                "refresh": "Refresh",
+                "continue": "Continue",
+                "copy": "Copy",
+                "del": "Delete",
+                "downAudio": "Download audio",
+                "speech": "Speech",
+                "chats": " chats",
+                "delFolderTip": "Delete this folder?",
+                "delChatTip": "Delete this chat?",
+                "exportSuccTip": "Export successful!",
+                "importSuccTip": "Import successful!",
+                "importFailTip": "Import failed, please check the file format!",
+                "clearChatSuccTip": "Clear chats data successful!",
+                "resetSetSuccTip": "Reset settings successful!",
+                "clearAllTip": "Delete all chats and folders?",
+                "resetSetTip": "Restore all settings to default?",
+                "hotkeyConflict": "Hotkey conflict, please choose another key!",
+                "customDarkTip": "Start time and end time cannot be the same!",
+                "timeoutTip": "Request timeout, please try again later!",
+                "largeReqTip": "Request is too large, please delete part of the chat or cancel continuous chat!",
+                "noModelPerTip": "No permission to use this model, please choose another GPT model!",
+                "apiRateTip": "Trigger API call rate limit, please try again later!",
+                "exceedLimitTip": "API usage exceeded limit, please check your bill!",
+                "badGateTip": "Gateway error or timeout, please try again later!",
+                "badEndpointTip": "Failed to access the endpoint, please check the endpoint!",
+                "clearChatTip": "Clear this chat?",
+                "cantSpeechTip": "Current voice cannot synthesize this message, please choose another voice or message!",
+                "cantTranscribeTip": "Voice recognition failed, please try again!",
+                "noStreamTip": "Automatic features aren't available for non-realtime speech recognition service!",
+                "localQuotaExceedTip": "Local storage exceeded limit, please export chats data and clear or delete some chats!",
+            },
+            "zh": {
+                "about": "关于",
+                "about-d": "专为 llama.cpp 设计的ChatGPT风格应用",
+                "contact": "联系作者",
+                "description": "专为 llama.cpp 设计的ChatGPT风格应用",
+                "newChat": "新建会话",
+                "newChatName": "新的会话",
+                "newFolder": "新建文件夹",
+                "newFolderName": "新文件夹",
+                "search": "搜索",
+                "matchCaseTip": "区分大小写",
+                "forceRe": "强制刷新",
+                "clearAll": "清空全部",
+                "setting": "设置",
+                "nav": "导航",
+                "winedWin": "窗口",
+                "fullWin": "全屏",
+                "quickSet": "快速设置",
+                "chat": "会话",
+                "tts": "语音合成",
+                "stt": "语音识别",
+                "avatar": "用户头像",
+                "systemRole": "系统角色",
+                "presetRole": "预设角色",
+                "default": "默认",
+                "assistant": "助手",
+                "cat": "猫娘",
+                "emoji": "表情",
+                "withImg": "有图",
+                "defaultText": "",
+                "assistantText": "你是一个乐于助人的助手，尽量简明扼要地回答",
+                "catText": "你是一个可爱的猫娘，每句话结尾都要带个'喵'",
+                "emojiText": "你的性格很活泼，每句话中都要有至少一个emoji图标",
+                "imageText": "当你需要发送图片的时候，请用 markdown 语言生成，不要反斜线，不要代码框，需要使用 unsplash API时，遵循一下格式， https://source.unsplash.com/960x640/? ＜英文关键词＞",
+                "nature": "角色性格",
+                "natureNeg": "准确严谨",
+                "naturePos": "灵活创新",
+                "quality": "回答质量",
+                "qualityNeg": "重复保守",
+                "qualityPos": "胡言乱语",
+                "chatsWidth": "会话宽度",
+                "typeSpeed": "打字机速度",
+                "continuousLen": "上下文消息数",
+                "msgAbbr": "条",
+                "slow": "慢",
+                "fast": "快",
+                "longReply": "长回复",
+                "ttsService": "语音合成服务",
+                "sttService": "语音识别服务",
+                "openaiTTS": "OpenAI语音",
+                "azureTTS": "Azure语音",
+                "edgeTTS": "Edge语音",
+                "systemTTS": "系统语音",
+                "azureRegion": "Azure区域",
+                "azureKey": "Azure密钥",
+                "loadVoice": "加载语音",
+                "voiceName": "切换",
+                "userVoice": "用户语音",
+                "replyVoice": "回答语音",
+                "TTSTest": "你好，很高兴认识你。",
+                "play": "播放",
+                "pause": "暂停",
+                "resume": "恢复",
+                "stop": "停止",
+                "style": "风格",
+                "role": "角色",
+                "volume": "音量",
+                "low": "低",
+                "high": "高",
+                "rate": "语速",
+                "pitch": "音调",
+                "neutral": "平淡",
+                "intense": "起伏",
+                "contSpeech": "连续朗读",
+                "autoSpeech": "自动朗读",
+                "unsupportRecTip": "当前环境不支持语音识别，请查阅文档。",
+                "loadRecVoice": "加载语言",
+                "lang": "语言",
+                "dialect": "方言",
+                "autoSendKey": "自动发送关键词",
+                "autoStopKey": "自动停止关键词",
+                "autoSendDelay": "自动发送延迟时间",
+                "second": "秒",
+                "keepListenMic": "保持监听",
+                "send": "发送",
+                "askTip": "来问点什么吧",
+                "clearChat": "清空会话",
+                "general": "通用",
+                "hotkey": "快捷键",
+                "data": "数据",
+                "theme": "主题",
+                "darkTheme": "深色",
+                "lightTheme": "浅色",
+                "autoWord": "自动",
+                "systemTheme": "跟随系统",
+                "customDarkTheme": "自定义深色主题时间",
+                "startDark": "开始时间",
+                "endDark": "结束时间",
+                "aiEndpoint": "接口",
+                "aiKey": "API密钥",
+                "aiModel": "自定义模型",
+                "used": "已用 ",
+                "available": "可用 ",
+                "navKey": "切换导航",
+                "fullKey": "全屏/窗口",
+                "themeKey": "切换主题",
+                "langKey": "切换语言",
+                "inputKey": "输入框",
+                "voiceKey": "语音",
+                "resetTip": "重置设置",
+                "recKey": "语音输入",
+                "speechKey": "朗读会话",
+                "export": "导出",
+                "import": "导入",
+                "clear": "清空",
+                "reset": "重置",
+                "localStore": "本地存储",
+                "forceReTip": "是否强制刷新页面？",
+                "noSpeechTip": "未识别到语音，请调整麦克风后重试！",
+                "noMicTip": "未识别到麦克风，请确保已安装麦克风！",
+                "noMicPerTip": "未允许麦克风权限！",
+                "azureInvalidTip": "Azure区域错误或密钥无效，请检查！",
+                "errorAiKeyTip": "API密钥错误或失效，请检查API密钥！",
+                "copyCode": "复制代码",
+                "copySuccess": "复制成功",
+                "update": "更新",
+                "cancel": "取消",
+                "delMsgTip": "是否删除此消息？",
+                "edit": "编辑",
+                "refresh": "刷新",
+                "continue": "继续",
+                "copy": "复制",
+                "del": "删除",
+                "downAudio": "下载语音",
+                "speech": "朗读",
+                "chats": "个会话",
+                "delFolderTip": "是否删除此文件夹？",
+                "delChatTip": "是否删除此会话？",
+                "exportSuccTip": "导出成功！",
+                "importSuccTip": "导入成功！",
+                "importFailTip": "导入失败，请检查文件格式！",
+                "clearChatSuccTip": "清空会话成功！",
+                "resetSetSuccTip": "重置设置成功！",
+                "clearAllTip": "是否删除所有会话和文件夹？",
+                "resetSetTip": "是否还原所有设置为默认值？",
+                "hotkeyConflict": "快捷键冲突，请选择其他键位！",
+                "customDarkTip": "开始时间和结束时间不能相同！",
+                "timeoutTip": "请求超时，请稍后重试！",
+                "largeReqTip": "请求内容过大，请删除部分对话或关闭连续对话！",
+                "noModelPerTip": "无权使用此模型，请选择其他GPT模型！",
+                "apiRateTip": "触发API调用频率限制，请稍后重试！",
+                "exceedLimitTip": "API使用超出限额，请检查您的账单！",
+                "badGateTip": "网关错误或超时，请稍后重试！",
+                "badEndpointTip": "访问接口失败，请检查接口！",
+                "clearChatTip": "是否清空此会话？",
+                "cantSpeechTip": "当前语音无法合成此消息，请选择其他语音或消息！",
+                "cantTranscribeTip": "语音识别失败，请重试！",
+                "noStreamTip": "非实时语音识别服务无法使用自动功能！",
+                "localQuotaExceedTip": "本地存储超出限额，请导出会话并清空或删除部分会话！",
+            },
+        };
+        // Apply the translation named by the element's data-i18n-<type> attribute.
+        // "title", "place" and "value" write the title / placeholder / value attribute;
+        // any other type replaces the element's text content.
+        const translateElement = (ele, type) => {
+            const text = translations[locale][ele.getAttribute("data-i18n-" + type)];
+            switch (type) {
+                case "title":
+                    ele.setAttribute("title", text);
+                    break;
+                case "place":
+                    ele.setAttribute("placeholder", text);
+                    break;
+                case "value":
+                    ele.setAttribute("value", text);
+                    break;
+                default:
+                    ele.textContent = text;
+            }
+        }
+        // Translate every element carrying a data-i18n-* marker using the current locale,
+        // then update the `content` attribute of the fourth child of <head>
+        // (presumably the description meta tag - confirm against the page head).
+        const initLocale = () => {
+            for (const type of ["title", "place", "value", "key"]) {
+                document.querySelectorAll(`[data-i18n-${type}]`).forEach(ele => translateElement(ele, type));
+            }
+            // The title of these elements depends on themeMode (2, 1 or anything else).
+            document.querySelectorAll("[data-i18n-theme]").forEach(ele => {
+                let key = "darkTheme";
+                if (themeMode === 2) key = "autoWord";
+                else if (themeMode === 1) key = "lightTheme";
+                ele.setAttribute("title", translations[locale][key]);
+            });
+            // The title of these elements depends on the isFull flag.
+            document.querySelectorAll("[data-i18n-window]").forEach(ele => {
+                ele.setAttribute("title", translations[locale][isFull ? "winedWin" : "fullWin"]);
+            });
+            document.head.children[3].setAttribute("content", translations[locale]["description"]);
+        };
+        initLocale();
+        // Re-apply the current locale to the whole page: the statically marked elements
+        // (via initLocale) plus the nodes selected below by data-type, data-id and class,
+        // and finally reload the prompt through loadPrompt().
+        const changeLocale = () => {
+            initLocale();
+            const t = key => translations[locale][key];
+            // Set the text of the first child of every element matching `selector`.
+            const setFirstChildText = (selector, key) => {
+                document.querySelectorAll(selector).forEach(ele => {
+                    ele.children[0].textContent = t(key);
+                });
+            };
+            setFirstChildText("[data-type='chatEdit'],[data-type='folderEdit']", "edit");
+            setFirstChildText("[data-type='chatDel'],[data-type='folderDel']", "del");
+            setFirstChildText("[data-type='folderAddChat']", "newChat");
+            // data-id values ending in "Md": three of them carry text in their first child,
+            // the others carry a title attribute.
+            const textOnFirstChild = ["speechMd", "pauseMd", "resumeMd"];
+            document.querySelectorAll("[data-id]").forEach(ele => {
+                const id = ele.getAttribute("data-id");
+                if (!id.endsWith("Md")) return;
+                if (textOnFirstChild.includes(id)) {
+                    ele.children[0].textContent = t(id.slice(0, -2));
+                    return;
+                }
+                if (id === "refreshMd") {
+                    ele.setAttribute("title", t(ele.classList.contains("refreshReq") ? "refresh" : "continue"));
+                    return;
+                }
+                ele.setAttribute("title", t(id.slice(0, -2)));
+            });
+            // Keep the number already shown in each counter and swap only the suffix.
+            document.querySelectorAll(".folderNum").forEach(ele => {
+                const count = ele.textContent.match(/\d+/)[0];
+                ele.textContent = count + t("chats");
+            });
+            const setTextAttr = (selector, key) => {
+                document.querySelectorAll(selector).forEach(ele => {
+                    ele.setAttribute("text", t(key));
+                });
+            };
+            setTextAttr(".u-mdic-copy-btn", "copyCode");
+            setTextAttr(".u-mdic-copy-notify", "copySuccess");
+            // While editingIdx is set, the send / clear controls show "update" / "cancel" instead.
+            if (editingIdx !== void 0) {
+                document.querySelector("[data-i18n-key='send']").textContent = t("update");
+                document.querySelector("[data-i18n-title='clearChat']").setAttribute("title", t("cancel"));
+            }
+            loadPrompt();
+        }
+    </script>
+    <script>
+        // DOM references resolved once when this script runs.
+        // Class-based lookups take the first matching element; the rest are looked up by id.
+        const windowEle = document.getElementsByClassName("chat_window")[0];
+        const messagesEle = document.getElementsByClassName("messages")[0];
+        const chatlog = document.getElementById("chatlog");
+        const stopEle = document.getElementById("stopChat");
+        const sendBtnEle = document.getElementById("sendbutton");
+        const clearEle = document.getElementsByClassName("clearConv")[0];
+        const inputAreaEle = document.getElementById("chatinput");
+        const settingEle = document.getElementById("setting");
+        const dialogEle = document.getElementById("setDialog");
+        const selectorEle = document.getElementById("selector");
+        const modelSetEle = document.getElementById("modelDialog");
+        const lightEle = document.getElementById("toggleLight");
+        const setLightEle = document.getElementById("setLight");
+        const autoThemeEle = document.getElementById("autoDetail");
+        const systemEle = document.getElementById("systemInput");
+        const speechServiceEle = document.getElementById("preSetService");
+        const recServiceEle = document.getElementById("preRecService");
+        const newChatEle = document.getElementById("newChat");
+        const folderListEle = document.getElementById("folderList");
+        const chatListEle = document.getElementById("chatList");
+        const searchChatEle = document.getElementById("searchChat");
+        const voiceRecEle = document.getElementById("voiceRecIcon");
+        const voiceRecSetEle = document.getElementById("voiceRecSetting");
+        const preEle = document.getElementById("preSetSystem");
+        // Speech-synthesis state.
+        let voicesData; // voice data
+        let voiceType = 1; // which voice is being configured, 0: question (user) voice, 1: reply voice
+        let voiceRole = []; // voice
+        let voiceTestText; // text used to test a voice
+        let voiceVolume = []; // volume
+        let voiceRate = []; // speech rate
+        let voicePitch = []; // pitch
+        let enableContVoice; // continuous read-aloud
+        let enableAutoVoice; // automatic read-aloud
+        let existVoice = 0; // 4: OpenAI voice, 3: Azure voice, 2: Edge online voice, 1: local voice, 0: speech not supported
+        const azureRegions = ['southafricanorth', 'eastasia', 'southeastasia', 'australiaeast', 'centralindia', 'japaneast', 'japanwest', 'koreacentral', 'canadacentral', 'northeurope', 'westeurope', 'francecentral', 'germanywestcentral', 'norwayeast', 'swedencentral', 'switzerlandnorth', 'switzerlandwest', 'uksouth', 'uaenorth', 'brazilsouth', 'qatarcentral', 'centralus', 'eastus', 'eastus2', 'northcentralus', 'southcentralus', 'westcentralus', 'westus', 'westus2', 'westus3'];
+        let azureRegion;
+        let azureKey;
+        let azureRole = [];
+        let azureStyle = [];
+        // Feature detection for speech synthesis and the three speech-recognition paths.
+        const supportSpe = !!(window.speechSynthesis && window.SpeechSynthesisUtterance);
+        const isSafeEnv = window.isSecureContext; // secure context
+        const supportLocalRec = isSafeEnv && !!window.webkitSpeechRecognition; // whether local speech-recognition input is supported
+        const supportOnlineRec = isSafeEnv && navigator.mediaDevices && navigator.mediaDevices.getUserMedia && window.AudioContext && ("audioWorklet" in window.AudioContext.prototype || "createScriptProcessor" in window.AudioContext.prototype);
+        const supportOnlineLegacyRec = isSafeEnv && navigator.mediaDevices && navigator.mediaDevices.getUserMedia && !!window.MediaRecorder && (MediaRecorder.isTypeSupported("audio/webm") || MediaRecorder.isTypeSupported("audio/mp4"));
+        const supportRec = supportLocalRec || supportOnlineRec || supportOnlineLegacyRec;
+        let existRec = 1; // 2: Azure speech, 1: system speech (recServiceEle.onchange also handles 3 through loadOpenAIRec)
+        let azureRecRegion;
+        let azureRecKey;
+        let recing = false;
+        let autoSendWord; // auto-send keyword
+        let autoStopWord; // auto-stop keyword
+        let autoSendTime; // auto-send delay, in seconds (setAutoTimer multiplies it by 1000)
+        let keepListenMic; // keep listening to the microphone
+        let autoSendTimer;
+        let resetRecRes;
+        let toggleRecEv;
+        // Platform checks based on the user-agent string and the display mode.
+        const isAndroid = /\bAndroid\b/i.test(navigator.userAgent);
+        const isApple = /(Mac|iPhone|iPod|iPad)/i.test(navigator.userAgent);
+        const isSafari = /Safari/.test(navigator.userAgent) && !/Chrome/.test(navigator.userAgent);
+        const isPWA = navigator.standalone || window.matchMedia("(display-mode: standalone)").matches;
+        // In standalone (installed app) mode, give the bottom wrapper and the nav footer an 8px bottom margin.
+        if (isPWA) {
+            const [bottomEle, footerEle] = [".bottom_wrapper", ".navFooter"].map(sel => document.querySelector(sel));
+            // Resolve both style objects first so a missing element fails before anything is changed.
+            const footerStyle = footerEle.style;
+            const bottomStyle = bottomEle.style;
+            bottomStyle.marginBottom = "8px";
+            footerStyle.marginBottom = "8px";
+        }
+        const dayMs = 24 * 60 * 60 * 1000; // milliseconds in one day
+        // After confirmation, navigate to the current origin + path with the current
+        // timestamp as the query string (any existing query and hash are dropped).
+        refreshPage.onclick = () => {
+            if (!confirmAction(translations[locale]["forceReTip"])) return;
+            location.href = `${location.origin}${location.pathname}?${Date.now()}`;
+        };
+        // Returns true when `loading` is falsy and currentResEle is either unset
+        // or not marked with data-loading="true"; false otherwise.
+        const noLoading = () => {
+            if (loading) return false;
+            if (!currentResEle) return true;
+            return currentResEle.dataset.loading !== "true";
+        };
+        // Generate a random 32-character hexadecimal id (no dashes).
+        // The template is the concatenation "10000000" + "1000" + "4000" + "8000" + "100000000000";
+        // every 0, 1 and 8 in it is replaced using a crypto-random byte, the "4" is kept.
+        // Pass a truthy `upper` to get the id in upper case.
+        const uuidv4 = (upper) => {
+            const template = ([1e7] + 1e3 + 4e3 + 8e3 + 1e11);
+            const randomHex = c => (c ^ crypto.getRandomValues(new Uint8Array(1))[0] & 15 >> c / 4).toString(16);
+            const uuid = template.replace(/[018]/g, c => randomHex(c));
+            if (upper) return uuid.toUpperCase();
+            return uuid;
+        };
+        // Focus the message box right away unless isMobile is set.
+        if (!isMobile) inputAreaEle.focus();
+        // Input handler: while noLoading() holds, mark the send button active when the trimmed
+        // input is non-empty; then reset the textarea to 47px and grow it to its scroll height.
+        const textInputEvent = () => {
+            if (noLoading()) {
+                const trimmedLength = inputAreaEle.value.trim().length;
+                sendBtnEle.classList.toggle("activeSendBtn", trimmedLength);
+            }
+            const { style } = inputAreaEle;
+            style.height = "47px";
+            style.height = inputAreaEle.scrollHeight + "px";
+        };
+        inputAreaEle.oninput = textInputEvent;
+        // Toggle the "show-nav" class on <body>. On viewports wider than 800px the
+        // resulting state is also saved in localStorage as "pinNav".
+        const toggleNavEv = () => {
+            const navShown = document.body.classList.toggle("show-nav");
+            if (window.innerWidth <= 800) return;
+            localStorage.setItem("pinNav", navShown);
+        }
+        // Navigation handling on mousedown:
+        // - an element whose className is exactly "toggler" toggles the nav;
+        // - an element whose className is exactly "overlay" closes it;
+        // - a press directly on <body> closes it only on viewports up to 800px wide.
+        document.body.addEventListener("mousedown", event => {
+            const { target } = event;
+            if (target.className === "toggler") {
+                toggleNavEv();
+                return;
+            }
+            const shouldClose = target.className === "overlay"
+                || (target === document.body && window.innerWidth <= 800);
+            if (shouldClose) document.body.classList.remove("show-nav");
+        });
+        // Event handler: when the event target lies outside #sysDialog, cancel the event,
+        // stop its propagation and close the settings overlay via endSet().
+        const endSetEvent = (ev) => {
+            const insideDialog = document.getElementById("sysDialog").contains(ev.target);
+            if (insideDialog) return;
+            ev.preventDefault();
+            ev.stopPropagation();
+            endSet();
+        }
+        // Hide the #sysMask overlay and detach the capture-phase "click" listener
+        // for endSetEvent from <body>.
+        const endSet = () => {
+            const mask = document.getElementById("sysMask");
+            mask.style.display = "none";
+            document.body.removeEventListener("click", endSetEvent, true);
+        }
+        // The #closeSet control closes the overlay.
+        document.getElementById("closeSet").onclick = endSet;
+        // Clicking #sysSetting shows the #sysMask overlay, runs checkStorage(), and
+        // installs endSetEvent as the overlay's mousedown handler.
+        document.getElementById("sysSetting").onclick = () => {
+            const mask = document.getElementById("sysMask");
+            mask.style.display = "flex";
+            checkStorage();
+            mask.onmousedown = endSetEvent;
+        };
+        // Schedule genFunc() to run after autoSendTime seconds; does nothing when
+        // autoSendTime is falsy. The timer id is kept in autoSendTimer and reset to
+        // undefined once the callback has run.
+        const setAutoTimer = () => {
+            if (!autoSendTime) return;
+            const delayMs = autoSendTime * 1000;
+            autoSendTimer = setTimeout(() => {
+                genFunc();
+                autoSendTimer = void 0;
+            }, delayMs);
+        }
+        // Cancel the pending auto-send timer, if there is one, and forget its id.
+        const clearAutoSendTimer = () => {
+            if (autoSendTimer === void 0) return;
+            clearTimeout(autoSendTimer);
+            autoSendTimer = void 0;
+        }
+        // Remove the recognition-service options that are not supported here.
+        // Indices are processed from highest to lowest so earlier removals do not
+        // shift the ones still to be removed.
+        for (const [supported, optionIdx] of [[supportLocalRec, 2], [supportOnlineRec, 1], [supportOnlineLegacyRec, 0]]) {
+            if (!supported) recServiceEle.remove(optionIdx);
+        }
+        // Initialise the recognition-service selector.
+        // Without any supported recognizer, the noRecTip notice is shown and the first two
+        // element children of its parent are hidden. Otherwise the saved "existRec" value is
+        // applied, defaulting to 1 when local recognition is supported and to 2 when it is not.
+        const initRecVal = () => {
+            if (supportRec) {
+                const saved = localStorage.getItem("existRec");
+                const fallback = supportLocalRec ? "1" : "2";
+                existRec = parseInt(saved || fallback);
+                recServiceEle.value = existRec;
+                return;
+            }
+            noRecTip.style.display = "block";
+            const container = noRecTip.parentElement;
+            container.firstElementChild.style.display = "none";
+            container.children[1].style.display = "none";
+        }
+        initRecVal();
+        // Drop the Azure recognition state: remove the "<region>RecData" entry from
+        // localStorage, reset key / data / region, and hide the key and region inputs.
+        const clearAzureRec = () => {
+            // Must run while azureRecRegion still holds the region name.
+            localStorage.removeItem(azureRecRegion + "RecData");
+            azureRecKey = azureRecData = azureRecRegion = void 0;
+            for (const field of [azureRecKeyInput, preRecAzureRegion]) {
+                field.parentElement.style.display = "none";
+            }
+        }
+        // Toggle the elements tagged data-feat="forStream" / data-feat="forNoStream":
+        // with a truthy `hide` the "forStream" ones are hidden and the "forNoStream" ones
+        // shown; with a falsy `hide` it is the other way round.
+        const featStreamRec = (hide) => {
+            const setDisplay = (feat, value) => {
+                document.querySelectorAll('[data-feat="' + feat + '"]').forEach(item => {
+                    item.style.display = value;
+                });
+            };
+            setDisplay("forStream", hide ? "none" : "block");
+            setDisplay("forNoStream", hide ? "block" : "none");
+        }
+        let azureRecData, systemRecData, checkAzureRecAbort;
+        // Switch the recognition UI between two states.
+        // Truthy `bool`: checkRecLoad is shown (flex) while recDetail, the hot-key row and
+        // #voiceRec are hidden, and the input loses the "message_if_voice" class.
+        // Falsy `bool`: the reverse.
+        const toggleRecCheck = (bool) => {
+            const detailDisplay = bool ? "none" : "block";
+            checkRecLoad.style.display = bool ? "flex" : "none";
+            recDetail.style.display = detailDisplay;
+            hotKeyVoiceRec.parentElement.style.display = detailDisplay;
+            document.getElementById("voiceRec").style.display = detailDisplay;
+            inputAreaEle.classList.toggle("message_if_voice", !bool);
+        }
+        // Recognition-service selector changed: store the choice as "existRec", switch the UI
+        // to the checking state, abort a still-running Azure check, then load the chosen
+        // service (3: loadOpenAIRec, 2: loadAzureRec, anything else: loadLocalRec).
+        recServiceEle.onchange = () => {
+            if (!supportRec) return;
+            existRec = parseInt(recServiceEle.value);
+            localStorage.setItem("existRec", existRec);
+            toggleRecCheck(true);
+            const pendingCheck = checkAzureRecAbort;
+            if (pendingCheck && !pendingCheck.signal.aborted) {
+                pendingCheck.abort();
+                checkAzureRecAbort = void 0;
+            }
+            switch (existRec) {
+                case 3:
+                    clearAzureRec();
+                    loadOpenAIRec();
+                    featStreamRec(true);
+                    break;
+                case 2:
+                    // Azure needs its key and region inputs visible.
+                    azureRecKeyInput.parentElement.style.display = "block";
+                    preRecAzureRegion.parentElement.style.display = "block";
+                    loadAzureRec();
+                    featStreamRec();
+                    break;
+                default:
+                    clearAzureRec();
+                    loadLocalRec();
+                    featStreamRec();
+            }
+        }
+        // Both non-Azure services skip any credential check and call initRecSetting() without
+        // Azure language data; initRecSetting picks the language table from existRec.
+        const loadLocalRec = () => { initRecSetting() };
+        const loadOpenAIRec = () => { initRecSetting() };
+        const loadAzureRec = () => {
+            let checking = false;
+            const checkAzureFunc = () => {
+                if (checking) return;
+                if (azureRecKey) {
+                    checking = true;
+                    checkRecLoad.classList.add("voiceChecking");
+                    checkAzureRecAbort = new AbortController();
+                    setTimeout(() => {
+                        if (checkAzureRecAbort && !checkAzureRecAbort.signal.aborted) {
+                            checkAzureRecAbort.abort();
+                            checkAzureRecAbort = void 0;
+                        }
+                    }, 15000);
+                    getAzureToken(checkAzureRecAbort.signal).then(() => {
+                        getRecList(checkAzureRecAbort.signal).then(() => {
+                            initRecSetting(azureRecData);
+                        }).catch(e => {
+                        }).finally(() => {
+                            checkRecLoad.classList.remove("voiceChecking");
+                            checking = false;
+                        })
+                    }).catch(e => {
+                    }).finally(() => {
+                        checkRecLoad.classList.remove("voiceChecking");
+                        checking = false;
+                    })
+                }
+            };
+            checkRecLoad.onclick = checkAzureFunc;
+            const getAzureToken = (signal) => {
+                return new Promise((res, rej) => {
+                    fetch("https://" + azureRecRegion + ".api.cognitive.microsoft.com/sts/v1.0/issueToken", {
+                        signal,
+                        method: "POST",
+                        headers: {
+                            "Ocp-Apim-Subscription-Key": azureRecKey
+                        }
+                    }).then(response => {
+                        response.text().then(text => {
+                            try {
+                                let json = JSON.parse(text);
+                                notyf.error(translations[locale]["azureInvalidTip"]);
+                                rej();
+                            } catch (e) {
+                                res();
+                            }
+                        });
+                    }).catch(e => {
+                        localStorage.removeItem(azureRecRegion + "RecData");
+                        azureRecData = void 0;
+                        rej();
+                    })
+                })
+            };
+            const getRecList = (signal) => {
+                return new Promise((res, rej) => {
+                    if (azureRecData) res();
+                    else {
+                        let localAzureRecData = localStorage.getItem(azureRecRegion + "RecData");
+                        if (localAzureRecData) {
+                            azureRecData = JSON.parse(localAzureRecData);
+                            res();
+                        } else {
+                            fetch("https://" + azureRecRegion + ".stt.speech.microsoft.com/api/v1.0/languages/recognition", {
+                                signal
+                            }).then(response => {
+                                response.json().then(json => {
+                                    azureRecData = json;
+                                    localStorage.setItem(azureRecRegion + "RecData", JSON.stringify(json));
+                                    res();
+                                }).catch(e => {
+                                    notyf.error(translations[locale]["azureInvalidTip"]);
+                                    rej();
+                                })
+                            }).catch(e => {
+                                localStorage.removeItem(azureRecRegion + "RecData");
+                                azureRecData = void 0;
+                                rej();
+                            })
+                        }
+                    }
+                })
+            };
+            let azureRecRegionEle = document.getElementById("preRecAzureRegion");
+            if (!azureRecRegionEle.options.length) {
+                azureRegions.forEach((region, i) => {
+                    let option = document.createElement("option");
+                    option.value = region;
+                    option.text = region;
+                    azureRecRegionEle.options.add(option);
+                });
+            }
+            let localAzureRecRegion = localStorage.getItem("azureRecRegion");
+            if (localAzureRecRegion) {
+                azureRecRegion = localAzureRecRegion;
+                azureRecRegionEle.value = localAzureRecRegion;
+            }
+            azureRecRegionEle.onchange = () => {
+                azureRecRegion = azureRecRegionEle.value;
+                localStorage.setItem("azureRecRegion", azureRecRegion);
+                toggleRecCheck(true);
+            }
+            azureRecRegionEle.dispatchEvent(new Event("change"));
+            let azureRecKeyEle = document.getElementById("azureRecKeyInput");
+            let localAzureRecKey = localStorage.getItem("azureRecKey");
+            if (localAzureRecKey) {
+                azureRecKey = localAzureRecKey;
+                azureRecKeyEle.value = localAzureRecKey;
+            }
+            azureRecKeyEle.onchange = () => {
+                azureRecKey = azureRecKeyEle.value;
+                localStorage.setItem("azureRecKey", azureRecKey);
+                toggleRecCheck(true);
+            }
+            azureRecKeyEle.dispatchEvent(new Event("change"));
+            if (azureRecKey) checkAzureFunc();
+        }
+        // Display names for Azure locale codes, keyed by the lower-cased code. initRecSetting
+        // falls back to the code itself when a locale is missing from this table.
+        const azureLangTrans = { "en-au": "Australia", "en-ca": "Canada", "en-gb": "United Kingdom", "en-gh": "Ghana", "en-hk": "Hong Kong SAR", "en-ie": "Ireland", "en-in": "India", "en-ke": "Kenya", "en-ng": "Nigeria", "en-nz": "New Zealand", "en-ph": "Philippines", "en-sg": "Singapore", "en-tz": "Tanzania", "en-us": "United States", "en-za": "South Africa", "nan-cn": "闽南语，简体", "wuu-cn": "吴语，简体", "yue-cn": "粤语，简体", "zh-cn": "普通话，简体", "zh-cn-anhui": "安徽江淮普通话，简体", "zh-cn-bilingual": "普通话，英语双语", "zh-cn-gansu": "甘肃兰银普通话，简体", "zh-cn-guangxi": "广西口音普通话，简体", "zh-cn-henan": "中原官话河南，简体", "zh-cn-hunan": "湖南口音普通话，简体", "zh-cn-liaoning": "东北官话，简体", "zh-cn-shaanxi": "中原官话陕西，简体", "zh-cn-shandong": "冀鲁官话，简体", "zh-cn-shanxi": "山西口音普通话，简体", "zh-cn-sichuan": "西南官话，简体", "zh-hk": "粤语，繁体", "zh-sg": "简体，新加坡", "zh-tw": "台湾普通话" };
+        const initRecSetting = (azureData) => {
+            let langs = [['中文'], ['English']];
+            if (azureData) {
+                azureData.forEach(item => {
+                    if (item.startsWith("en-")) {
+                        let lowCase = item.toLowerCase();
+                        let dialectName = azureLangTrans[lowCase] || lowCase;
+                        if (lowCase == "en-us") langs[1].splice(1, 0, [lowCase, dialectName]);
+                        else langs[1].push([lowCase, dialectName]);
+                    } else if (item.indexOf("CN") != -1 || item.indexOf("zh") != -1) {
+                        let lowCase = item.toLowerCase();
+                        let dialectName = azureLangTrans[lowCase] || lowCase;
+                        if (lowCase == "zh-cn") langs[0].splice(1, 0, [lowCase, dialectName]);
+                        else langs[0].push([lowCase, dialectName]);
+                    }
+                })
+            } else if (existRec === 3) {
+                langs = [
+                    ['自动检测', ['', '自动检测']],
+                    ['中文', ['zh', '汉语']],
+                    ['English', ['en', 'English']]
+                ];
+            } else {
+                langs = [ // from https://www.google.com/intl/en/chrome/demos/speech.html
+                    ['中文', ['cmn-Hans-CN', '普通话 (大陆)'],
+                        ['cmn-Hans-HK', '普通话 (香港)'],
+                        ['cmn-Hant-TW', '中文 (台灣)'],
+                        ['yue-Hant-HK', '粵語 (香港)']],
+                    ['English', ['en-US', 'United States'],
+                        ['en-GB', 'United Kingdom'],
+                        ['en-AU', 'Australia'],
+                        ['en-CA', 'Canada'],
+                        ['en-IN', 'India'],
+                        ['en-KE', 'Kenya'],
+                        ['en-TZ', 'Tanzania'],
+                        ['en-GH', 'Ghana'],
+                        ['en-NZ', 'New Zealand'],
+                        ['en-NG', 'Nigeria'],
+                        ['en-ZA', 'South Africa'],
+                        ['en-PH', 'Philippines']]
+                ];
+            };
+            toggleRecCheck(false);
+            if (locale !== "zh") {
+                if (existRec === 3) {
+                    langs[0][0] = langs[0][1][1] = translations[locale]["autoWord"];
+                    let idx = langs.findIndex((item) => { return item[1][0] === locale });
+                    let [temp] = langs.splice(idx, 1);
+                    langs.splice(1, 0, temp);
+                } else langs = langs.reverse();
+            }
+            selectLangOption.options.length = select_language.options.length = 0;
+            langs.forEach((lang, i) => {
+                select_language.options.add(new Option(lang[0], i));
+                selectLangOption.options.add(new Option(lang[0], i))
+            });
+            const updateCountry = function () {
+                selectLangOption.selectedIndex = select_language.selectedIndex = this.selectedIndex;
+                select_dialect.innerHTML = "";
+                selectDiaOption.innerHTML = "";
+                let list = langs[select_language.selectedIndex];
+                for (let i = 1; i < list.length; i++) {
+                    select_dialect.options.add(new Option(list[i][1], list[i][0]));
+                    selectDiaOption.options.add(new Option(list[i][1], list[i][0]));
+                }
+                select_dialect.style.visibility = list[1].length == 1 ? "hidden" : "visible";
+                selectDiaOption.parentElement.style.visibility = list[1].length == 1 ? "hidden" : "visible";
+                localStorage.setItem("voiceRecLang", select_dialect.value);
+            };
+            let localLangIdx = 0;
+            let localDiaIdx = 0;
+            let localRecLang = localStorage.getItem("voiceRecLang") || langs[0][1][0];
+            if (localRecLang) {
+                let localIndex = langs.findIndex(item => {
+                    let diaIdx = item.findIndex(lang => { return lang instanceof Array && lang[0] === localRecLang });
+                    if (diaIdx !== -1) {
+                        localDiaIdx = diaIdx - 1;
+                        return true;
+                    }
+                    return false;
+                });
+                if (localIndex !== -1) localLangIdx = localIndex;
+            }
+            selectLangOption.onchange = updateCountry;
+            select_language.onchange = updateCountry;
+            selectDiaOption.onchange = select_dialect.onchange = function () {
+                selectDiaOption.selectedIndex = select_dialect.selectedIndex = this.selectedIndex;
+                localStorage.setItem("voiceRecLang", select_dialect.value);
+            }
+            selectLangOption.selectedIndex = select_language.selectedIndex = localLangIdx;
+            select_language.dispatchEvent(new Event("change"));
+            selectDiaOption.selectedIndex = select_dialect.selectedIndex = localDiaIdx;
+            select_dialect.dispatchEvent(new Event("change"));
+            initRecEvent();
+        };
+        let recSetTimer;
+        let initRecFunc = () => {
+            if (!supportRec) return;
+            let localAutoSendWord = localStorage.getItem("autoVoiceSendWord");
+            autoSendWord = autoSendText.value = localAutoSendWord || autoSendText.getAttribute("value") || "";
+            autoSendText.onchange = () => {
+                autoSendWord = autoSendText.value;
+                localStorage.setItem("autoVoiceSendWord", autoSendWord);
+            }
+            autoSendText.dispatchEvent(new Event("change"));
+            let localAutoStopWord = localStorage.getItem("autoVoiceStopWord");
+            autoStopWord = autoStopText.value = localAutoStopWord || autoStopText.getAttribute("value") || "";
+            autoStopText.onchange = () => {
+                autoStopWord = autoStopText.value;
+                localStorage.setItem("autoVoiceStopWord", autoStopWord);
+            }
+            autoStopText.dispatchEvent(new Event("change"));
+            let outEle = document.getElementById("autoSendTimeout");
+            let localTimeout = localStorage.getItem("autoVoiceSendOut");
+            outEle.value = autoSendTime = parseInt(localTimeout || outEle.getAttribute("value"));
+            outEle.oninput = () => {
+                outEle.style.backgroundSize = (outEle.value - outEle.min) * 100 / (outEle.max - outEle.min) + "% 100%";
+                autoSendTime = parseInt(outEle.value);
+                localStorage.setItem("autoVoiceSendOut", outEle.value);
+            }
+            outEle.dispatchEvent(new Event("input"));
+            outEle.onchange = () => {
+                let hasAutoTimer = !!autoSendTimer;
+                clearAutoSendTimer();
+                if (hasAutoTimer) setAutoTimer();
+            }
+            const keepMicEle = document.getElementById("keepListenMic");
+            let localKeepMic = localStorage.getItem("keepListenMic");
+            keepMicEle.checked = keepListenMic = (localKeepMic || keepMicEle.getAttribute("checked")) === "true";
+            keepMicEle.onchange = () => {
+                keepListenMic = keepMicEle.checked;
+                localStorage.setItem("keepListenMic", keepListenMic);
+            }
+            keepMicEle.dispatchEvent(new Event("change"));
+            const closeEvent = (ev) => {
+                if (voiceRecSetEle.contains(ev.target)) return;
+                if (!voiceRecSetEle.contains(ev.target)) {
+                    voiceRecSetEle.style.display = "none";
+                    document.removeEventListener("mousedown", closeEvent, true);
+                    voiceRecEle.classList.remove("voiceLong");
+                }
+            }
+            const longEvent = () => {
+                voiceRecSetEle.style.display = "block";
+                document.addEventListener("mousedown", closeEvent, true);
+            }
+            const voiceDownEvent = (ev) => {
+                ev.preventDefault();
+                let i = 0;
+                voiceRecEle.classList.add("voiceLong");
+                recSetTimer = setInterval(() => {
+                    i += 1;
+                    if (i >= 3) {
+                        clearInterval(recSetTimer);
+                        recSetTimer = void 0;
+                        longEvent();
+                    }
+                }, 100)
+            }
+            const voiceUpEvent = (ev) => {
+                ev.preventDefault();
+                if (recSetTimer !== void 0) {
+                    toggleRecEv();
+                    clearInterval(recSetTimer);
+                    recSetTimer = void 0;
+                    voiceRecEle.classList.remove("voiceLong");
+                }
+            }
+            voiceRecEle.onmouseup = voiceUpEvent;
+            voiceRecEle.ontouchend = voiceUpEvent;
+            voiceRecEle.onmousedown = voiceDownEvent;
+            voiceRecEle.ontouchstart = voiceDownEvent;
+        }
+        initRecFunc();
+        class RiffPcmEncoder {
+            constructor(actualSampleRate, desiredSampleRate) {
+                this.privActualSampleRate = actualSampleRate;
+                this.privDesiredSampleRate = desiredSampleRate;
+            }
+            encode(actualAudioFrame) {
+                const audioFrame = this.downSampleAudioFrame(actualAudioFrame, this.privActualSampleRate, this.privDesiredSampleRate);
+                if (!audioFrame) return null;
+                const audioLength = audioFrame.length * 2;
+                const buffer = new ArrayBuffer(audioLength);
+                const view = new DataView(buffer);
+                this.floatTo16BitPCM(view, 0, audioFrame);
+                return buffer;
+            }
+            floatTo16BitPCM(view, offset, input) {
+                for (let i = 0; i < input.length; i++, offset += 2) {
+                    const s = Math.max(-1, Math.min(1, input[i]));
+                    view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+                }
+            }
+            downSampleAudioFrame(srcFrame, srcRate, dstRate) {
+                if (!srcFrame) return null;
+                if (dstRate === srcRate || dstRate > srcRate) return srcFrame;
+                const ratio = srcRate / dstRate;
+                const dstLength = Math.round(srcFrame.length / ratio);
+                const dstFrame = new Float32Array(dstLength);
+                let srcOffset = 0;
+                let dstOffset = 0;
+                while (dstOffset < dstLength) {
+                    const nextSrcOffset = Math.round((dstOffset + 1) * ratio);
+                    let accum = 0;
+                    let count = 0;
+                    while (srcOffset < nextSrcOffset && srcOffset < srcFrame.length) {
+                        accum += srcFrame[srcOffset++];
+                        count++;
+                    }
+                    dstFrame[dstOffset++] = accum / count;
+                }
+                return dstFrame;
+            }
+        }
+        let recSocket;
+        class Recorder {
+            // Initializes recorder state only; no audio or network resources are created here.
+            constructor() {
+                // Gates every send: audio frames and results are only processed while true.
+                this.ready = false;
+                this.connId = "";
+                this.reqId = "";
+                this.label = "";
+                // Blob URL of the generated worklet script; built lazily in initRecorder().
+                this.processScriptURL = "";
+                this.forceStop = false;
+                // Target PCM sample rate in Hz.
+                this.sampleRate = 16000;
+                // avgBytesPerSec / 10, 0.1s
+                // (16000 samples/s * 2 bytes = 32000 bytes/s, so 3200 bytes per frame)
+                this.bufferSize = this.sampleRate / 5;
+                // Encoded audio waiting to fill a whole frame, and its total byte count.
+                this.chunks = [];
+                this.chunksByte = 0;
+                // "Content-Type: audio/x-wav\r\n" + WAV header
+                // Header fields as encoded below: RIFF/WAVE, fmt chunk of 16 bytes, format 1 (PCM),
+                // 1 channel, 16000 Hz, 32000 bytes/s, block align 2, 16 bits, zero-length data chunk.
+                this.wavHeader = new Uint8Array([67, 111, 110, 116, 101, 110, 116, 45, 84, 121, 112, 101, 58, 32, 97, 117, 100, 105, 111, 47, 120, 45, 119, 97, 118, 13, 10, 82, 73, 70, 70, 0, 0, 0, 0, 87, 65, 86, 69, 102, 109, 116, 32, 16, 0, 0, 0, 1, 0, 1, 0, 128, 62, 0, 0, 0, 125, 0, 0, 2, 0, 16, 0, 100, 97, 116, 97, 0, 0, 0, 0])
+            }
+            // Builds the capture graph for this.stream (which is not assigned in this method, so
+            // the caller must set it first) and starts forwarding encoded audio to recSocket.
+            // Uses an AudioWorklet when the context offers one and falls back to a
+            // ScriptProcessor otherwise, or when loading the worklet fails.
+            // The returned promise has no reject path; it resolves once the graph is connected.
+            initRecorder() {
+                return new Promise((res) => {
+                    // microsoft cognitive-services-speech-sdk-js
+                    this.context = navigator.mediaDevices.getSupportedConstraints().sampleRate ? new AudioContext({ sampleRate: this.sampleRate }) : new AudioContext();
+                    this.audioInput = this.context.createMediaStreamSource(this.stream);
+                    if (this.context.audioWorklet) {
+                        // The worklet source is assembled once as a string: RiffPcmEncoder's own source
+                        // text followed by a processor that encodes each input block and posts chunks
+                        // of this.bufferSize bytes back through its port. It is cached as a Blob URL.
+                        if (this.processScriptURL == "") {
+                            const workletScript = `${RiffPcmEncoder.toString()}
+                            class SP extends AudioWorkletProcessor {
+                            constructor(options) {
+                                super(options);
+                                this.sampleRate = ${this.sampleRate};
+                                // avgBytesPerSec / 10, 0.1s
+                                this.bufferSize = ${this.bufferSize};
+                                this.encoder = new RiffPcmEncoder(options.processorOptions.sampleRate, this.sampleRate);
+                                this.chunks = [];
+                                this.chunksByte = 0;
+                                this.processing = true;
+                                this.port.onmessage = (e) => {
+                                if (e.data === "stop") {
+                                    this.processing = false;
+                                    this.port.close();
+                                }
+                                }
+                            }
+                            concat() {
+                                let result = new Uint8Array(this.bufferSize);
+                                let offset = 0;
+                                for (let i = 0; i < this.chunks.length; i++) {
+                                result.set(this.chunks[i], offset);
+                                offset += this.chunks[i].byteLength;
+                                }
+                                return result;
+                            }
+                            process(inputs) {
+                                if (inputs[0][0]) {
+                                let data = new Uint8Array(this.encoder.encode(inputs[0][0]));
+                                this.chunks.push(data);
+                                this.chunksByte += data.byteLength;
+                                if (this.chunksByte > this.bufferSize) {
+                                    let lastChunk = this.chunks[this.chunks.length - 1];
+                                    this.chunks[this.chunks.length - 1] = lastChunk.subarray(0, lastChunk.byteLength - this.chunksByte + this.bufferSize);
+                                    let chunk = this.concat();
+                                    this.port.postMessage(chunk, [chunk.buffer]);
+                                    this.chunks.length = 0;
+                                    this.chunks.push(lastChunk.subarray(lastChunk.byteLength - this.chunksByte + this.bufferSize));
+                                    this.chunksByte = this.chunks[0].byteLength;
+                                } else if (this.chunksByte === this.bufferSize) {
+                                    let chunk = this.concat();
+                                    this.port.postMessage(chunk, [chunk.buffer]);
+                                    this.chunks.length = this.chunksByte = 0;
+                                }
+                                }
+                                return this.processing;
+                            }
+                            }
+                            registerProcessor('speech-processor', SP);`;
+                            this.processScriptURL = URL.createObjectURL(new Blob([workletScript], { type: "application/javascript; charset=utf-8" }));
+                        }
+                        this.context.audioWorklet.addModule(this.processScriptURL).then(() => {
+                            // The processor is told the context's real rate so it can downsample to this.sampleRate.
+                            this.recorder = new AudioWorkletNode(this.context, "speech-processor", {
+                                processorOptions: { sampleRate: this.context.sampleRate }
+                            });
+                            this.ready = true;
+                            // Each message from the worklet is one full audio frame; wrap it and send it on.
+                            this.recorder.port.onmessage = (e) => { if (e.data && this.ready) recSocket.send(this.getRecBin(e.data)) };
+                            if (isFirefox) { // tested firefox need volume gain
+                                this.gain = this.context.createGain();
+                                this.gain.gain.value = 3;
+                                this.audioInput.connect(this.gain);
+                                this.gain.connect(this.recorder);
+                            } else this.audioInput.connect(this.recorder);
+                            this.recorder.connect(this.context.destination);
+                            res();
+                        // Any failure above (module load or graph wiring) falls back to the ScriptProcessor path.
+                        }).catch(e => { this.attachScriptProcessor(res) })
+                    } else this.attachScriptProcessor(res);
+                })
+            }
+            attachScriptProcessor(res) {
+                this.encoder = new RiffPcmEncoder(this.context.sampleRate, this.sampleRate);
+                this.recorder = (() => {
+                    let bufferSize = 0;
+                    try {
+                        return this.context.createScriptProcessor(bufferSize, 1, 1);
+                    } catch (error) {
+                        // Webkit (<= version 31) requires a valid bufferSize.
+                        bufferSize = 2048;
+                        let audioSampleRate = this.context.sampleRate;
+                        while (bufferSize < 16384 && audioSampleRate >= (this.audioInput.channelCount * this.sampleRate)) {
+                            bufferSize <<= 1;
+                            audioSampleRate >>= 1;
+                        }
+                        return this.context.createScriptProcessor(bufferSize, 1, 1);
+                    }
+                })();
+                this.ready = true;
+                this.recorder.onaudioprocess = (event) => {
+                    const inputFrame = event.inputBuffer.getChannelData(0);
+                    if (inputFrame && this.ready) this.pushWSFrame(new Uint8Array(this.encoder.encode(inputFrame)));
+                };
+                if (isFirefox) { // tested firefox need volume gain
+                    this.gain = this.context.createGain();
+                    this.gain.gain.value = 3;
+                    this.audioInput.connect(this.gain);
+                    this.gain.connect(this.recorder);
+                } else this.audioInput.connect(this.recorder);
+                this.recorder.connect(this.context.destination);
+                res();
+            }
+            pushWSFrame(audio) {
+                let totalByte = this.chunksByte + audio.byteLength;
+                if (totalByte >= this.bufferSize) {
+                    let offset = 0;
+                    for (let i = 0; i < Math.floor(totalByte / this.bufferSize); i++) {
+                        if (this.chunksByte) {
+                            let partAudio = audio.subarray(offset, (i + 1) * this.bufferSize - this.chunksByte);
+                            this.chunks.push(partAudio);
+                            let chunk = this.concat();
+                            recSocket.send(this.getRecBin(chunk));
+                            offset = (i + 1) * this.bufferSize - this.chunksByte;
+                            this.chunks.length = this.chunksByte = 0;
+                        } else {
+                            let partAudio = audio.subarray(offset, offset + this.bufferSize);
+                            recSocket.send(this.getRecBin(partAudio));
+                            offset += this.bufferSize;
+                        }
+                    }
+                    if (offset < audio.byteLength) {
+                        this.chunks.push(audio.subarray(offset));
+                        this.chunksByte = this.chunks[0].byteLength;
+                    }
+                } else {
+                    this.chunks.push(audio);
+                    this.chunksByte += audio.byteLength;
+                }
+            }
+            concat() {
+                let result = new Uint8Array(this.bufferSize);
+                let offset = 0;
+                for (let i = 0; i < this.chunks.length; i++) {
+                    result.set(this.chunks[i], offset);
+                    offset += this.chunks[i].byteLength;
+                }
+                return result;
+            }
+            initRecWebsocket() {
+                return new Promise((res, rej) => {
+                    let url = `wss://${azureRecRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?Ocp-Apim-Subscription-Key=${azureRecKey}&language=${select_dialect.value}&storeAudio=true`;
+                    if (!recSocket || recSocket.readyState > 1 || recSocket.url.slice(0, -48) !== url) {
+                        if (recSocket && recSocket.readyState === 1) recSocket.close(1000);
+                        recSocket = new WebSocket(url + `&X-ConnectionId=${this.connId}`);
+                        recSocket.binaryType = "arraybuffer";
+                        recSocket.onopen = () => { res() };
+                        recSocket.onmessage = (e) => { this.handleWSMsg(e) };
+                        recSocket.onerror = (e) => {
+                            if (!this.ready) notyf.error(translations[locale]["badGateTip"]);
+                            recSocket.close();
+                            rej();
+                        };
+                        recSocket.onclose = (e) => {
+                            if (this.ready) {
+                                this.ready = false;
+                                this.retryWebsocket(recSocket.url)
+                            }
+                        }
+                    } else {
+                        return res()
+                    }
+                })
+            }
+            retryWebsocket(url) {
+                recSocket = new WebSocket(url);
+                recSocket.binaryType = "arraybuffer";
+                recSocket.onopen = () => {
+                    this.startWSDetect();
+                    this.ready = true;
+                }
+                recSocket.onmessage = (e) => { this.handleWSMsg(e) };
+                recSocket.onerror = (e) => { recSocket.close() };
+                recSocket.onclose = (e) => {
+                    notyf.error(translations[locale]["badGateTip"]);
+                    this.stopRecorder(true);
+                }
+            }
+            handleWSMsg(e) {
+                if (typeof e.data === "string") {
+                    let path = e.data.match(/Path:(.+)/)[1].trim();
+                    let splitData = e.data.split("\n");
+                    if (this.ready && (path === "speech.phrase" || path === "speech.hypothesis")) {
+                        let data = JSON.parse(splitData[splitData.length - 1]);
+                        let isFinal = data.DisplayText !== void 0;
+                        let autoFlag;
+                        if (isFinal) {
+                            recRes += data.DisplayText;
+                            if (autoSendWord) {
+                                let idx = recRes.indexOf(autoSendWord);
+                                if (idx !== -1) {
+                                    recRes = recRes.slice(0, idx);
+                                    autoFlag = 1;
+                                }
+                            }
+                            if (autoStopWord) {
+                                let idx = recRes.indexOf(autoStopWord);
+                                if (idx !== -1) {
+                                    recRes = recRes.slice(0, idx);
+                                    autoFlag = 2;
+                                }
+                            }
+                        }
+                        else if (data.Text) { tempRes = recRes + data.Text }
+                        inputAreaEle.value = preRes + (isFinal ? recRes : tempRes) + affRes;
+                        textInputEvent();
+                        inputAreaEle.focus();
+                        inputAreaEle.selectionEnd = inputAreaEle.value.length - affRes.length;
+                        if (autoFlag) {
+                            if (autoFlag === 1) genFunc();
+                            else this.stopRecorder(true);
+                        }
+                        clearAutoSendTimer();
+                        if (autoFlag !== 1) setAutoTimer();
+                    } else if (path === "turn.end") {
+                        if (!this.forceStop && keepListenMic || this.ready) {
+                            this.startWSDetect();
+                            this.ready = true;
+                        }
+                    }
+                }
+            }
+            startWSDetect() {
+                this.reqId = uuidv4(true);
+                recSocket.send(this.getRecPre(this.label));
+                recSocket.send(this.getRecConfig());
+                recSocket.send(this.getRecPreBin());
+            }
+            getRecPre(label) {
+                let osPlatform = (typeof window !== "undefined") ? "Browser" : "Node";
+                osPlatform += "/" + navigator.platform;
+                let osName = navigator.userAgent;
+                let osVersion = navigator.appVersion;
+                return `Path: speech.config\r\nX-RequestId: ${this.reqId}\r\nX-Timestamp: ${new Date().toISOString()}\r\nContent-Type: application/json\r\n\r\n{"context":{"system":{"name":"SpeechSDK","version":"1.35.0","build":"JavaScript","lang":"JavaScript"},"os":{"platform":"${osPlatform}","name":"${osName}","version":"${osVersion}"},"audio":{"source":{"bitspersample":16,"channelcount":1,"connectivity":"Unknown","manufacturer":"Speech SDK","model":"${label}","samplerate":${this.sampleRate},"type":"Microphones"}}},"recognition":"conversation"}`
+            }
+            getRecConfig() {
+                return `Path: speech.context\r\nX-RequestId: ${this.reqId}\r\nX-Timestamp: ${new Date().toISOString()}\r\nContent-Type: application/json\r\n\r\n{"phraseDetection":{}}`
+            }
+            getRecPreBin() {
+                let header = this.getRecHeader();
+                let data = new Uint8Array(2 + header.length + this.wavHeader.byteLength);
+                data.set([0, 126], 0);
+                data.set(stringToArray(header), 2);
+                data.set(this.wavHeader, 2 + header.length);
+                return data
+            }
+            getRecBin(audio) {
+                let header = this.getRecHeader();
+                let data = new Uint8Array(2 + header.length + audio.byteLength);
+                data.set([0, 99], 0);
+                data.set(stringToArray(header), 2);
+                data.set(audio, 2 + header.length);
+                return data
+            }
+            getRecHeader() {
+                return `Path: audio\r\nX-RequestId: ${this.reqId}\r\nX-Timestamp: ${new Date().toISOString()}\r\n`
+            }
+            getMedia() {
+                return new Promise((res, rej) => {
+                    navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
+                        this.stream = stream;
+                        res();
+                    }).catch(e => {
+                        notyf.error(translations[locale][e.name === "NotAllowedError" ? "noMicPerTip" : "noMicTip"]);
+                        rej();
+                    })
+                })
+            }
+            async startRecorder() {
+                return new Promise((res, rej) => {
+                    this.connId = uuidv4(true);
+                    Promise.all([this.getMedia(), this.initRecWebsocket()]).then((val) => {
+                        this.label = this.stream.getAudioTracks()[0].label;
+                        this.startWSDetect();
+                        this.initRecorder().then(() => { res() });
+                    }).catch(e => {
+                        if (this.stream) {
+                            this.stream.getAudioTracks().forEach(track => { track.stop() });
+                            this.stream = null;
+                        }
+                        rej(e);
+                    })
+                })
+            }
+            stopRecWebsocket() {
+                if (recSocket && recSocket.readyState === 1) {
+                    let endBin = this.getRecBin(new Uint8Array());
+                    recSocket.send(endBin);
+                    recSocket.send(endBin);
+                }
+            }
+            stopRecorder(forceStop) {
+                this.forceStop = forceStop;
+                this.ready = false;
+                this.stopRecWebsocket();
+                clearAutoSendTimer();
+                if (!forceStop && keepListenMic) return;
+                voiceRecEle.classList.remove("voiceRecing");
+                recing = false;
+                if (this.recorder && this.recorder.port) {
+                    this.recorder.port.postMessage("stop");
+                    this.recorder.port.close();
+                }
+                if (this.stream) {
+                    this.stream.getAudioTracks().forEach(track => { track.stop() });
+                    this.stream = null;
+                }
+                if (this.audioInput) {
+                    this.audioInput.disconnect();
+                    this.audioInput = null;
+                }
+                if (isFirefox && this.gain) {
+                    this.gain.disconnect();
+                    this.gain = null;
+                }
+                if (this.recorder) {
+                    this.recorder.disconnect();
+                    this.recorder = null;
+                }
+                if (this.context) {
+                    this.context.close();
+                    this.context = null;
+                }
+            }
+        }
+        class LegacyRecorder {
+            constructor() {
+                this.mimeType = MediaRecorder.isTypeSupported("audio/mp4") ? "audio/mp4" : "audio/webm";
+                this.suffix = this.mimeType === "audio/mp4" ? ".mp4" : ".webm";
+                this.bitsPerSecond = 88888;
+                this.chunks = [];
+            }
+            initRecorder() {
+                this.recorder = new MediaRecorder(this.stream, { mimeType: this.mimeType, audioBitsPerSecond: this.bitsPerSecond });
+                this.chunks.length = 0;
+                this.recorder.ondataavailable = e => { this.chunks.push(e.data) };
+                this.recorder.start(1e3);
+            }
+            async processData(blob) {
+                let formData = new FormData();
+                formData.append("model", "whisper-1");
+                formData.append("file", blob, "audio" + this.suffix);
+                if (select_dialect.value !== "") formData.append("language", select_dialect.value);
+                let url = apiHost + ((apiHost.length && !apiHost.endsWith("/")) ? "/" : "") + "v1/audio/transcriptions";
+                let controller = new AbortController();
+                let controllerId = setTimeout(() => {
+                    notyf.error(translations[locale]["timeoutTip"]);
+                    controller.abort();
+                }, 15000);
+                try {
+                    const res = await fetch(url, {
+                        method: "POST",
+                        body: formData,
+                        signal: controller.signal,
+                        ...(customAPIKey ? { headers: { Authorization: "Bearer " + customAPIKey } } : {})
+                    });
+                    clearTimeout(controllerId);
+                    if (res.status === 200) {
+                        let result = await res.json();
+                        if (result && result.text) {
+                            inputAreaEle.value = preRes + result.text + affRes;
+                            textInputEvent();
+                            inputAreaEle.focus();
+                            inputAreaEle.selectionEnd = inputAreaEle.value.length - affRes.length;
+                            return true;
+                        }
+                    } else notyf.open({ type: "warning", message: translations[locale]["cantTranscribeTip"] });
+                } catch (e) { }
+            }
+            getMedia() {
+                return new Promise((res, rej) => {
+                    navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
+                        this.stream = stream;
+                        res();
+                    }).catch(e => {
+                        notyf.error(translations[locale][e.name === "NotAllowedError" ? "noMicPerTip" : "noMicTip"]);
+                        rej(e);
+                    })
+                })
+            }
+            async startRecorder() {
+                return new Promise((res, rej) => {
+                    this.getMedia().then(() => {
+                        this.initRecorder();
+                        res();
+                    }).catch(e => { rej(e) })
+                })
+            }
+            stopRecorder(forceStop) {
+                return new Promise((res, rej) => {
+                    clearAutoSendTimer();
+                    if (this.recorder && this.recorder.state === "recording") {
+                        this.recorder.onstop = async (e) => {
+                            let blob = new Blob(this.chunks, { type: this.mimeType });
+                            this.chunks.length = 0;
+                            try {
+                                let result = await this.processData(blob);
+                                if (result) res();
+                                else rej();
+                            } catch (error) { rej() };
+                            if (!forceStop && keepListenMic) this.recorder.start(1e3);
+                        };
+                        this.recorder.stop();
+                    } else res();
+                    if (!forceStop && keepListenMic) return;
+                    voiceRecEle.classList.remove("voiceRecing");
+                    recing = false;
+                    if (this.recorder) this.recorder = null;
+                    if (this.stream) {
+                        this.stream.getAudioTracks().forEach(track => { track.stop() });
+                        this.stream = null;
+                    }
+                })
+            }
+        }
+        let recorder, legacyRecorder, initingRecorder; // recorder instances are created lazily by initRecEvent; initingRecorder blocks a second start while one is in progress
+        let recRes = tempRes = ""; // recRes: finalized text, tempRes: finalized + interim text. NOTE(review): only recRes is declared by this `let`; tempRes is a plain assignment — confirm it is declared elsewhere
+        let preRes, affRes; // input text before / after the selection, captured by resetRecRes
+        resetRecRes = () => { // Snapshots the text around the current selection and clears both recognition buffers.
+            preRes = inputAreaEle.value.slice(0, inputAreaEle.selectionStart);
+            affRes = inputAreaEle.value.slice(inputAreaEle.selectionEnd);
+            recRes = tempRes = "";
+        }
+        const initRecEvent = () => {
+            if (existRec === 3) {
+                if (legacyRecorder === void 0) legacyRecorder = new LegacyRecorder();
+                toggleRecEv = async (force = true) => {
+                    if (voiceRecEle.classList.contains("voiceRecing")) await legacyRecorder.stopRecorder(force);
+                    else {
+                        if (initingRecorder) return;
+                        resetRecRes();
+                        initingRecorder = true;
+                        legacyRecorder.startRecorder().then(() => {
+                            recing = true;
+                            initingRecorder = false;
+                            voiceRecEle.classList.add("voiceRecing");
+                        }).catch(e => {
+                            legacyRecorder.stopRecorder(force);
+                            initingRecorder = false;
+                        })
+                    }
+                }
+            } else if (existRec === 2) {
+                if (recorder === void 0) recorder = new Recorder();
+                toggleRecEv = (force = true) => {
+                    if (voiceRecEle.classList.contains("voiceRecing")) recorder.stopRecorder(force);
+                    else {
+                        if (initingRecorder) return;
+                        resetRecRes();
+                        initingRecorder = true;
+                        recorder.startRecorder().then(() => {
+                            recing = true;
+                            initingRecorder = false;
+                            voiceRecEle.classList.add("voiceRecing");
+                        }).catch(e => {
+                            recorder.stopRecorder(force)
+                            initingRecorder = false;
+                        })
+                    }
+                }
+            } else {
+                let recIns = new webkitSpeechRecognition();
+                // prevent some Android bug
+                recIns.continuous = !isAndroid;
+                recIns.interimResults = true;
+                recIns.maxAlternatives = 1;
+                let resEvent = (event) => {
+                    if (typeof (event.results) === "undefined") {
+                        toggleRecEv();
+                        return;
+                    }
+                    let isFinal;
+                    let autoFlag;
+                    for (let i = event.resultIndex; i < event.results.length; ++i) {
+                        isFinal = event.results[i].isFinal;
+                        if (isFinal) {
+                            recRes += event.results[i][0].transcript
+                            if (autoSendWord) {
+                                let idx = recRes.indexOf(autoSendWord);
+                                if (idx !== -1) {
+                                    recRes = recRes.slice(0, idx);
+                                    autoFlag = 1;
+                                    break;
+                                }
+                            }
+                            if (autoStopWord) {
+                                let idx = recRes.indexOf(autoStopWord);
+                                if (idx !== -1) {
+                                    recRes = recRes.slice(0, idx);
+                                    autoFlag = 2;
+                                    break;
+                                }
+                            }
+                        }
+                        else { tempRes = recRes + event.results[i][0].transcript }
+                    }
+                    inputAreaEle.value = preRes + (isFinal ? recRes : tempRes) + affRes;
+                    textInputEvent();
+                    inputAreaEle.focus();
+                    inputAreaEle.selectionEnd = inputAreaEle.value.length - affRes.length;
+                    if (autoFlag) {
+                        if (autoFlag === 1) genFunc();
+                        else endEvent(false, false);
+                    }
+                    clearAutoSendTimer();
+                    if (autoFlag !== 1) setAutoTimer();
+                };
+                const stopAction = () => {
+                    clearAutoSendTimer();
+                    recIns.onresult = null;
+                    recIns.onerror = null;
+                    recIns.onend = null;
+                    voiceRecEle.classList.remove("voiceRecing");
+                    recing = false;
+                };
+                const endEvent = (event, flag) => {
+                    if (flag !== void 0) {
+                        if (!flag) {
+                            recIns.stop();
+                            stopAction();
+                        }
+                    } else if (event) {
+                        if (keepListenMic && event.type === "end") {
+                            recIns.start();
+                            resetRecRes();
+                        } else {
+                            if (event.type === "error") recIns.stop();
+                            stopAction();
+                        }
+                    }
+                };
+                const errorEvent = (ev) => {
+                    if (event.error === "no-speech") {
+                        notyf.open({
+                            type: "warning",
+                            message: translations[locale]["noSpeechTip"]
+                        });
+                    }
+                    if (event.error === "audio-capture") {
+                        notyf.error(translations[locale]["noMicTip"])
+                        endEvent(ev);
+                    }
+                    if (event.error === "not-allowed") {
+                        notyf.error(translations[locale]["noMicPerTip"])
+                        endEvent(ev);
+                    }
+                };
+                toggleRecEv = () => {
+                    if (voiceRecEle.classList.toggle("voiceRecing")) {
+                        try {
+                            resetRecRes();
+                            recIns.lang = select_dialect.value;
+                            recIns.start();
+                            recIns.onresult = resEvent;
+                            recIns.onerror = errorEvent;
+                            recIns.onend = endEvent;
+                            recing = true;
+                        } catch (e) {
+                            endEvent(false, false);
+                        }
+                    } else {
+                        endEvent(false, false);
+                    }
+                };
+            }
+        }
+        recServiceEle.dispatchEvent(new Event("change"));
+        document.querySelector(".modelSwitch").onclick = document.querySelector(".sysSwitch").onclick = document.querySelector(".setSwitch").onclick = function (ev) {
+            let activeEle = this.getElementsByClassName("activeSwitch")[0];
+            if (ev.target !== activeEle) {
+                activeEle.classList.remove("activeSwitch");
+                ev.target.classList.add("activeSwitch");
+                document.getElementById(ev.target.dataset.id).style.display = "block";
+                document.getElementById(activeEle.dataset.id).style.display = "none";
+            }
+        };
+        if (!supportSpe) speechServiceEle.remove(3);
+        const initVoiceVal = () => {
+            let localVoiceType = localStorage.getItem("existVoice");
+            speechServiceEle.value = existVoice = parseInt(localVoiceType || "2");
+        }
+        initVoiceVal();
+        const clearAzureVoice = () => {
+            azureKey = void 0;
+            localStorage.removeItem(azureRegion + "VoiceData");
+            azureRegion = void 0;
+            azureRole = [];
+            azureStyle = [];
+            document.getElementById("azureExtra").style.display = "none";
+            azureKeyInput.parentElement.style.display = "none";
+            preSetAzureRegion.parentElement.style.display = "none";
+        }
+        speechServiceEle.onchange = () => {
+            existVoice = parseInt(speechServiceEle.value);
+            localStorage.setItem("existVoice", existVoice);
+            toggleVoiceCheck(true);
+            if (checkAzureAbort && !checkAzureAbort.signal.aborted) {
+                checkAzureAbort.abort();
+                checkAzureAbort = void 0;
+            }
+            if (checkEdgeAbort && !checkEdgeAbort.signal.aborted) {
+                checkEdgeAbort.abort();
+                checkEdgeAbort = void 0;
+            }
+            if (existVoice === 4) {
+                toggleVoiceCheck(false);
+                clearAzureVoice();
+                loadOpenAIVoice();
+            } else if (existVoice === 3) {
+                azureKeyInput.parentElement.style.display = "block";
+                preSetAzureRegion.parentElement.style.display = "block";
+                loadAzureVoice();
+            } else if (existVoice === 2) {
+                clearAzureVoice();
+                loadEdgeVoice();
+            } else if (existVoice === 1) {
+                toggleVoiceCheck(false);
+                clearAzureVoice();
+                loadLocalVoice();
+            }
+        }
+        let openaiVoiceData, edgeVoiceData, systemVoiceData, checkAzureAbort, checkEdgeAbort;
+        const toggleVoiceCheck = (bool) => {
+            checkVoiceLoad.style.display = bool ? "flex" : "none";
+            speechDetail.style.display = bool ? "none" : "block";
+        }
+        const loadOpenAIVoice = () => {
+            if (openaiVoiceData) {
+                initVoiceSetting(openaiVoiceData);
+            } else {
+                openaiVoiceData = [{ name: "alloy", displayName: "alloy" }, { name: "echo", displayName: "echo" }, { name: "fable", displayName: "fable" }, { name: "onyx", displayName: "onyx" }, { name: "nova", displayName: "nova" }, { name: "shimmer", displayName: "shimmer" }]
+                initVoiceSetting(openaiVoiceData);
+            }
+        };
+        const loadAzureVoice = () => {
+            let checking = false;
+            const checkAzureFunc = () => {
+                if (checking) return;
+                if (azureKey) {
+                    checking = true;
+                    checkVoiceLoad.classList.add("voiceChecking");
+                    checkAzureAbort = new AbortController();
+                    setTimeout(() => {
+                        if (checkAzureAbort && !checkAzureAbort.signal.aborted) {
+                            checkAzureAbort.abort();
+                            checkAzureAbort = void 0;
+                        }
+                    }, 15000);
+                    getAzureToken(checkAzureAbort.signal).then(() => {
+                        getVoiceList(checkAzureAbort.signal).then(() => {
+                            toggleVoiceCheck(false);
+                        }).catch(e => {
+                        }).finally(() => {
+                            checkVoiceLoad.classList.remove("voiceChecking");
+                            checking = false;
+                        })
+                    }).catch(e => {
+                    }).finally(() => {
+                        checkVoiceLoad.classList.remove("voiceChecking");
+                        checking = false;
+                    })
+                }
+            };
+            checkVoiceLoad.onclick = checkAzureFunc;
+            const getAzureToken = (signal) => {
+                return new Promise((res, rej) => {
+                    fetch("https://" + azureRegion + ".api.cognitive.microsoft.com/sts/v1.0/issueToken", {
+                        signal,
+                        method: "POST",
+                        headers: {
+                            "Ocp-Apim-Subscription-Key": azureKey
+                        }
+                    }).then(response => {
+                        response.text().then(text => {
+                            try {
+                                let json = JSON.parse(text);
+                                notyf.error(translations[locale]["azureInvalidTip"]);
+                                rej();
+                            } catch (e) {
+                                res();
+                            }
+                        });
+                    }).catch(e => {
+                        localStorage.removeItem(azureRegion + "VoiceData");
+                        rej();
+                    })
+                })
+            };
+            const getVoiceList = (signal) => {
+                return new Promise((res, rej) => {
+                    let localAzureVoiceData = localStorage.getItem(azureRegion + "VoiceData");
+                    if (localAzureVoiceData) {
+                        initVoiceSetting(JSON.parse(localAzureVoiceData));
+                        res();
+                    } else {
+                        fetch("https://" + azureRegion + ".tts.speech.microsoft.com/cognitiveservices/voices/list", {
+                            signal,
+                            headers: {
+                                "Ocp-Apim-Subscription-Key": azureKey
+                            }
+                        }).then(response => {
+                            response.json().then(json => {
+                                localStorage.setItem(azureRegion + "VoiceData", JSON.stringify(json));
+                                initVoiceSetting(json);
+                                res();
+                            }).catch(e => {
+                                notyf.error(translations[locale]["azureInvalidTip"]);
+                                rej();
+                            })
+                        }).catch(e => {
+                            localStorage.removeItem(azureRegion + "VoiceData");
+                            rej();
+                        })
+                    }
+                })
+            };
+            let azureRegionEle = document.getElementById("preSetAzureRegion");
+            if (!azureRegionEle.options.length) {
+                azureRegions.forEach((region, i) => {
+                    let option = document.createElement("option");
+                    option.value = region;
+                    option.text = region;
+                    azureRegionEle.options.add(option);
+                });
+            }
+            let localAzureRegion = localStorage.getItem("azureRegion");
+            if (localAzureRegion) {
+                azureRegion = localAzureRegion;
+                azureRegionEle.value = localAzureRegion;
+            }
+            azureRegionEle.onchange = () => {
+                azureRegion = azureRegionEle.value;
+                localStorage.setItem("azureRegion", azureRegion);
+                toggleVoiceCheck(true);
+            }
+            azureRegionEle.dispatchEvent(new Event("change"));
+            let azureKeyEle = document.getElementById("azureKeyInput");
+            let localAzureKey = localStorage.getItem("azureKey");
+            if (localAzureKey) {
+                azureKey = localAzureKey;
+                azureKeyEle.value = localAzureKey;
+            }
+            azureKeyEle.onchange = () => {
+                azureKey = azureKeyEle.value;
+                localStorage.setItem("azureKey", azureKey);
+                toggleVoiceCheck(true);
+            }
+            azureKeyEle.dispatchEvent(new Event("change"));
+            if (azureKey) {
+                checkAzureFunc();
+            }
+        }
+        const loadEdgeVoice = () => {
+            let checking = false;
+            const endCheck = () => {
+                checkVoiceLoad.classList.remove("voiceChecking");
+                checking = false;
+            };
+            const checkEdgeFunc = () => {
+                if (checking) return;
+                checking = true;
+                checkVoiceLoad.classList.add("voiceChecking");
+                if (edgeVoiceData) {
+                    initVoiceSetting(edgeVoiceData);
+                    toggleVoiceCheck(false);
+                    endCheck();
+                } else {
+                    checkEdgeAbort = new AbortController();
+                    setTimeout(() => {
+                        if (checkEdgeAbort && !checkEdgeAbort.signal.aborted) {
+                            checkEdgeAbort.abort();
+                            checkEdgeAbort = void 0;
+                        }
+                    }, 10000);
+                    fetch("https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4", { signal: checkEdgeAbort.signal }).then(response => {
+                        response.json().then(json => {
+                            edgeVoiceData = json;
+                            toggleVoiceCheck(false);
+                            initVoiceSetting(json);
+                            endCheck();
+                        });
+                    }).catch(err => {
+                        endCheck();
+                    })
+                }
+            };
+            checkEdgeFunc();
+            checkVoiceLoad.onclick = checkEdgeFunc;
+        };
+        const loadLocalVoice = () => {
+            if (systemVoiceData) {
+                initVoiceSetting(systemVoiceData);
+            } else {
+                let initedVoice = false;
+                const getLocalVoice = () => {
+                    let voices = speechSynthesis.getVoices();
+                    if (voices.length) {
+                        if (!initedVoice) {
+                            initedVoice = true;
+                            systemVoiceData = voices;
+                            initVoiceSetting(voices);
+                        }
+                        return true;
+                    } else {
+                        return false;
+                    }
+                }
+                let syncExist = getLocalVoice();
+                if (!syncExist) {
+                    if ("onvoiceschanged" in speechSynthesis) {
+                        speechSynthesis.onvoiceschanged = () => {
+                            getLocalVoice();
+                        }
+                    } else if (speechSynthesis.addEventListener) {
+                        speechSynthesis.addEventListener("voiceschanged", () => {
+                            getLocalVoice();
+                        })
+                    }
+                    let timeout = 0;
+                    let timer = setInterval(() => {
+                        if (getLocalVoice() || timeout > 1000) {
+                            if (timeout > 1000) {
+                                existVoice = 0;
+                            }
+                            clearInterval(timer);
+                            timer = null;
+                        }
+                        timeout += 300;
+                    }, 300)
+                }
+            }
+        };
+        const voicesEle = document.getElementById("preSetSpeech");
+        const initVoiceSetting = (voices) => {
+            if (existVoice < 4) {
+                let isOnline = existVoice === 2 || existVoice === 3;
+                // 支持中文和英文
+                voices = isOnline ? voices.filter(item => item.Locale.match(/^(zh-|en-)/)) : voices.filter(item => item.lang.match(/^(zh-|en-)/));
+                if (isOnline) {
+                    voices.map(item => {
+                        item.name = item.FriendlyName || (`${item.DisplayName} Online (${item.VoiceType}) - ${item.LocaleName}`);
+                        item.lang = item.Locale;
+                    })
+                } else if (isSafari && voices[0].voiceURI.startsWith("com.apple")) {
+                    voices = voices.filter(item => {
+                        return !item.voiceURI.startsWith("com.apple.voice.super-compact")
+                    })
+                }
+                voices.sort((a, b) => {
+                    if (a.lang.slice(0, 2) === b.lang.slice(0, 2)) {
+                        if (a.lang.slice(0, 2) === "zh") {
+                            return (a.lang === b.lang) ? 0 : (a.lang > b.lang) ? 1 : -1; // zh-CN 在前
+                        } else {
+                            return 0
+                        }
+                    }
+                    return (locale === "zh" ? (a.lang < b.lang) : (a.lang > b.lang)) ? 1 : -1; // 中文UI，则中文"z"在前
+                });
+                voices.map(item => {
+                    if (item.name.match(/^(Google |Microsoft )/)) {
+                        item.displayName = item.name.replace(/^.*? /, "");
+                    } else {
+                        item.displayName = item.name;
+                    }
+                });
+                if (isSafari && !isOnline) {
+                    voices.map(item => { item.displayName = `${item.name} (${item.lang})` });
+                };
+            };
+            voicesData = voices;
+            voicesEle.innerHTML = "";
+            voices.forEach((voice, i) => {
+                let option = document.createElement("option");
+                option.value = i;
+                option.text = voice.displayName;
+                voicesEle.options.add(option);
+            });
+            const loadAnother = (type) => {
+                type = type ^ 1;
+                let localVoice = localStorage.getItem("voice" + type);
+                if (localVoice) {
+                    let localIndex = voices.findIndex(item => { return item.name === localVoice });
+                    if (localIndex === -1) localIndex = 0;
+                    voiceRole[type] = voices[localIndex];
+                } else {
+                    voiceRole[type] = voices[0];
+                }
+                if (existVoice === 3) {
+                    let localStyle = localStorage.getItem("azureStyle" + type);
+                    azureStyle[type] = localStyle ? localStyle : void 0;
+                    let localRole = localStorage.getItem("azureRole" + type);
+                    azureRole[type] = localRole ? localRole : void 0;
+                }
+            }
+            voiceChange();
+            loadAnother(voiceType);
+        };
+        let voiceChange;
+        const initVoiceFunc = () => {
+            voicesEle.onchange = () => {
+                voiceRole[voiceType] = voicesData[voicesEle.value];
+                localStorage.setItem("voice" + voiceType, voiceRole[voiceType].name);
+                if (voiceRole[voiceType].StyleList || voiceRole[voiceType].RolePlayList) {
+                    document.getElementById("azureExtra").style.display = "block";
+                    let voiceStyles = voiceRole[voiceType].StyleList;
+                    let voiceRoles = voiceRole[voiceType].RolePlayList;
+                    if (voiceRoles) {
+                        preSetVoiceRole.innerHTML = "";
+                        let option = document.createElement("option");
+                        option.value = "Default";
+                        option.text = "Default";
+                        preSetVoiceRole.options.add(option);
+                        voiceRoles.forEach((role, i) => {
+                            let option = document.createElement("option");
+                            option.value = role;
+                            option.text = role;
+                            preSetVoiceRole.options.add(option);
+                        });
+                        let localRole = localStorage.getItem("azureRole" + voiceType);
+                        if (localRole && voiceRoles.indexOf(localRole) !== -1) {
+                            preSetVoiceRole.value = localRole;
+                            azureRole[voiceType] = localRole;
+                        } else {
+                            preSetVoiceRole.selectedIndex = 0;
+                            azureRole[voiceType] = voiceRole[0];
+                        }
+                        preSetVoiceRole.onchange = () => {
+                            azureRole[voiceType] = preSetVoiceRole.value;
+                            localStorage.setItem("azureRole" + voiceType, preSetVoiceRole.value);
+                        }
+                        preSetVoiceRole.dispatchEvent(new Event("change"));
+                    } else {
+                        azureRole[voiceType] = void 0;
+                        localStorage.removeItem("azureRole" + voiceType);
+                    }
+                    preSetVoiceRole.style.display = voiceRoles ? "block" : "none";
+                    preSetVoiceRole.previousElementSibling.style.display = voiceRoles ? "block" : "none";
+                    if (voiceStyles) {
+                        preSetVoiceStyle.innerHTML = "";
+                        let option = document.createElement("option");
+                        option.value = "Default";
+                        option.text = "Default";
+                        preSetVoiceStyle.options.add(option);
+                        voiceStyles.forEach((style, i) => {
+                            let option = document.createElement("option");
+                            option.value = style;
+                            option.text = style;
+                            preSetVoiceStyle.options.add(option);
+                        });
+                        let localStyle = localStorage.getItem("azureStyle" + voiceType);
+                        if (localStyle && voiceStyles.indexOf(localStyle) !== -1) {
+                            preSetVoiceStyle.value = localStyle;
+                            azureStyle[voiceType] = localStyle;
+                        } else {
+                            preSetVoiceStyle.selectedIndex = 0;
+                            azureStyle[voiceType] = voiceStyles[0];
+                        }
+                        preSetVoiceStyle.onchange = () => {
+                            azureStyle[voiceType] = preSetVoiceStyle.value;
+                            localStorage.setItem("azureStyle" + voiceType, preSetVoiceStyle.value)
+                        }
+                        preSetVoiceStyle.dispatchEvent(new Event("change"));
+                    } else {
+                        azureStyle[voiceType] = void 0;
+                        localStorage.removeItem("azureStyle" + voiceType);
+                    }
+                    preSetVoiceStyle.style.display = voiceStyles ? "block" : "none";
+                    preSetVoiceStyle.previousElementSibling.style.display = voiceStyles ? "block" : "none";
+                } else {
+                    document.getElementById("azureExtra").style.display = "none";
+                    azureRole[voiceType] = void 0;
+                    localStorage.removeItem("azureRole" + voiceType);
+                    azureStyle[voiceType] = void 0;
+                    localStorage.removeItem("azureStyle" + voiceType);
+                }
+            };
+            voiceChange = () => {
+                let localVoice = localStorage.getItem("voice" + voiceType);
+                if (localVoice) {
+                    let localIndex = voicesData.findIndex(item => { return item.name === localVoice });
+                    if (localIndex === -1) localIndex = 0;
+                    voiceRole[voiceType] = voicesData[localIndex];
+                    voicesEle.value = localIndex;
+                } else {
+                    voiceRole[voiceType] = voicesData[0];
+                }
+                voicesEle.dispatchEvent(new Event("change"));
+            }
+            let volumeEle = document.getElementById("voiceVolume");
+            let localVolume = localStorage.getItem("voiceVolume0");
+            voiceVolume[0] = parseFloat(localVolume || volumeEle.getAttribute("value"));
+            const voiceVolumeChange = () => {
+                let localVolume = localStorage.getItem("voiceVolume" + voiceType);
+                volumeEle.value = voiceVolume[voiceType] = parseFloat(localVolume || volumeEle.getAttribute("value"));
+                volumeEle.style.backgroundSize = (volumeEle.value - volumeEle.min) * 100 / (volumeEle.max - volumeEle.min) + "% 100%";
+            }
+            volumeEle.oninput = () => {
+                volumeEle.style.backgroundSize = (volumeEle.value - volumeEle.min) * 100 / (volumeEle.max - volumeEle.min) + "% 100%";
+                voiceVolume[voiceType] = parseFloat(volumeEle.value);
+                localStorage.setItem("voiceVolume" + voiceType, volumeEle.value);
+            }
+            voiceVolumeChange();
+            let rateEle = document.getElementById("voiceRate");
+            let localRate = localStorage.getItem("voiceRate0");
+            voiceRate[0] = parseFloat(localRate || rateEle.getAttribute("value"));
+            const voiceRateChange = () => {
+                let localRate = localStorage.getItem("voiceRate" + voiceType);
+                rateEle.value = voiceRate[voiceType] = parseFloat(localRate || rateEle.getAttribute("value"));
+                rateEle.style.backgroundSize = (rateEle.value - rateEle.min) * 100 / (rateEle.max - rateEle.min) + "% 100%";
+            }
+            rateEle.oninput = () => {
+                rateEle.style.backgroundSize = (rateEle.value - rateEle.min) * 100 / (rateEle.max - rateEle.min) + "% 100%";
+                voiceRate[voiceType] = parseFloat(rateEle.value);
+                localStorage.setItem("voiceRate" + voiceType, rateEle.value);
+            }
+            voiceRateChange();
+            let pitchEle = document.getElementById("voicePitch");
+            let localPitch = localStorage.getItem("voicePitch0");
+            voicePitch[0] = parseFloat(localPitch || pitchEle.getAttribute("value"));
+            const voicePitchChange = () => {
+                let localPitch = localStorage.getItem("voicePitch" + voiceType);
+                pitchEle.value = voicePitch[voiceType] = parseFloat(localPitch || pitchEle.getAttribute("value"));
+                pitchEle.style.backgroundSize = (pitchEle.value - pitchEle.min) * 100 / (pitchEle.max - pitchEle.min) + "% 100%";
+            }
+            pitchEle.oninput = () => {
+                pitchEle.style.backgroundSize = (pitchEle.value - pitchEle.min) * 100 / (pitchEle.max - pitchEle.min) + "% 100%";
+                voicePitch[voiceType] = parseFloat(pitchEle.value);
+                localStorage.setItem("voicePitch" + voiceType, pitchEle.value);
+            }
+            voicePitchChange();
+            document.getElementById("voiceTypes").onclick = (ev) => {
+                let type = ev.target.dataset.type;
+                if (type !== void 0) {
+                    type = parseInt(type);
+                    if (type != voiceType) {
+                        voiceType = type;
+                        ev.target.classList.add("selVoiceType");
+                        ev.target.parentElement.children[type ^ 1].classList.remove("selVoiceType");
+                        voiceChange();
+                        voiceVolumeChange();
+                        voiceRateChange();
+                        voicePitchChange();
+                    }
+                };
+            };
+            const voiceTestEle = document.getElementById("testVoiceText");
+            let localTestVoice = localStorage.getItem("voiceTestText");
+            voiceTestText = voiceTestEle.value = localTestVoice || voiceTestEle.getAttribute("value");
+            voiceTestEle.oninput = () => {
+                voiceTestText = voiceTestEle.value;
+                localStorage.setItem("voiceTestText", voiceTestText);
+            }
+            const contVoiceEle = document.getElementById("enableContVoice");
+            let localCont = localStorage.getItem("enableContVoice");
+            contVoiceEle.checked = enableContVoice = (localCont || contVoiceEle.getAttribute("checked")) === "true";
+            contVoiceEle.onchange = () => {
+                enableContVoice = contVoiceEle.checked;
+                localStorage.setItem("enableContVoice", enableContVoice);
+            }
+            contVoiceEle.dispatchEvent(new Event("change"));
+            const autoVoiceEle = document.getElementById("enableAutoVoice");
+            let localAuto = localStorage.getItem("enableAutoVoice");
+            autoVoiceEle.checked = enableAutoVoice = (localAuto || autoVoiceEle.getAttribute("checked")) === "true";
+            autoVoiceEle.onchange = () => {
+                enableAutoVoice = autoVoiceEle.checked;
+                localStorage.setItem("enableAutoVoice", enableAutoVoice);
+            }
+            autoVoiceEle.dispatchEvent(new Event("change"));
+        };
+        initVoiceFunc();
+        speechServiceEle.dispatchEvent(new Event("change"));
+    </script>
+    <script crossorigin="anonymous"
+        src="https://fastly.jsdelivr.net/npm/markdown-it@14.1.0/dist/markdown-it.min.js"></script>
+    <script crossorigin="anonymous"
+        src="https://fastly.jsdelivr.net/gh/highlightjs/cdn-release@11.9.0/build/highlight.min.js"></script>
+    <script crossorigin="anonymous" src="https://fastly.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js"></script>
+    <script>
+        // Relative API paths and script-level state shared by the chat code below.
+        const API_URL = "v1/chat/completions";
+        const Gemini_API_URL = "v1/models/";
+        const Claude_API_URL = "v1/messages";
+        let loading = false;
+        // Preset role prompts, taken from the translations of the active locale.
+        let presetRoleData = {
+            "default": translations[locale]["defaultText"],
+            "normal": translations[locale]["assistantText"],
+            "cat": translations[locale]["catText"],
+            "emoji": translations[locale]["emojiText"],
+            "image": translations[locale]["imageText"]
+        };
+        let modelVersion; // model version
+        let modelType; // 1: ChatGPT, 2: Gemini, 3: Claude
+        let apiHost; // API reverse-proxy address
+        let geminiApiHost;
+        let claudeApiHost;
+        let apiSelects = []; // list of API addresses
+        let geminiApiSelects = [];
+        let claudeApiSelects = [];
+        let customAPIKey; // custom API key
+        let geminiAPIKey;
+        let claudeAPIKey;
+        let customAPIModel; // custom API model
+        let geminiAPIModel;
+        let claudeAPIModel;
+        let systemRole; // custom system role
+        let roleNature; // role personality
+        let roleTemp; // answer quality
+        let convWidth = []; // conversation width, 0: window width, 1: full-screen width
+        let textSpeed; // typewriter speed, smaller is faster
+        let contLen; // continuous-conversation length, default 25 (the chat carries 25 context messages); set to 0 to disable continuous conversation
+        let enableLongReply; // whether long replies are enabled; off by default, enabling may increase API cost
+        let longReplyFlag;
+        let voiceIns; // Audio or SpeechSynthesisUtterance
+        const isFirefox = !!navigator.userAgent.match(/firefox/i);
+        const supportMSE = !!window.MediaSource && !isFirefox; // whether MSE is supported (should be everywhere except iOS)
+        const voiceMIME = "audio/mpeg";
+        const voiceFormat = "audio-24khz-48kbitrate-mono-mp3";
+        const voicePreLen = 130;
+        const voiceSuffix = ".mp3";
+        const openAIVoiceSuffix = ".aac";
+        let userAvatar; // user avatar
+        let customDarkOut;
+        let isCaseSearch; // whether search is case-sensitive
+        let controller;
+        let controllerId;
+        const findOffsetTop = (ele, target) => { // target is positioned ancestor element
+            if (ele.offsetParent !== target) return ele.offsetTop + findOffsetTop(ele.offsetParent, target);
+            else return ele.offsetTop;
+        }
+        const findResEle = (ele) => {
+            if (!ele.classList.contains("response")) return findResEle(ele.parentElement);
+            else return ele;
+        }
+        const isContentBottom = (ele) => {
+            if (refreshIdx !== void 0) {
+                return currentResEle.clientHeight + currentResEle.offsetTop > messagesEle.scrollTop + messagesEle.clientHeight
+            } else {
+                return messagesEle.scrollHeight - messagesEle.scrollTop - messagesEle.clientHeight < 128;
+            }
+        }
+        const isEleBottom = (ele) => {
+            return ele.clientHeight + findOffsetTop(ele, messagesEle) > messagesEle.scrollTop + messagesEle.clientHeight;
+        }
+        const outOfMsgWindow = (ele) => {
+            return ele.offsetTop > messagesEle.scrollTop + messagesEle.clientHeight || ele.offsetTop + ele.clientHeight < messagesEle.scrollTop
+        }
+        const scrollToBottom = () => {
+            if (isContentBottom()) {
+                if (refreshIdx !== void 0) {
+                    messagesEle.scrollTo(0, currentResEle.clientHeight + currentResEle.offsetTop - messagesEle.clientHeight + 10)
+                } else {
+                    messagesEle.scrollTo(0, messagesEle.scrollHeight)
+                }
+            }
+        }
+        const scrollToBottomLoad = (ele) => {
+            if (!controller || !ele.offsetParent) return;
+            if (isEleBottom(ele)) {
+                let resEle = findResEle(ele)
+                messagesEle.scrollTo(0, resEle.clientHeight + resEle.offsetTop - messagesEle.clientHeight + 10)
+            }
+        }
+        // Hide `ele`, read its offsetHeight, then clear the inline display value again.
+        // The offsetHeight read is deliberate: its result is unused, so the statement exists
+        // only for the layout work the read triggers while the element is hidden.
+        const forceRepaint = (ele) => {
+            ele.style.display = "none";
+            ele.offsetHeight;
+            ele.style.display = null;
+        }
+        const escapeTextarea = document.createElement("textarea");
+        const getEscape = str => {
+            escapeTextarea.textContent = str;
+            return escapeTextarea.innerHTML;
+        }
+        const parser = new DOMParser();
+        const getUnescape = html => {
+            return parser.parseFromString(html, 'text/html').body.innerText;
+        }
+        // Backslash-escape every regular-expression metacharacter in `str` so the result can be
+        // used as a literal pattern.
+        const escapeRegexExp = (str) => { // from vscode src/vs/base/common/strings.ts escapeRegExpCharacters
+            return str.replace(/[\\\{\}\*\+\?\|\^\$\.\[\]\(\)]/g, '\\$&');
+        }
+        const checkStorage = () => {
+            let used = 0;
+            for (let key in localStorage) {
+                localStorage.hasOwnProperty(key) && (used += localStorage[key].length)
+            }
+            let remain = 5242880 - used;
+            usedStorageBar.style.width = (used / 5242880 * 100).toFixed(2) + "%";
+            let usedMBs = used / 1048576;
+            usedStorage.textContent = (usedMBs < 1 ? usedMBs.toPrecision(2) : usedMBs.toFixed(2)) + "MB";
+            availableStorage.textContent = Math.floor(remain / 1048576 * 100) / 100 + "MB";
+        };
+        // Matches a backslash followed by a space or an ASCII punctuation character; used below
+        // to drop the escaping backslash from superscript/subscript content.
+        const UNESCAPE_RE = /\\([ \\!"#$%&'()*+,.\/:;<=>?@[\]^_`{|}~-])/g;
+        // markdown-it inline rule for `^text^`: emits sup_open / text / sup_close tokens.
+        // Returns true when a superscript was consumed, false (with state.pos restored) otherwise.
+        const superscript = (state, silent) => {
+            let found,
+                content,
+                token,
+                max = state.posMax,
+                start = state.pos;
+            if (state.src.charCodeAt(start) !== 0x5E/* ^ */) { return false; }
+            if (silent) { return false; } // don't run any pairs in validation mode
+            if (start + 2 >= max) { return false; }
+            state.pos = start + 1;
+            // Scan forward token by token for the closing marker.
+            while (state.pos < max) {
+                if (state.src.charCodeAt(state.pos) === 0x5E/* ^ */) {
+                    found = true;
+                    break;
+                }
+                state.md.inline.skipToken(state);
+            }
+            // No closing marker, or nothing between the two markers.
+            if (!found || start + 1 === state.pos) {
+                state.pos = start;
+                return false;
+            }
+            content = state.src.slice(start + 1, state.pos);
+            // don't allow unescaped spaces/newlines inside
+            if (content.match(/(^|[^\\])(\\\\)*\s/)) {
+                state.pos = start;
+                return false;
+            }
+            // found!
+            state.posMax = state.pos;
+            state.pos = start + 1;
+            // Earlier we checked !silent, but this implementation does not need it
+            token = state.push('sup_open', 'sup', 1);
+            token.markup = '^';
+            token = state.push('text', '', 0);
+            token.content = content.replace(UNESCAPE_RE, '$1');
+            token = state.push('sup_close', 'sup', -1);
+            token.markup = '^';
+            // Continue after the closing marker and restore the original scan limit.
+            state.pos = state.posMax + 1;
+            state.posMax = max;
+            return true;
+        }
+        // markdown-it inline rule for `~text~`: emits sub_open / text / sub_close tokens.
+        // Returns true when a subscript was consumed, false (with state.pos restored) otherwise.
+        const subscript = (state, silent) => {
+            let found,
+                content,
+                token,
+                max = state.posMax,
+                start = state.pos;
+            if (state.src.charCodeAt(start) !== 0x7E/* ~ */) { return false; }
+            if (silent) { return false; } // don't run any pairs in validation mode
+            if (start + 2 >= max) { return false; }
+            state.pos = start + 1;
+            // Scan forward token by token for the closing marker.
+            while (state.pos < max) {
+                if (state.src.charCodeAt(state.pos) === 0x7E/* ~ */) {
+                    found = true;
+                    break;
+                }
+                state.md.inline.skipToken(state);
+            }
+            // No closing marker, or nothing between the two markers.
+            if (!found || start + 1 === state.pos) {
+                state.pos = start;
+                return false;
+            }
+            content = state.src.slice(start + 1, state.pos);
+            // don't allow unescaped spaces/newlines inside
+            if (content.match(/(^|[^\\])(\\\\)*\s/)) {
+                state.pos = start;
+                return false;
+            }
+            // found!
+            state.posMax = state.pos;
+            state.pos = start + 1;
+            // Earlier we checked !silent, but this implementation does not need it
+            token = state.push('sub_open', 'sub', 1);
+            token.markup = '~';
+            token = state.push('text', '', 0);
+            token.content = content.replace(UNESCAPE_RE, '$1');
+            token = state.push('sub_close', 'sub', -1);
+            token.markup = '~';
+            // Continue after the closing marker and restore the original scan limit.
+            state.pos = state.posMax + 1;
+            state.posMax = max;
+            return true;
+        }
+        const mermaidMap = {};
+        const mermaidQuene = new Map();
+        let mermaidTimeout;
+        let loadingMermaid = false;
+        const loadMermaidQuene = () => {
+            mermaidQuene.forEach((content, id) => { loadSingleMermaid(id, content) });
+            mermaidQuene.clear();
+        };
+        const loadSingleMermaid = async (id, content) => {
+            let contianer = document.querySelector("#" + id);
+            if (!contianer || contianer.children.length) return;
+            let result;
+            try {
+                let { svg } = await mermaid.render(mermaid.detectType(content), content, contianer);
+                result = svg;
+            } catch (error) {
+                return;
+            }
+            result = result.replace(/(<svg[^>]*?)\sstyle="[^"]*"/i, '$1');
+            mermaidMap[id] = result;
+            document.querySelectorAll('#' + id).forEach(item => { item.innerHTML = result });
+        };
+        // Schedule rendering of the mermaid diagram `content` into the element with id `id`,
+        // loading the mermaid script on first use.
+        // While a reply is streaming (`loading`) and `bounce` is false, rendering is debounced
+        // by 600ms; otherwise the diagram is queued or rendered right away.
+        const loadRunMermaid = (id, content, bounce = false) => {
+            if (typeof mermaid === 'undefined') {
+                // The library is not available yet: inject the script once.
+                if (!loadingMermaid) {
+                    loadingMermaid = true;
+                    let script = document.createElement("script");
+                    script.crossOrigin = "anonymous";
+                    script.src = "https://fastly.jsdelivr.net/npm/mermaid@10.9.1/dist/mermaid.min.js";
+                    script.onload = async () => {
+                        loadingMermaid = false;
+                        mermaid.mermaidAPI.initialize({
+                            startOnLoad: false
+                        });
+                        // Render everything that was queued while the script was loading.
+                        loadMermaidQuene();
+                    }
+                    document.body.appendChild(script);
+                }
+                if (loading && !bounce) {
+                    // Debounce: only the most recent call within 600ms gets queued.
+                    clearTimeout(mermaidTimeout);
+                    mermaidTimeout = setTimeout(() => {
+                        mermaidQuene.set(id, content);
+                        // The script may have finished loading in the meantime; flush the queue then.
+                        if (typeof mermaid !== 'undefined') loadMermaidQuene();
+                    }, 600)
+                } else {
+                    mermaidQuene.set(id, content);
+                }
+            } else {
+                if (loading && !bounce) {
+                    // Debounce: only the most recent call within 600ms is rendered.
+                    clearTimeout(mermaidTimeout);
+                    mermaidTimeout = setTimeout(() => { loadSingleMermaid(id, content) }, 600)
+                } else {
+                    // Defer to a timer task instead of rendering synchronously.
+                    setTimeout(() => { loadSingleMermaid(id, content) });
+                }
+            }
+        };
+        // Shared markdown-it instance used to render chat messages.
+        const md = markdownit({
+            breaks: true,
+            linkify: true,
+            // Highlight with highlight.js auto-detection; the declared `lang` is not used.
+            // Returns "" when highlighting throws.
+            highlight: function (str, lang) {
+                try {
+                    return hljs.highlightAuto(str).value;
+                } catch (e) { }
+                return "";
+            }
+        });
+        // Register the ^sup^ and ~sub~ inline rules after the built-in "emphasis" rule.
+        md.inline.ruler.after("emphasis", "sup", superscript);
+        md.inline.ruler.after("emphasis", "sub", subscript);
+        // Math rendering through the texmath plugin with KaTeX as its engine.
+        md.use(texmath, { engine: katex, delimiters: ["brackets", "dollars"] });
+        md.renderer.rules.link_open = (tokens, idx, options, env, self) => {
+            let aIndex = tokens[idx].attrIndex("target");
+            if (tokens[idx + 1] && tokens[idx + 1].type === "image") tokens[idx].attrPush(["download", ""]);
+            else if (aIndex < 0) tokens[idx].attrPush(["target", "_blank"]);
+            else tokens[idx].attrs[aIndex][1] = "_blank";
+            return self.renderToken(tokens, idx, options);
+        };
+        const codeUtils = {
+            getCodeLang(str = "") {
+                const res = str.match(/ class="language-(.*?)"/);
+                return (res && res[1]) || "";
+            },
+            getFragment(str = "") {
+                return str ? `<span class="u-mdic-copy-code_lang" text="${str}"></span>` : "";
+            },
+        };
+        const getCodeLangFragment = (oriStr = "") => {
+            return codeUtils.getFragment(codeUtils.getCodeLang(oriStr));
+        };
+        const copyClickCode = (ele) => {
+            const input = document.createElement("textarea");
+            input.value = ele.parentElement.nextElementSibling.textContent;
+            const nDom = ele.previousElementSibling;
+            const nDelay = ele.dataset.mdicNotifyDelay;
+            const cDom = nDom.previousElementSibling;
+            document.body.appendChild(input);
+            input.select();
+            input.setSelectionRange(0, input.value.length);
+            document.execCommand("copy");
+            document.body.removeChild(input);
+            if (nDom.style.display === "none") {
+                nDom.style.display = "block";
+                cDom && (cDom.style.display = "none");
+                setTimeout(() => {
+                    nDom.style.display = "none";
+                    cDom && (cDom.style.display = "block");
+                }, nDelay);
+            }
+        };
+        const copyClickMd = (idx) => {
+            const input = document.createElement("textarea");
+            input.value = data[idx].content;
+            document.body.appendChild(input);
+            input.select();
+            input.setSelectionRange(0, input.value.length);
+            document.execCommand("copy");
+            document.body.removeChild(input);
+        }
+        const downloadSVG = (target, ev) => {
+            if (ev.target !== target) {
+                let blob = new Blob([target.innerHTML], { type: "image/svg+xml" });
+                downBlob(blob, "mermaid-" + target.children[0].getAttribute("aria-roledescription") + ".svg")
+            }
+        }
+        const enhanceCode = (render, options = {}) => (...args) => {
+            const {
+                btnText = translations[locale]["copyCode"], // button text
+                successText = translations[locale]["copySuccess"], // copy-success text
+                successTextDelay = 2000, // successText show time [ms]
+                showCodeLanguage = true, // false | show code language
+            } = options;
+            const [tokens = [], idx = 0] = args;
+            const originResult = render.apply(this, args);
+            const langFrag = showCodeLanguage ? getCodeLangFragment(originResult) : "";
+            const tpls = [
+                '<div class="m-mdic-copy-wrapper">',
+                `${langFrag}`,
+                `<div class="u-mdic-copy-notify" style="display:none;" text="${successText}"></div>`,
+                `<button class="u-mdic-copy-btn j-mdic-copy-btn" text="${btnText}" data-mdic-notify-delay="${successTextDelay}" onclick="copyClickCode(this)"></button>`,
+                '</div>',
+            ];
+            if (tokens[idx].type === "fence" && langFrag.indexOf(`text="mermaid"`) !== -1) {
+                let hash = "mermaid" + createHash().update(tokens[idx].content).digest("hex");
+                if (mermaidMap[hash]) return originResult.replace("<pre>", `<pre>${tpls.join("")}`).replace("<pre>", `<div class="mermaid" id="${hash}" onclick="downloadSVG(this, event)">${mermaidMap[hash]}</div><pre>`);
+                loadRunMermaid(hash, tokens[idx].content, tokens.length - 1 > idx);
+                return originResult.replace("<pre>", `<pre>${tpls.join("")}`).replace("<pre>", `<div class="mermaid" id="${hash}" onclick="downloadSVG(this, event)"></div><pre>`)
+            } else {
+                return originResult.replace("<pre>", `<pre>${tpls.join("")}`);
+            }
+        };
+        md.renderer.rules.code_block = enhanceCode(md.renderer.rules.code_block);
+        md.renderer.rules.fence = enhanceCode(md.renderer.rules.fence);
+        md.renderer.rules.image = function (tokens, idx, options, env, self) {
+            let token = tokens[idx];
+            token.attrs[token.attrIndex("alt")][1] = self.renderInlineAsText(token.children, options, env);
+            token.attrSet("onload", "scrollToBottomLoad(this);this.removeAttribute('onload');this.removeAttribute('onerror')");
+            token.attrSet("onerror", "scrollToBottomLoad(this);this.removeAttribute('onload');this.removeAttribute('onerror')");
+            token.attrPush(["decoding", "async"]);
+            token.attrPush(["loading", "lazy"]);
+            return self.renderToken(tokens, idx, options)
+        }
+        let currentVoiceIdx;
+        let editingIdx;
+        let originText;
+        const resumeSend = () => {
+            if (editingIdx !== void 0) {
+                chatlog.children[systemRole ? editingIdx - 1 : editingIdx].classList.remove("showEditReq");
+            }
+            sendBtnEle.children[0].textContent = translations[locale]["send"];
+            inputAreaEle.value = originText;
+            clearEle.title = translations[locale]["clearChat"];
+            clearEle.classList.remove("closeConv");
+            originText = void 0;
+            editingIdx = void 0;
+        }
+        const mdOptionEvent = function (ev) {
+            let id = ev.target.dataset.id;
+            if (id) {
+                let parent = ev.target.parentElement;
+                let idxEle = parent.parentElement;
+                let idx = Array.prototype.indexOf.call(chatlog.children, this.parentElement);
+                if (id === "voiceBtn" || id === "speechMd" || id === "pauseMd" || id === "resumeMd") {
+                    let classList = parent.dataset.id === "voiceBtn" ? parent.classList : ev.target.classList;
+                    if (classList.contains("readyVoice")) {
+                        if (chatlog.children[idx].dataset.loading !== "true") {
+                            idx = systemRole ? idx + 1 : idx;
+                            speechEvent(idx);
+                        }
+                    } else if (classList.contains("pauseVoice")) {
+                        if (voiceIns) {
+                            if (voiceIns instanceof Audio) voiceIns.pause();
+                            else {
+                                if (supportSpe) speechSynthesis.pause();
+                                classList.remove("readyVoice");
+                                classList.remove("pauseVoice");
+                                classList.add("resumeVoice");
+                            }
+                        }
+                    } else {
+                        if (voiceIns) {
+                            if (voiceIns instanceof Audio) voiceIns.play();
+                            else {
+                                if (supportSpe) speechSynthesis.resume();
+                                classList.remove("readyVoice");
+                                classList.remove("resumeVoice");
+                                classList.add("pauseVoice");
+                            }
+                        }
+                    }
+                } else if (id === "editMd") {
+                    let reqEle = chatlog.children[idx];
+                    idx = systemRole ? idx + 1 : idx;
+                    if (editingIdx === idx) return;
+                    if (editingIdx !== void 0) {
+                        chatListEle.children[systemRole ? editingIdx - 1 : editingIdx].classList.remove("showEditReq");
+                    }
+                    reqEle.classList.add("showEditReq");
+                    editingIdx = idx;
+                    originText = inputAreaEle.value;
+                    inputAreaEle.value = data[idx].content;
+                    inputAreaEle.dispatchEvent(new Event("input"));
+                    inputAreaEle.focus();
+                    sendBtnEle.children[0].textContent = translations[locale]["update"];
+                    clearEle.title = translations[locale]["cancel"];
+                    clearEle.classList.add("closeConv");
+                } else if (id === "refreshMd") {
+                    if (noLoading()) {
+                        formatAvatarEle(chatlog.children[idx].children[0], modelVersion);
+                        if (ev.target.classList.contains("refreshReq")) {
+                            chatlog.children[idx].children[1].innerHTML = "<p class='cursorCls'><br /></p>";
+                            chatlog.children[idx].dataset.loading = true;
+                            idx = systemRole ? idx + 1 : idx;
+                            data[idx].content = "";
+                            if (idx === data.findIndex(item => { return item.role === "assistant" })) activeChatEle.children[1].children[1].textContent = "";
+                            if (idx === currentVoiceIdx) endSpeak();
+                            loadAction(true);
+                            refreshIdx = idx;
+                            streamGen();
+                        } else {
+                            chatlog.children[idx].dataset.loading = true;
+                            idx = systemRole ? idx + 1 : idx;
+                            progressData = data[idx].content;
+                            loadAction(true);
+                            refreshIdx = idx;
+                            streamGen(true);
+                        }
+                    }
+                } else if (id === "copyMd") {
+                    idx = systemRole ? idx + 1 : idx;
+                    copyClickMd(idx);
+                    notyf.success(translations[locale]["copySuccess"]);
+                } else if (id === "delMd") {
+                    if (noLoading()) {
+                        if (confirmAction(translations[locale]["delMsgTip"])) {
+                            chatlog.removeChild(chatlog.children[idx]);
+                            idx = systemRole ? idx + 1 : idx;
+                            let firstIdx = data.findIndex(item => { return item.role === "assistant" });
+                            if (currentVoiceIdx !== void 0) {
+                                if (currentVoiceIdx === idx) { endSpeak() }
+                                else if (currentVoiceIdx > idx) { currentVoiceIdx-- }
+                            }
+                            if (editingIdx !== void 0) {
+                                if (editingIdx === idx) { resumeSend() }
+                                else if (editingIdx > idx) { editingIdx-- }
+                            }
+                            data.splice(idx, 1);
+                            if (firstIdx === idx) updateChatPre();
+                            updateChats();
+                        }
+                    }
+                } else if (id === "downAudioMd") {
+                    if (chatlog.children[idx].dataset.loading !== "true") {
+                        idx = systemRole ? idx + 1 : idx;
+                        downloadAudio(idx);
+                    }
+                }
+            }
+        }
+        const formatAvatarEle = (ele, model) => {
+            ele.className = "chatAvatar";
+            if (ele.parentElement.className === "request") {
+                ele.innerHTML = `<img src="${userAvatar}" />`;
+            } else {
+                if (model.startsWith("gpt")) {
+                    ele.classList.add("gptAvatar")
+                    ele.innerHTML = `<svg width="24" height="24"><use xlink:href="#aiIcon"></use></svg>`;
+                } else if (model.startsWith("gemini")) {
+                    ele.innerHTML = `<svg width="30" height="30"><use xlink:href="#geminiIcon"></use></svg>`;
+                } else {
+                    ele.innerHTML = `<svg width="30" height="30"><use xlink:href="#claudeIcon"></use></svg>`;
+                }
+            }
+        }
+        const formatMdEle = (ele, model) => { // Fill a message row with its avatar, its body container and its option bar.
+            let avatar = document.createElement("div");
+            ele.appendChild(avatar); // attached before formatting because formatAvatarEle reads avatar.parentElement
+            formatAvatarEle(avatar, model);
+            let realMd = document.createElement("div");
+            realMd.className = ele.className === "request" ? "requestBody" : "markdown-body";
+            ele.appendChild(realMd);
+            let mdOption = document.createElement("div");
+            mdOption.className = "mdOption";
+            ele.appendChild(mdOption);
+            let optionWidth = existVoice >= 2 ? 140 : 105; // wider menu when the extra download-audio item is included
+            mdOption.innerHTML += `<div class="optionItems" style="width:${optionWidth}px;left:-${optionWidth - 10}px">`
+                + (ele.className === "request" ? `<div data-id="editMd" class="optionItem" title="${translations[locale]["edit"]}">
+                <svg width="18" height="18"><use xlink:href="#chatEditIcon" /></svg>
+                </div>` : `<div data-id="refreshMd" class="refreshReq optionItem" title="${translations[locale]["refresh"]}">
+                <svg width="18" height="18" ><use xlink:href="#refreshIcon" /></svg>
+                <svg width="18" height="18" ><use xlink:href="#halfRefIcon" /></svg>
+                </div>`) +
+                `<div data-id="copyMd" class="optionItem" title="${translations[locale]["copy"]}">
+                <svg width="20" height="20"><use xlink:href="#copyIcon" /></svg>
+            </div>
+            <div data-id="delMd" class="optionItem" title="${translations[locale]["del"]}">
+                <svg width="20" height="20"><use xlink:href="#delIcon" /></svg>
+            </div>` + (existVoice >= 2 ? `<div data-id="downAudioMd" class="optionItem" title="${translations[locale]["downAudio"]}">
+                <svg width="20" height="20"><use xlink:href="#downAudioIcon" /></svg>
+            </div>` : "") + `</div>`;
+            if (existVoice) { // any truthy existVoice adds the speech / pause / resume button
+                mdOption.innerHTML += `<div class="voiceCls readyVoice" data-id="voiceBtn">
+                <svg width="20" height="20" role="img" data-id="speechMd"><title>${translations[locale]["speech"]}</title><use xlink:href="#readyVoiceIcon" /></svg>
+                <svg width="20" height="20" role="img" data-id="pauseMd"><title>${translations[locale]["pause"]}</title><use xlink:href="#pauseVoiceIcon" /></svg>
+                <svg width="20" height="20" role="img" data-id="resumeMd"><title>${translations[locale]["resume"]}</title><use xlink:href="#resumeVoiceIcon" /></svg>
+                </div>`
+            }
+            mdOption.onclick = mdOptionEvent; // a single click handler on the bar serves all of the options above
+        }
+        let allListEle = chatListEle.parentElement; // container that the handlers below search for the active / dragging marker classes
+        let folderData = []; // folders, each { name, idxs } where idxs holds chatsData indices
+        let chatsData = []; // every chat, each { name, data }
+        let chatIdxs = []; // chatsData indices of the chats listed directly in chatListEle (not in a folder), in row order
+        let searchIdxs = []; // read in activeChat as searchIdxs[chatIdx]: message index to scroll to, -1 scrolls to the top; filled outside this section
+        let activeChatIdx = 0; // chatsData index of the active chat; -1 is used transiently to mean "no chat left"
+        let activeChatEle; // row element of the active chat (assigned in activeChat)
+        let operateChatIdx, operateFolderIdx; // chat / folder currently being renamed, undefined otherwise
+        let dragLi, dragType, dragIdx; // drag in progress: source element, "chat" or "folder", and the dragged index
+        let mobileDragOut; // timer handle of the pending mobile long-press (set in mobileDragStartEV)
+        const mobileDragStartEV = function (ev) {
+            if (mobileDragOut !== void 0) {
+                clearTimeout(mobileDragOut);
+                mobileDragOut = void 0;
+            }
+            mobileDragOut = setTimeout(() => {
+                this.setAttribute("draggable", "true");
+                this.dispatchEvent(ev);
+            }, 200);
+        };
+        if (isMobile) {
+            let stopDragOut = () => {
+                if (mobileDragOut !== void 0) {
+                    clearTimeout(mobileDragOut);
+                    mobileDragOut = void 0;
+                }
+            };
+            let stopDrag = () => {
+                stopDragOut();
+                document.querySelectorAll("[draggable=true]").forEach(ele => {
+                    ele.setAttribute("draggable", "false");
+                })
+            };
+            document.body.addEventListener("touchmove", stopDragOut);
+            document.body.addEventListener("touchend", stopDrag);
+            document.body.addEventListener("touchcancel", stopDrag);
+        };
+        const delDragIdx = () => {
+            let chatIdx = chatIdxs.indexOf(dragIdx);
+            if (chatIdx !== -1) {
+                chatIdxs.splice(chatIdx, 1);
+            } else {
+                folderData.forEach((item, i) => {
+                    let inIdx = item.idxs.indexOf(dragIdx);
+                    if (inIdx !== -1) {
+                        item.idxs.splice(inIdx, 1);
+                        updateFolder(i);
+                    }
+                })
+            }
+        }
+        const updateFolder = (idx) => {
+            let folderEle = folderListEle.children[idx];
+            let childLen = folderData[idx].idxs.length;
+            folderEle.children[0].children[1].children[1].textContent = childLen + translations[locale]["chats"];
+            folderEle.classList.toggle("expandFolder", childLen);
+        }
+        folderListEle.ondragenter = chatListEle.ondragenter = function (ev) { // Live reorder: on each dragenter, move the dragged chat/folder (DOM node and data) to the entered position.
+            ev.preventDefault();
+            if (ev.target === dragLi) return; // entering the dragged element itself: nothing to move
+            allListEle.querySelectorAll(".dragingChat").forEach(ele => { // clear the previous drop-target highlight
+                ele.classList.remove("dragingChat");
+            })
+            if (dragType === "chat") {
+                if (this === chatListEle) { // a chat is dragged over the top-level list
+                    this.classList.add("dragingChat");
+                    let dragindex = Array.prototype.indexOf.call(chatListEle.children, dragLi);
+                    let targetindex = Array.prototype.indexOf.call(chatListEle.children, ev.target);
+                    delDragIdx(); // detach from the old list first, so the positions used below refer to the list without it
+                    if (targetindex !== -1) { // entered another top-level row
+                        chatIdxs.splice(targetindex, 0, dragIdx);
+                        if (dragindex === -1 || dragindex >= targetindex) { // came from a folder or from further down: place before the target
+                            chatListEle.insertBefore(dragLi, ev.target);
+                        } else { // came from further up: place after the target
+                            chatListEle.insertBefore(dragLi, ev.target.nextElementSibling);
+                        }
+                    } else { // target is not a direct child row of the list: append at the end
+                        chatIdxs.push(dragIdx);
+                        chatListEle.appendChild(dragLi);
+                    }
+                } else if (this === folderListEle) { // a chat is dragged over the folder list
+                    let folderIdx;
+                    if (ev.target.classList.contains("headLi")) { // over a folder head: move to the end of that folder
+                        ev.target.parentElement.classList.add("dragingChat");
+                        ev.target.nextElementSibling.appendChild(dragLi);
+                        delDragIdx();
+                        folderIdx = Array.prototype.indexOf.call(folderListEle.children, ev.target.parentElement);
+                        folderData[folderIdx].idxs.push(dragIdx);
+                        updateFolder(folderIdx);
+                    } else if (ev.target.classList.contains("chatLi")) { // over a chat row that sits inside a folder
+                        ev.target.parentElement.parentElement.classList.add("dragingChat");
+                        let parent = ev.target.parentElement;
+                        delDragIdx();
+                        folderIdx = Array.prototype.indexOf.call(folderListEle.children, parent.parentElement);
+                        let dragindex = Array.prototype.indexOf.call(parent.children, dragLi);
+                        let targetindex = Array.prototype.indexOf.call(parent.children, ev.target);
+                        if (dragindex !== -1) { // already in this folder: reorder around the target
+                            folderData[folderIdx].idxs.splice(targetindex, 0, dragIdx);
+                            if (dragindex < targetindex) {
+                                parent.insertBefore(dragLi, ev.target.nextElementSibling);
+                            } else {
+                                parent.insertBefore(dragLi, ev.target);
+                            }
+                        } else { // coming from another list: append to this folder regardless of the target's position
+                            folderData[folderIdx].idxs.push(dragIdx);
+                            parent.appendChild(dragLi);
+                        }
+                        updateFolder(folderIdx);
+                    }
+                }
+                updateChatIdxs(); // persist the new ordering
+            } else if (dragType === "folder") {
+                if (this === folderListEle) {
+                    let dragindex = Array.prototype.indexOf.call(folderListEle.children, dragLi);
+                    let folderIdx = Array.prototype.findIndex.call(folderListEle.children, (item) => { // folder node containing the entered element
+                        return item.contains(ev.target);
+                    })
+                    folderListEle.children[folderIdx].classList.remove("expandFolder"); // NOTE(review): folderIdx is -1 if the target is the list itself, which would throw here; confirm whether that can happen
+                    let folderEle = folderListEle.children[folderIdx];
+                    let data = folderData.splice(dragindex, 1)[0]; // block-scoped `data`: the moved folder record
+                    folderData.splice(folderIdx, 0, data);
+                    if (dragindex === -1 || dragindex >= folderIdx) {
+                        folderListEle.insertBefore(dragLi, folderEle);
+                    } else {
+                        folderListEle.insertBefore(dragLi, folderEle.nextElementSibling);
+                    }
+                    updateChatIdxs();
+                }
+            }
+        }
+        folderListEle.ondragover = chatListEle.ondragover = (ev) => {
+            ev.preventDefault();
+        }
+        folderListEle.ondragend = chatListEle.ondragend = (ev) => {
+            document.getElementsByClassName("dragingLi")[0].classList.remove("dragingLi");
+            allListEle.querySelectorAll(".dragingChat").forEach(ele => {
+                ele.classList.remove("dragingChat");
+            })
+            dragType = dragIdx = dragLi = void 0;
+        }
+        const chatDragStartEv = function (ev) {
+            ev.stopPropagation();
+            dragLi = this;
+            dragLi.classList.add("dragingLi");
+            dragType = "chat";
+            if (chatListEle.contains(this)) {
+                let idx = Array.prototype.indexOf.call(chatListEle.children, this);
+                dragIdx = chatIdxs[idx];
+            } else if (folderListEle.contains(this)) {
+                let folderIdx = Array.prototype.indexOf.call(folderListEle.children, this.parentElement.parentElement);
+                let inFolderIdx = Array.prototype.indexOf.call(this.parentElement.children, this);
+                dragIdx = folderData[folderIdx].idxs[inFolderIdx];
+            }
+        }
+        const extraFolderActive = (folderIdx) => {
+            let folderNewIdx = -1;
+            for (let i = folderIdx - 1; i >= 0; i--) {
+                if (folderData[i].idxs.length) {
+                    folderNewIdx = i;
+                }
+            }
+            if (folderNewIdx === -1) {
+                for (let i = folderIdx + 1; i < folderData.length; i++) {
+                    if (folderData[i].idxs.length) folderNewIdx = i;
+                }
+            }
+            if (folderNewIdx !== -1) {
+                activeChatIdx = folderData[folderNewIdx].idxs[0];
+            } else if (chatIdxs.length) {
+                activeChatIdx = chatIdxs[0];
+            } else {
+                activeChatIdx = -1;
+            }
+        }
+        const delFolder = (folderIdx, ele) => {
+            if (confirmAction(translations[locale]["delFolderTip"])) {
+                let delData = folderData[folderIdx];
+                let idxs = delData.idxs.sort();
+                ele.parentElement.remove();
+                if (idxs.indexOf(activeChatIdx) !== -1) {
+                    endAll();
+                    extraFolderActive(folderIdx);
+                }
+                folderData.splice(folderIdx, 1);
+                for (let i = idxs.length - 1; i >= 0; i--) {
+                    chatsData.splice(idxs[i], 1);
+                }
+                folderData.forEach(item => {
+                    if (item.idxs.length) {
+                        item.idxs.forEach((i, ix) => {
+                            let len = idxs.filter(j => { return i > j }).length;
+                            if (len) {
+                                item.idxs[ix] = i - len;
+                            }
+                        })
+                    }
+                })
+                chatIdxs.forEach((item, ix) => {
+                    let len = idxs.filter(j => { return item > j }).length;
+                    if (len) chatIdxs[ix] = item - len;
+                })
+                let len = idxs.filter(j => { return activeChatIdx > j }).length;
+                if (len) activeChatIdx -= len;
+                if (activeChatIdx === -1) {
+                    addNewChat();
+                    activeChatIdx = 0;
+                    chatEleAdd(activeChatIdx);
+                }
+                updateChats();
+                activeChat();
+            }
+        }
+        const folderAddChat = (folderIdx, headEle) => {
+            endAll();
+            let chat = { name: translations[locale]["newChatName"], data: [] };
+            chatsData.push(chat);
+            activeChatIdx = chatsData.length - 1;
+            folderData[folderIdx].idxs.push(activeChatIdx);
+            let ele = chatEleAdd(activeChatIdx, false)
+            headEle.nextElementSibling.appendChild(ele);
+            updateFolder(folderIdx);
+            updateChats();
+            activeChat(ele);
+        }
+        const folderEleEvent = function (ev) {
+            ev.preventDefault();
+            ev.stopPropagation();
+            let parent = this.parentElement;
+            let idx = Array.prototype.indexOf.call(folderListEle.children, parent);
+            if (ev.target.className === "headLi") {
+                let isExpanded = parent.classList.toggle("expandFolder");
+                if (folderData[idx].idxs.indexOf(activeChatIdx) !== -1) {
+                    parent.classList.toggle("activeFolder", !isExpanded);
+                }
+            } else if (ev.target.dataset.type === "folderAddChat") {
+                folderAddChat(idx, this);
+            } else if (ev.target.dataset.type === "folderEdit") {
+                toEditName(idx, this, 0);
+            } else if (ev.target.dataset.type === "folderDel") {
+                delFolder(idx, this);
+            }
+        }
+        const folderDragStartEv = function (ev) {
+            dragLi = this;
+            dragLi.classList.add("dragingLi");
+            dragType = "folder";
+            dragIdx = Array.prototype.indexOf.call(folderListEle.children, this);
+        }
+        const folderEleAdd = (idx, push = true) => { // Build the DOM node for folderData[idx]; push=true appends it to the folder list, false inserts it first.
+            let folder = folderData[idx];
+            let folderEle = document.createElement("div");
+            folderEle.className = "folderLi";
+            if (!isMobile) folderEle.setAttribute("draggable", "true"); // desktop: draggable from the start
+            else folderEle.ontouchstart = mobileDragStartEV; // mobile: made draggable only once the long-press timer fires
+            let headEle = document.createElement("div");
+            headEle.className = "headLi";
+            headEle.innerHTML = `<svg width="24" height="24"><use xlink:href="#expandFolderIcon" /></svg>
+                <div class="folderInfo">
+                    <div class="folderName"></div>
+                    <div class="folderNum"></div>
+                </div>
+                <div class="folderOption">
+                    <svg data-type="folderAddChat" width="24" height="24" role="img"><title>${translations[locale]["newChat"]}</title><use xlink:href="#addIcon" /></svg>
+                    <svg data-type="folderEdit" width="24" height="24" role="img"><title>${translations[locale]["edit"]}</title><use xlink:href="#chatEditIcon" /></svg>
+                    <svg data-type="folderDel" width="24" height="24" role="img"><title>${translations[locale]["del"]}</title><use xlink:href="#delIcon" /></svg>
+                </div>`
+            headEle.children[1].children[0].textContent = folder.name; // set via textContent, so the name is not parsed as HTML
+            headEle.children[1].children[1].textContent = folder.idxs.length + translations[locale]["chats"];
+            folderEle.appendChild(headEle);
+            folderEle.ondragstart = folderDragStartEv;
+            headEle.onclick = folderEleEvent; // clicks are handled on the head row; the handler reads the data-type of the clicked icon
+            let chatsEle = document.createElement("div");
+            chatsEle.className = "chatsInFolder";
+            for (let i = 0; i < folder.idxs.length; i++) {
+                chatsEle.appendChild(chatEleAdd(folder.idxs[i], false)); // false: the row is created but not added to the top-level list
+            }
+            folderEle.appendChild(chatsEle);
+            if (push) { folderListEle.appendChild(folderEle) }
+            else { folderListEle.insertBefore(folderEle, folderListEle.firstChild) }
+        }
+        document.getElementById("newFolder").onclick = function () { // "newFolder" button: add an empty folder at the front of the list
+            folderData.unshift({ name: translations[locale]["newFolderName"], idxs: [] });
+            folderEleAdd(0, false); // push = false: the node is inserted before the first folder, matching the unshift above
+            updateChatIdxs(); // persists chatIdxs and folderData
+            folderListEle.parentElement.scrollTop = 0; // scroll the enclosing container back to the top
+        };
+        const initChatEle = (index, chatEle) => {
+            chatEle.children[1].children[0].textContent = chatsData[index].name;
+            let chatPreview = "";
+            if (chatsData[index].data && chatsData[index].data.length) {
+                let first = chatsData[index].data.find(item => { return item.role === "assistant" });
+                if (first) { chatPreview = first.content.slice(0, 30) }
+            }
+            chatEle.children[1].children[1].textContent = chatPreview;
+        };
+        const chatEleAdd = (idx, appendChat = true) => {
+            let chat = chatsData[idx];
+            let chatEle = document.createElement("div");
+            chatEle.className = "chatLi";
+            if (!isMobile) chatEle.setAttribute("draggable", "true");
+            else chatEle.ontouchstart = mobileDragStartEV;
+            chatEle.ondragstart = chatDragStartEv;
+            chatEle.innerHTML = `<svg width="24" height="24"><use xlink:href="#chatIcon" /></svg>
+                <div class="chatInfo">
+                    <div class="chatName"></div>
+                    <div class="chatPre"></div>
+                </div>
+                <div class="chatOption"><svg data-type="chatEdit" width="24" height="24" role="img"><title>${translations[locale]["edit"]}</title><use xlink:href="#chatEditIcon" /></svg>
+                <svg data-type="chatDel" width="24" height="24" role="img"><title>${translations[locale]["del"]}</title><use xlink:href="#delIcon" /></svg></div>`
+            if (appendChat) chatListEle.appendChild(chatEle);
+            initChatEle(idx, chatEle);
+            chatEle.onclick = chatEleEvent;
+            return chatEle;
+        };
+        const addNewChat = () => {
+            let chat = { name: translations[locale]["newChatName"], data: [] };
+            if (presetRoleData.default) chat.data.unshift({ role: "system", content: presetRoleData.default });
+            preEle.selectedIndex = 0;
+            chatsData.push(chat);
+            chatIdxs.push(chatsData.length - 1);
+            updateChats();
+        };
+        const delChat = (idx, ele, folderIdx, inFolderIdx) => { // Delete chat `idx` (row `ele`) after confirmation; folderIdx / inFolderIdx are defined only for a chat inside a folder.
+            if (confirmAction(translations[locale]["delChatTip"])) {
+                if (idx === activeChatIdx) endAll();
+                if (folderIdx !== void 0) { // the chat lives inside a folder
+                    let folder = folderData[folderIdx];
+                    folder.idxs.splice(inFolderIdx, 1);
+                    updateFolder(folderIdx);
+                    if (idx === activeChatIdx) { // next active chat: previous sibling, else first remaining sibling, else look outside the folder
+                        if (inFolderIdx - 1 >= 0) {
+                            activeChatIdx = folder.idxs[inFolderIdx - 1];
+                        } else if (folder.idxs.length) {
+                            activeChatIdx = folder.idxs[0];
+                        } else {
+                            extraFolderActive(folderIdx);
+                        }
+                    }
+                } else { // top-level chat
+                    let chatIdx = chatIdxs.indexOf(idx);
+                    chatIdxs.splice(chatIdx, 1);
+                    if (idx === activeChatIdx) { // same preference among top-level chats; failing that, fall back to a folder
+                        if (chatIdx - 1 >= 0) {
+                            activeChatIdx = chatIdxs[chatIdx - 1];
+                        } else if (chatIdxs.length) {
+                            activeChatIdx = chatIdxs[0];
+                        } else {
+                            let folderNewIdx = -1;
+                            for (let i = folderData.length - 1; i >= 0; i--) { // no break: ends on the lowest-indexed non-empty folder
+                                if (folderData[i].idxs.length) folderNewIdx = i;
+                            }
+                            if (folderNewIdx !== -1) {
+                                activeChatIdx = folderData[folderNewIdx].idxs[0];
+                            } else {
+                                activeChatIdx = -1;
+                            }
+                        }
+                    }
+                }
+                if (activeChatIdx > idx) activeChatIdx--; // indices above the removed chat shift down by one
+                chatsData.splice(idx, 1);
+                ele.remove();
+                folderData.forEach(item => { // apply the same shift to every stored chat index
+                    if (item.idxs.length) {
+                        item.idxs.forEach((i, ix) => {
+                            if (i > idx) item.idxs[ix] = i - 1;
+                        })
+                    }
+                })
+                chatIdxs.forEach((item, ix) => {
+                    if (item > idx) chatIdxs[ix] = item - 1;
+                })
+                if (activeChatIdx === -1) { // no chat left anywhere: create a fresh one and give it a row
+                    addNewChat();
+                    activeChatIdx = 0;
+                    chatEleAdd(activeChatIdx);
+                }
+                updateChats();
+                activeChat();
+            }
+        };
+        const endEditEvent = (ev) => {
+            if (!document.getElementById("activeChatEdit").contains(ev.target)) {
+                endEditChat();
+            }
+        };
+        const preventDrag = (ev) => {
+            ev.preventDefault();
+            ev.stopPropagation();
+        }
+        const endEditChat = () => {
+            if (operateChatIdx !== void 0) {
+                let ele = getChatEle(operateChatIdx);
+                chatsData[operateChatIdx].name = ele.children[1].children[0].textContent = document.getElementById("activeChatEdit").value;
+                ele.lastElementChild.remove();
+            } else if (operateFolderIdx !== void 0) {
+                let ele = folderListEle.children[operateFolderIdx].children[0];
+                folderData[operateFolderIdx].name = ele.children[1].children[0].textContent = document.getElementById("activeChatEdit").value;
+                ele.lastElementChild.remove();
+            }
+            updateChats();
+            operateChatIdx = operateFolderIdx = void 0;
+            document.body.removeEventListener("mousedown", endEditEvent, true);
+        }
+        const toEditName = (idx, ele, type) => {
+            let inputEle = document.createElement("input");
+            inputEle.id = "activeChatEdit";
+            inputEle.setAttribute("draggable", "true");
+            inputEle.ondragstart = preventDrag;
+            ele.appendChild(inputEle);
+            if (type) {
+                inputEle.value = chatsData[idx].name;
+                operateChatIdx = idx;
+            } else {
+                inputEle.value = folderData[idx].name;
+                operateFolderIdx = idx;
+            }
+            inputEle.setSelectionRange(0, 0);
+            inputEle.focus();
+            inputEle.onkeydown = (e) => {
+                if (e.keyCode === 13) {
+                    e.preventDefault();
+                    endEditChat();
+                }
+            };
+            document.body.addEventListener("mousedown", endEditEvent, true);
+            return inputEle;
+        };
+        const chatEleEvent = function (ev) {
+            ev.preventDefault();
+            ev.stopPropagation();
+            let idx, folderIdx, inFolderIdx;
+            if (chatListEle.contains(this)) {
+                idx = Array.prototype.indexOf.call(chatListEle.children, this);
+                idx = chatIdxs[idx];
+            } else if (folderListEle.contains(this)) {
+                folderIdx = Array.prototype.indexOf.call(folderListEle.children, this.parentElement.parentElement);
+                inFolderIdx = Array.prototype.indexOf.call(this.parentElement.children, this);
+                idx = folderData[folderIdx].idxs[inFolderIdx];
+            }
+            if (ev.target.classList.contains("chatLi")) {
+                if (searchChatEle.value || activeChatIdx !== idx) {
+                    endAll();
+                    activeChatIdx = idx;
+                    activeChat(this);
+                }
+                if (window.innerWidth <= 800) {
+                    document.body.classList.remove("show-nav");
+                }
+            } else if (ev.target.dataset.type === "chatEdit") {
+                toEditName(idx, this, 1);
+            } else if (ev.target.dataset.type === "chatDel") {
+                delChat(idx, this, folderIdx, inFolderIdx);
+            }
+        };
+        const updateChats = () => { // Persist everything: the chats themselves, then the list ordering and folders.
+            localStorage.setItem("chats", JSON.stringify(chatsData)); // always written as plain JSON text
+            updateChatIdxs();
+        };
+        const updateChatIdxs = () => { // Persist only the layout: top-level chat order and the folder records.
+            localStorage.setItem("chatIdxs", JSON.stringify(chatIdxs));
+            localStorage.setItem("folders", JSON.stringify(folderData));
+        }
+        const createConvEle = (className, append = true, model) => {
+            let div = document.createElement("div");
+            div.className = className;
+            formatMdEle(div, model);
+            if (append) chatlog.appendChild(div);
+            return div;
+        }
+        const getChatEle = (idx) => {
+            let chatIdx = chatIdxs.indexOf(idx);
+            if (chatIdx !== -1) {
+                return chatListEle.children[chatIdx];
+            } else {
+                let inFolderIdx;
+                let folderIdx = folderData.findIndex(item => {
+                    inFolderIdx = item.idxs.indexOf(idx);
+                    return inFolderIdx !== -1;
+                })
+                if (folderIdx !== -1) {
+                    return folderListEle.children[folderIdx].children[1].children[inFolderIdx];
+                }
+            }
+        }
+        const activeChat = (ele) => { // Make chatsData[activeChatIdx] the current chat: selection markers, system role, rendered log. `ele` is its row (looked up when omitted).
+            data = chatsData[activeChatIdx]["data"]; // `data` is declared outside this function
+            allListEle.querySelectorAll(".activeChatLi").forEach(ele => {
+                ele.classList.remove("activeChatLi");
+            })
+            allListEle.querySelectorAll(".activeFolder").forEach(ele => {
+                ele.classList.remove("activeFolder")
+            })
+            if (!ele) ele = getChatEle(activeChatIdx);
+            ele.classList.add("activeChatLi");
+            activeChatEle = ele;
+            if (chatIdxs.indexOf(activeChatIdx) === -1) { // not a top-level chat, so the row sits inside a folder
+                if (!ele.parentElement.parentElement.classList.contains("expandFolder")) { // flag the folder only while it is collapsed
+                    ele.parentElement.parentElement.classList.add("activeFolder");
+                }
+            }
+            if (data[0] && data[0].role === "system") { // a leading system message is shown in systemEle instead of the log
+                systemRole = data[0].content;
+                systemEle.value = systemRole;
+            } else {
+                systemRole = void 0;
+                systemEle.value = "";
+            }
+            chatlog.innerHTML = "";
+            if (systemRole ? data.length - 1 : data.length) { // any message besides the system one?
+                let firstIdx = systemRole ? 1 : 0;
+                for (let i = firstIdx; i < data.length; i++) {
+                    if (data[i].role === "user") {
+                        createConvEle("request").children[1].textContent = data[i].content; // user text is inserted verbatim via textContent
+                    } else {
+                        createConvEle("response", true, data[i].model).children[1].innerHTML = md.render(data[i].content) || "<br />"; // replies go through md.render
+                    }
+                }
+            }
+            let top = ele.offsetTop + ele.offsetHeight - allListEle.clientHeight; // smallest scrollTop at which the row's bottom edge is inside the list viewport
+            if (allListEle.scrollTop < top) allListEle.scrollTop = top;
+            localStorage.setItem("activeChatIdx", activeChatIdx);
+            if (searchIdxs[activeChatIdx] !== void 0) { // a search position is recorded for this chat: scroll the log to it
+                let dataIdx = searchIdxs[activeChatIdx];
+                if (dataIdx !== -1) {
+                    let currChatEle = chatlog.children[systemRole ? dataIdx - 1 : dataIdx]; // log rows exclude the system message, hence the -1
+                    let childs = currChatEle.children[1].getElementsByTagName("*");
+                    if (childs.length) {
+                        for (let i = childs.length - 1; i >= 0; i--) { // scan descendants from last to first for one whose text contains the search string
+                            if (childs[i].textContent && childs[i].textContent.indexOf(searchChatEle.value) !== -1) {
+                                let offTop = findOffsetTop(childs[i], messagesEle);
+                                messagesEle.scrollTop = offTop + childs[i].offsetHeight - messagesEle.clientHeight * 0.15;
+                                break;
+                            }
+                        }
+                    } else messagesEle.scrollTop = currChatEle.offsetTop; // body has no child elements: scroll to the row itself
+                } else messagesEle.scrollTop = 0; // -1: scroll the log to the top
+            }
+        };
+        newChatEle.onclick = () => {
+            endAll();
+            addNewChat();
+            activeChatIdx = chatsData.length - 1;
+            chatEleAdd(activeChatIdx);
+            activeChat(chatListEle.lastElementChild);
+        };
+        const initChats = () => {
+            let localChats = localStorage.getItem("chats");
+            let localFolders = localStorage.getItem("folders");
+            let localChatIdxs = localStorage.getItem("chatIdxs")
+            let localChatIdx = localStorage.getItem("activeChatIdx");
+            activeChatIdx = (localChatIdx && parseInt(localChatIdx)) || 0;
+            if (localChats) {
+                if (isCompressedChats) localChats = new TextDecoder().decode(inflateSync(stringToUint(localChats)));
+                chatsData = JSON.parse(localChats);
+                let folderIdxs = [];
+                if (localFolders) {
+                    folderData = JSON.parse(localFolders);
+                    for (let i = 0; i < folderData.length; i++) {
+                        folderEleAdd(i);
+                        folderIdxs.push(...folderData[i].idxs);
+                    }
+                }
+                if (localChatIdxs) {
+                    chatIdxs = JSON.parse(localChatIdxs);
+                    for (let i = 0; i < chatIdxs.length; i++) {
+                        chatEleAdd(chatIdxs[i]);
+                    }
+                } else {
+                    for (let i = 0; i < chatsData.length; i++) {
+                        if (folderIdxs.indexOf(i) === -1) {
+                            chatIdxs.push(i);
+                            chatEleAdd(i);
+                        }
+                    }
+                    updateChatIdxs();
+                }
+            } else {
+                addNewChat();
+                chatEleAdd(activeChatIdx);
+            }
+        };
+        const initExpanded = () => {
+            let folderIdx = folderData.findIndex(item => {
+                return item.idxs.indexOf(activeChatIdx) !== -1;
+            })
+            if (folderIdx !== -1) {
+                folderListEle.children[folderIdx].classList.add("expandFolder");
+            }
+        }
+        initChats(); // restore chats and folders from localStorage, or create a first chat
+        initExpanded(); // add "expandFolder" to the folder holding the active chat, if any
+        activeChat(); // no element passed: activeChat looks it up via getChatEle(activeChatIdx)
+        document.getElementById("clearSearch").onclick = () => { // empties the search box and refocuses it
+            searchChatEle.value = "";
+            searchChatEle.dispatchEvent(new Event("input")); // runs searchChatEle.oninput with the now-empty query
+            searchChatEle.focus();
+        }
+        const toSearchChats = () => {
+            searchIdxs.length = 0;
+            for (let i = 0; i < chatsData.length; i++) {
+                let chatEle = getChatEle(i);
+                chatEle.style.display = null;
+                let flags = isCaseSearch ? "" : "i";
+                let pattern = escapeRegexExp(searchChatEle.value);
+                let regex = new RegExp(pattern, flags);
+                let nameData = chatsData[i].name.match(regex);
+                let nameIdx = nameData ? nameData.index : -1;
+                let matchContent;
+                let dataIdx = chatsData[i].data.findIndex(item => {
+                    return item.role !== "system" && (matchContent = item.content.match(regex))
+                })
+                if (nameIdx !== -1 || dataIdx !== -1) {
+                    let ele = chatEle.children[1];
+                    if (dataIdx !== -1) {
+                        let data = chatsData[i].data[dataIdx];
+                        let idx = matchContent.index;
+                        let endIdx = idx + matchContent[0].length;
+                        ele.children[1].textContent = (idx > 8 ? "..." : "") + data.content.slice(idx > 8 ? idx - 5 : 0, idx);
+                        ele.children[1].appendChild(document.createElement("span"));
+                        ele.children[1].lastChild.textContent = data.content.slice(idx, endIdx);
+                        ele.children[1].appendChild(document.createTextNode(data.content.slice(endIdx)))
+                    } else {
+                        initChatEle(i, chatEle);
+                    }
+                    if (nameIdx !== -1) {
+                        let endIdx = nameIdx + nameData[0].length;
+                        ele.children[0].textContent = (nameIdx > 5 ? "..." : "") + chatsData[i].name.slice(nameIdx > 5 ? nameIdx - 3 : 0, nameIdx);
+                        ele.children[0].appendChild(document.createElement("span"));
+                        ele.children[0].lastChild.textContent = chatsData[i].name.slice(nameIdx, endIdx);
+                        ele.children[0].appendChild(document.createTextNode(chatsData[i].name.slice(endIdx)))
+                    } else {
+                        ele.children[0].textContent = chatsData[i].name;
+                    }
+                    searchIdxs[i] = dataIdx;
+                } else {
+                    chatEle.style.display = "none";
+                    initChatEle(i, chatEle);
+                }
+            }
+            for (let i = 0; i < folderListEle.children.length; i++) {
+                let folderChatEle = folderListEle.children[i].children[1];
+                if (!folderChatEle.children.length || Array.prototype.filter.call(folderChatEle.children, (ele) => {
+                    return ele.style.display !== "none"
+                }).length === 0) {
+                    folderListEle.children[i].style.display = "none";
+                }
+            }
+        }
+        searchChatEle.oninput = (ev) => {
+            if (searchChatEle.value.length) {
+                toSearchChats();
+            } else {
+                searchIdxs.length = 0;
+                for (let i = 0; i < chatsData.length; i++) {
+                    let chatEle = getChatEle(i);
+                    chatEle.style.display = null;
+                    initChatEle(i, chatEle);
+                }
+                for (let i = 0; i < folderListEle.children.length; i++) {
+                    folderListEle.children[i].style.display = null;
+                }
+            }
+        };
+        document.getElementById("resetHotKey").onclick = () => { // restores the default hotkeys
+            localStorage.removeItem("hotKeys"); // drop the saved bindings...
+            initHotKey(); // ...so initHotKey falls back to defKeyVal for every entry
+            notyf.success(translations[locale]["resetSetSuccTip"]);
+        };
+        const blobToText = (blob) => {
+            return new Promise((res, rej) => {
+                let reader = new FileReader();
+                reader.readAsText(blob);
+                reader.onload = () => {
+                    res(reader.result);
+                }
+                reader.onerror = (error) => {
+                    rej(error);
+                }
+            })
+        };
+        document.getElementById("exportChat").onclick = () => {
+            if (loading) stopLoading();
+            let data = {
+                chatsData: chatsData,
+                folderData: folderData,
+                chatIdxs: chatIdxs
+            }
+            let blob = new Blob([JSON.stringify(data, null, 2)], { type: "application/json" });
+            let date = new Date();
+            let fileName = "chats-" + date.getFullYear() + "-" + (date.getMonth() + 1) + "-" + date.getDate() + ".json";
+            downBlob(blob, fileName);
+            notyf.success(translations[locale]["exportSuccTip"]);
+        };
+        document.getElementById("importChatInput").onchange = function () {
+            let file = this.files[0];
+            blobToText(file).then(text => {
+                try {
+                    let json = JSON.parse(text);
+                    let checked = json.chatsData && json.folderData && json.chatIdxs && json.chatsData.every(item => {
+                        return item.name !== void 0 && item.data !== void 0;
+                    });
+                    if (checked) {
+                        let preFolder = folderData.length;
+                        let preLen = chatsData.length;
+                        if (json.chatsData) {
+                            chatsData = chatsData.concat(json.chatsData);
+                        }
+                        if (json.folderData) {
+                            for (let i = 0; i < json.folderData.length; i++) {
+                                json.folderData[i].idxs = json.folderData[i].idxs.map(item => {
+                                    return item + preLen;
+                                })
+                                folderData.push(json.folderData[i]);
+                                folderEleAdd(i + preFolder);
+                            }
+                        }
+                        if (json.chatIdxs) {
+                            for (let i = 0; i < json.chatIdxs.length; i++) {
+                                let newIdx = json.chatIdxs[i] + preLen;
+                                chatIdxs.push(newIdx)
+                                chatEleAdd(newIdx);
+                            }
+                        }
+                        updateChats();
+                        checkStorage();
+                        notyf.success(translations[locale]["importSuccTip"]);
+                    } else {
+                        throw new Error("fmt error");
+                    }
+                } catch (e) {
+                    notyf.error(translations[locale]["importFailTip"]);
+                }
+                this.value = "";
+            })
+        };
+        clearChatSet.onclick = clearChat.onclick = () => {
+            if (confirmAction(translations[locale]["clearAllTip"])) {
+                chatsData.length = 0;
+                chatIdxs.length = 0;
+                folderData.length = 0;
+                folderListEle.innerHTML = "";
+                chatListEle.innerHTML = "";
+                endAll();
+                addNewChat();
+                activeChatIdx = 0;
+                chatEleAdd(activeChatIdx);
+                localStorage.removeItem("compressedChats");
+                isCompressedChats = false;
+                updateChats();
+                checkStorage();
+                activeChat(chatListEle.firstElementChild);
+                notyf.success(translations[locale]["clearChatSuccTip"]);
+            }
+        };
+        let localSetKeys = ['modelVersion', 'APISelect', 'GeminiAPISelect', 'ClaudeAPISelect', 'APIHost', 'GeminiAPIHost', 'ClaudeAPIHost', 'APIKey', 'GeminiAPIKey', 'ClaudeAPIKey', 'APIModel', 'GeminiAPIModel', 'ClaudeAPIModel', 'hotKeys', 'userAvatar', 'system', 'temp', 'top_p', 'convWidth0', 'convWidth1', 'textSpeed', 'contLen', 'enableLongReply', 'existVoice', 'voiceTestText', 'azureRegion', 'azureKey', 'enableContVoice', 'enableAutoVoice', 'existRec', 'azureRecRegion', 'azureRecKey', 'voiceRecLang', 'autoVoiceSendWord', 'autoVoiceStopWord', 'autoVoiceSendOut', 'keepListenMic', 'fullWindow', 'themeMode', 'autoThemeMode', 'customDarkTime', 'UILang', 'pinNav', 'voice0', 'voicePitch0', 'voiceVolume0', 'voiceRate0', 'azureRole0', 'azureStyle0', 'voice1', 'voicePitch1', 'voiceVolume1', 'voiceRate1', 'azureRole1', 'azureStyle1', 'searchFlag']; // localStorage keys that the settings export, import and reset handlers operate on
+        document.getElementById("exportSet").onclick = () => {
+            let data = {}
+            for (let i = 0; i < localSetKeys.length; i++) {
+                let key = localSetKeys[i];
+                let val = localStorage.getItem(key);
+                if (val != void 0) data[key] = val;
+            }
+            let blob = new Blob([JSON.stringify(data, null, 2)], { type: "application/json" });
+            let date = new Date();
+            let fileName = "settings-" + date.getFullYear() + "-" + (date.getMonth() + 1) + "-" + date.getDate() + ".json";
+            downBlob(blob, fileName);
+            notyf.success(translations[locale]["exportSuccTip"]);
+        };
+        document.getElementById("importSetInput").onchange = function () {
+            let file = this.files[0];
+            blobToText(file).then(text => {
+                try {
+                    let json = JSON.parse(text);
+                    let keys = Object.keys(json);
+                    for (let i = 0; i < localSetKeys.length; i++) {
+                        let key = localSetKeys[i];
+                        let val = json[key];
+                        if (val !== void 0) localStorage.setItem(key, val);
+                        else localStorage.removeItem(key);
+                    }
+                    initSetting();
+                    initVoiceVal();
+                    speechServiceEle.dispatchEvent(new Event("change"));
+                    initRecSetting();
+                    initHotKey();
+                    initLang();
+                    checkStorage();
+                    notyf.success(translations[locale]["importSuccTip"]);
+                } catch (e) {
+                    notyf.error(translations[locale]["importFailTip"]);
+                }
+                this.value = "";
+            })
+        };
+        document.getElementById("resetSet").onclick = () => {
+            if (confirmAction(translations[locale]["resetSetTip"])) {
+                endAll();
+                if (existVoice === 3) localStorage.removeItem(azureRegion + "VoiceData");
+                if (existRec === 2) localStorage.removeItem(azureRecRegion + "RecData");
+                let data = {};
+                for (let i = 0; i < localSetKeys.length; i++) {
+                    let key = localSetKeys[i];
+                    let val = localStorage.removeItem(key);
+                }
+                initSetting();
+                initVoiceVal();
+                speechServiceEle.dispatchEvent(new Event("change"));
+                initRecSetting();
+                initHotKey();
+                initLang();
+                checkStorage();
+                notyf.success(translations[locale]["resetSetSuccTip"]);
+            }
+        }
+        const endAll = () => {
+            endSpeak();
+            if (editingIdx !== void 0) resumeSend();
+            if (loading) stopLoading();
+        };
+        const processIdx = (plus) => {
+            if (currentVoiceIdx !== void 0) currentVoiceIdx += plus;
+            if (editingIdx !== void 0) editingIdx += plus;
+        }
+        const hotKeyVals = {}; // current letter per hotkey name; filled by initHotKey, read by the keydown handlers
+        const ctrlHotKeyEv = (ev) => {
+            if (ev.ctrlKey || ev.metaKey) {
+                switch (ev.key.toLowerCase()) {
+                    case hotKeyVals["Nav"]:
+                        ev.preventDefault();
+                        toggleNavEv();
+                        return false;
+                    case hotKeyVals["Search"]:
+                        ev.preventDefault();
+                        searchChatEle.focus();
+                        return false;
+                    case hotKeyVals["Input"]:
+                        ev.preventDefault();
+                        inputAreaEle.focus();
+                        return false;
+                    case hotKeyVals["NewChat"]:
+                        ev.preventDefault();
+                        newChatEle.dispatchEvent(new MouseEvent("click"));
+                        return false;
+                    case hotKeyVals["ClearChat"]:
+                        ev.preventDefault();
+                        clearEle.dispatchEvent(new MouseEvent("click"));
+                        return false;
+                    case hotKeyVals["VoiceRec"]:
+                        if (supportRec) {
+                            ev.preventDefault();
+                            toggleRecEv();
+                        }
+                        return false;
+                    case hotKeyVals["VoiceSpeak"]:
+                        ev.preventDefault();
+                        speechEvent(systemRole ? 1 : 0);
+                        return false;
+                }
+            }
+        }
+        const ctrlAltHotKeyEv = (ev) => {
+            if ((ev.ctrlKey || ev.metaKey) && ev.altKey) {
+                switch (ev.key.toLowerCase()) {
+                    case hotKeyVals["Window"]:
+                        ev.preventDefault();
+                        toggleFull.dispatchEvent(new Event("click"));
+                        return false;
+                    case hotKeyVals["Theme"]:
+                        ev.preventDefault();
+                        lightEle.dispatchEvent(new Event("click"));
+                        return false;
+                    case hotKeyVals["Lang"]:
+                        ev.preventDefault();
+                        let idx = localeList.indexOf(locale) + 1;
+                        if (idx === localeList.length) idx = 0;
+                        locale = localeList[idx];
+                        setLang();
+                        changeLocale();
+                        return false;
+                }
+            }
+        }
+        const listKey = ['Nav', 'Search', 'Input', 'NewChat', 'ClearChat', 'VoiceRec', 'VoiceSpeak', 'Window', 'Theme', 'Lang']; // hotkey names: keys of hotKeyVals and suffixes of the "hotKey<Name>" elements
+        const ctrlKeyIdx = 7; // listKey entries before this index are labelled Ctrl/⌘ + key, the rest Ctrl/⌘ + Alt + key
+        const defKeyVal = ['b', 'k', 'i', 'e', 'r', 'q', 's', 'u', 't', 'l']; // default letter for each listKey entry, by position
+        const initHotKey = () => {
+            let localKeysObj = {};
+            let localKeys = localStorage.getItem("hotKeys");
+            if (localKeys) {
+                try {
+                    localKeysObj = JSON.parse(localKeys);
+                } catch (e) { }
+            }
+            let pre1 = isApple ? "⌘ + " : "Ctrl + ";
+            let pre2 = isApple ? "⌘ + ⌥ + " : "Ctrl + Alt + ";
+            for (let i = 0; i < listKey.length; i++) {
+                let key = listKey[i];
+                if (key === "VoiceRec" && !supportRec) continue;
+                let ele = window["hotKey" + key];
+                for (let j = 0; j < 26; j++) {
+                    // top-level hotkey, can't overwrite
+                    if (i < ctrlKeyIdx && (j === 13 || j === 19 || j === 22)) continue;
+                    let val = String.fromCharCode(j + 97);
+                    ele.options.add(new Option((i < ctrlKeyIdx ? pre1 : pre2) + val.toUpperCase(), val));
+                }
+                hotKeyVals[key] = ele.value = localKeysObj[key] || defKeyVal[i];
+                ele.onchange = () => {
+                    if (hotKeyVals[key] === ele.value) return;
+                    let exist = listKey.find((item, idx) => {
+                        return (i < ctrlKeyIdx ? idx < ctrlKeyIdx : idx >= ctrlKeyIdx) && hotKeyVals[item] === ele.value;
+                    })
+                    if (exist) {
+                        ele.value = hotKeyVals[key];
+                        notyf.error(translations[locale]["hotkeyConflict"])
+                        return;
+                    }
+                    hotKeyVals[key] = ele.value;
+                    localStorage.setItem("hotKeys", JSON.stringify(hotKeyVals));
+                }
+            }
+        };
+        initHotKey(); // fill hotKeyVals and the hotkey selects
+        document.addEventListener("keydown", ctrlHotKeyEv); // Ctrl/⌘ + key group
+        document.addEventListener("keydown", ctrlAltHotKeyEv); // Ctrl/⌘ + Alt + key group
+        const initSetting = () => {
+            const modelsEle = Array.from(modelSetEle.children);
+            let localModelName = localStorage.getItem("modelVersion");
+            let isVailModel = modelsEle.some(item => item.dataset.value === localModelName)
+            modelVersion = isVailModel ? localModelName : "gpt-3.5-turbo";
+            const applyModelVersion = () => {
+                let activedEle = modelSetEle.querySelector(".activeModel");
+                if (activedEle) activedEle.classList.remove("activeModel");
+                activedEle = modelSetEle.querySelector(`[data-value="${modelVersion}"]`);
+                activedEle.classList.add("activeModel");
+                //modelVer.textContent = activedEle.dataset.ver;
+                if (modelVersion.startsWith("gpt")) {
+                    modelType = 1;
+                    modelName.textContent = "ChatGPT";
+                } else if (modelVersion.startsWith("gemini")) {
+                    modelType = 2;
+                    modelName.textContent = "Gemini";
+                } else {
+                    modelType = 3;
+                    modelName.textContent = "Claude";
+                }
+                modelName.textContent = "llama.cpp";
+            };
+            applyModelVersion();
+            modelSetEle.onclick = (ev) => {
+                if (ev.target.classList.contains("modelSingle")) {
+                    modelVersion = ev.target.dataset.value;
+                    localStorage.setItem("modelVersion", modelVersion);
+                    applyModelVersion();
+                    modelSetEle.style.display = "none";
+                    selectorEle.classList.remove("showModels");
+                }
+            }
+            const apiHostEle = document.getElementById("apiHostInput");
+            const geminiHostEle = document.getElementById("geminiApiHostInput");
+            const claudeHostEle = document.getElementById("claudeApiHostInput");
+            const apiSelectEle = document.getElementById("apiSelect");
+            let localApiSelect = localStorage.getItem("APISelect");
+            if (localApiSelect) {
+                try {
+                    apiSelects = JSON.parse(localApiSelect);
+                } catch (e) {
+                    apiSelects.length = 0;
+                }
+            } else {
+                apiSelects.length = 0;
+            }
+            let localGeminiApiSelect = localStorage.getItem("GeminiAPISelect");
+            if (localGeminiApiSelect) {
+                try {
+                    geminiApiSelects = JSON.parse(localGeminiApiSelect);
+                } catch (e) {
+                    geminiApiSelects.length = 0;
+                }
+            } else {
+                geminiApiSelects.length = 0;
+            }
+            let localClaudeApiSelect = localStorage.getItem("ClaudeAPISelect");
+            if (localClaudeApiSelect) {
+                try {
+                    claudeApiSelects = JSON.parse(localClaudeApiSelect);
+                } catch (e) {
+                    claudeApiSelects.length = 0;
+                }
+            } else {
+                claudeApiSelects.length = 0;
+            }
+            let selApiSelects = apiSelects;
+            let selApiKey = "APISelect";
+            let selApiEle = apiHostEle;
+            const delApiOption = function (ev) {
+                ev.preventDefault();
+                ev.stopPropagation();
+                let index = Array.prototype.indexOf.call(apiSelectEle.children, this.parentElement);
+                selApiSelects.splice(index, 1);
+                this.parentElement.remove();
+                localStorage.setItem(selApiKey, JSON.stringify(selApiSelects));
+                if (!selApiSelects.includes(selApiEle.value)) {
+                    selApiEle.value = selApiSelects[0] || "";
+                    selApiEle.dispatchEvent(new Event("change"));
+                }
+                if (!selApiSelects.length) apiSelectEle.style.display = "none";
+            }
+            const appendApiOption = () => {
+                apiSelects.push(apiHost);
+                initApiOption(apiHost);
+                localStorage.setItem("APISelect", JSON.stringify(apiSelects));
+            }
+            const appendGeminiApiOption = () => {
+                geminiApiSelects.push(geminiApiHost);
+                initApiOption(geminiApiHost);
+                localStorage.setItem("GeminiAPISelect", JSON.stringify(geminiApiSelects));
+            }
+            const appendClaudeApiOption = () => {
+                claudeApiSelects.push(claudeApiHost);
+                initApiOption(claudeApiHost);
+                localStorage.setItem("ClaudeAPISelect", JSON.stringify(claudeApiSelects));
+            }
+            const selApiOption = function (ev) {
+                ev.preventDefault();
+                ev.stopPropagation();
+                apiSelectEle.style.display = "none";
+                let index = Array.prototype.indexOf.call(apiSelectEle.children, this);
+                selApiEle.value = selApiSelects[index];
+                selApiEle.dispatchEvent(new Event("change"));
+            }
+            const initApiOption = (api) => {
+                let optionEle = document.createElement("div");
+                optionEle.onclick = selApiOption;
+                let textEle = document.createElement("span");
+                textEle.textContent = api;
+                optionEle.appendChild(textEle);
+                let delEle = document.createElement("div");
+                delEle.className = "delApiOption";
+                delEle.onmousedown = delApiOption;
+                delEle.innerHTML = `<svg width="24" height="24"><use xlink:href="#closeIcon" /></svg>`;
+                optionEle.appendChild(delEle);
+                apiSelectEle.appendChild(optionEle);
+            }
+            const initApiSelectEle = () => {
+                apiSelectEle.innerHTML = "";
+                for (let i = 0; i < selApiSelects.length; i++) {
+                    initApiOption(selApiSelects[i]);
+                }
+            }
+            apiHostEle.onfocus = geminiHostEle.onfocus = claudeHostEle.onfocus = () => {
+                let type = document.querySelector(".modelSwitch").querySelector(".activeSwitch").dataset.id;
+                if (type === "gptOption") {
+                    selApiSelects = apiSelects;
+                    selApiKey = "APISelect"
+                    selApiEle = apiHostEle;
+                } else if (type === "geminiOption") {
+                    selApiSelects = geminiApiSelects;
+                    selApiKey = "GeminiAPISelect";
+                    selApiEle = geminiHostEle;
+                } else {
+                    selApiSelects = claudeApiSelects;
+                    selApiKey = "ClaudeAPISelect";
+                    selApiEle = claudeHostEle;
+                }
+                initApiSelectEle();
+                if (selApiSelects.length) apiSelectEle.style.display = "block";
+            }
+            apiHostEle.onblur = geminiHostEle.onblur = claudeHostEle.onblur = (ev) => {
+                if (!(ev.relatedTarget && apiSelectEle.contains(ev.relatedTarget))) apiSelectEle.style.display = "none";
+            }
+            let localApiHost = localStorage.getItem("APIHost");
+            apiHost = apiHostEle.value = envAPIEndpoint || localApiHost || apiHostEle.getAttribute("value") || "";
+            apiHostEle.onchange = () => {
+                apiHost = apiHostEle.value;
+                if (apiHost && apiSelects.indexOf(apiHost) === -1) appendApiOption();
+                localStorage.setItem("APIHost", apiHost);
+            }
+            apiHostEle.dispatchEvent(new Event("change"));
+            const keyEle = document.getElementById("keyInput");
+            let localKey = localStorage.getItem("APIKey");
+            customAPIKey = keyEle.value = envAPIKey || localKey || keyEle.getAttribute("value") || "";
+            keyEle.onchange = () => {
+                customAPIKey = keyEle.value;
+                localStorage.setItem("APIKey", customAPIKey);
+            }
+            keyEle.dispatchEvent(new Event("change"));
+            const modelEle = document.getElementById("modelInput");
+            let localModel = localStorage.getItem("APIModel");
+            customAPIModel = modelEle.value = envAPIModel || localModel || modelEle.getAttribute("value") || "";
+            modelEle.onchange = () => {
+                customAPIModel = modelEle.value;
+                localStorage.setItem("APIModel", customAPIModel);
+            }
+            modelEle.dispatchEvent(new Event("change"));
+
+            let localGeminiApiHost = localStorage.getItem("GeminiAPIHost");
+            geminiApiHost = geminiHostEle.value = envGeminiAPIEndpoint || localGeminiApiHost || geminiHostEle.getAttribute("value") || "";
+            geminiHostEle.onchange = () => {
+                geminiApiHost = geminiHostEle.value;
+                if (geminiApiHost && geminiApiSelects.indexOf(geminiApiHost) === -1) appendGeminiApiOption();
+                localStorage.setItem("GeminiAPIHost", geminiApiHost);
+            }
+            geminiHostEle.dispatchEvent(new Event("change"));
+            const geminiKeyEle = document.getElementById("geminiKeyInput");
+            let localGeminiKey = localStorage.getItem("GeminiAPIKey");
+            geminiAPIKey = geminiKeyEle.value = envGeminiAPIKey || localGeminiKey || geminiKeyEle.getAttribute("value") || "";
+            geminiKeyEle.onchange = () => {
+                geminiAPIKey = geminiKeyEle.value;
+                localStorage.setItem("GeminiAPIKey", geminiAPIKey);
+            }
+            geminiKeyEle.dispatchEvent(new Event("change"));
+            const geminiModelEle = document.getElementById("geminiModelInput");
+            let localGeminiModel = localStorage.getItem("GeminiAPIModel");
+            geminiAPIModel = geminiModelEle.value = envGeminiAPIModel || localGeminiModel || geminiModelEle.getAttribute("value") || "";
+            geminiModelEle.onchange = () => {
+                geminiAPIModel = geminiModelEle.value;
+                localStorage.setItem("GeminiAPIModel", geminiAPIModel);
+            }
+            geminiModelEle.dispatchEvent(new Event("change"));
+
+            let localClaudeApiHost = localStorage.getItem("ClaudeAPIHost");
+            claudeApiHost = claudeHostEle.value = envClaudeAPIEndpoint || localClaudeApiHost || claudeHostEle.getAttribute("value") || "";
+            claudeHostEle.onchange = () => {
+                claudeApiHost = claudeHostEle.value;
+                if (claudeApiHost && claudeApiSelects.indexOf(claudeApiHost) === -1) appendClaudeApiOption();
+                localStorage.setItem("ClaudeAPIHost", claudeApiHost);
+            }
+            claudeHostEle.dispatchEvent(new Event("change"));
+            const claudeKeyEle = document.getElementById("claudeKeyInput");
+            let localClaudeKey = localStorage.getItem("ClaudeAPIKey");
+            claudeAPIKey = claudeKeyEle.value = envClaudeAPIKey || localClaudeKey || claudeKeyEle.getAttribute("value") || "";
+            claudeKeyEle.onchange = () => {
+                claudeAPIKey = claudeKeyEle.value;
+                localStorage.setItem("ClaudeAPIKey", claudeAPIKey);
+            }
+            claudeKeyEle.dispatchEvent(new Event("change"));
+            const claudeModelEle = document.getElementById("claudeModelInput");
+            let localClaudeModel = localStorage.getItem("ClaudeAPIModel");
+            claudeAPIModel = claudeModelEle.value = envClaudeAPIModel || localClaudeModel || claudeModelEle.getAttribute("value") || "";
+            claudeModelEle.onchange = () => {
+                claudeAPIModel = claudeModelEle.value;
+                localStorage.setItem("ClaudeAPIModel", claudeAPIModel);
+            }
+            claudeModelEle.dispatchEvent(new Event("change"));
+
+            const updateAvatar = () => {
+                setAvatarPre.src = userAvatar;
+                chatlog.querySelectorAll(".request>.chatAvatar").forEach(ele => {
+                    ele.children[0].src = userAvatar;
+                })
+            }
+            let localAvatar = localStorage.getItem("userAvatar");
+            userAvatar = setAvatarPre.src = setAvatar.value = localAvatar || setAvatar.getAttribute("value") || "avatar.jpg";
+            setAvatar.onchange = () => {
+                userAvatar = setAvatar.value;
+                localStorage.setItem("userAvatar", userAvatar);
+                updateAvatar();
+            }
+            setAvatar.dispatchEvent(new Event("change"));
+            let localSystem = localStorage.getItem("system");
+            systemEle.onchange = () => {
+                systemRole = systemEle.value;
+                localStorage.setItem("system", systemRole);
+                if (systemRole) {
+                    if (data[0] && data[0].role === "system") {
+                        data[0].content = systemRole;
+                    } else {
+                        data.unshift({ role: "system", content: systemRole });
+                        processIdx(1);
+                    }
+                } else if (data[0] && data[0].role === "system") {
+                    data.shift();
+                    processIdx(-1);
+                }
+                updateChats();
+            }
+            if (systemRole === void 0) {
+                systemRole = systemEle.value = localSystem || presetRoleData.default || "";
+                if (systemRole) {
+                    data.unshift({ role: "system", content: systemRole });
+                    processIdx(1);
+                    updateChats();
+                }
+            }
+            preEle.onchange = () => {
+                let val = preEle.value;
+                if (val && presetRoleData[val]) {
+                    systemEle.value = presetRoleData[val];
+                } else {
+                    systemEle.value = "";
+                }
+                systemEle.dispatchEvent(new Event("change"));
+                systemEle.focus();
+            }
+            const topEle = document.getElementById("top_p");
+            let localTop = localStorage.getItem("top_p");
+            topEle.value = roleNature = parseFloat(localTop || topEle.getAttribute("value"));
+            topEle.oninput = () => {
+                topEle.style.backgroundSize = (topEle.value - topEle.min) * 100 / (topEle.max - topEle.min) + "% 100%";
+                roleNature = parseFloat(topEle.value);
+                localStorage.setItem("top_p", topEle.value);
+            }
+            topEle.dispatchEvent(new Event("input"));
+            const tempEle = document.getElementById("temp");
+            let localTemp = localStorage.getItem("temp");
+            tempEle.value = roleTemp = parseFloat(localTemp || tempEle.getAttribute("value"));
+            tempEle.oninput = () => {
+                tempEle.style.backgroundSize = (tempEle.value - tempEle.min) * 100 / (tempEle.max - tempEle.min) + "% 100%";
+                roleTemp = parseFloat(tempEle.value);
+                localStorage.setItem("temp", tempEle.value);
+            }
+            tempEle.dispatchEvent(new Event("input"));
+            const convWEle = document.getElementById("convWidth");
+            const styleSheet = document.styleSheets[0];
+            convWEle.oninput = () => {
+                let type = isFull ? 1 : 0;
+                convWEle.style.backgroundSize = (convWEle.value - convWEle.min) * 100 / (convWEle.max - convWEle.min) + "% 100%";
+                convWidth[type] = parseInt(convWEle.value);
+                localStorage.setItem("convWidth" + type, convWEle.value);
+                styleSheet.deleteRule(0);
+                styleSheet.deleteRule(0);
+                styleSheet.insertRule(`.bottom_wrapper{max-width:${convWidth[type]}%;}`, 0);
+                styleSheet.insertRule(`.requestBody,.response .markdown-body{max-width:calc(${convWidth[type]}% - 84px);}`, 0);
+            }
+            const setConvValue = () => {
+                let type = isFull ? 1 : 0;
+                let localConv = localStorage.getItem("convWidth" + type);
+                convWEle.value = parseInt(localConv || (type ? "60" : "100"));
+                convWEle.dispatchEvent(new Event("input"));
+            }
+            const fullFunc = () => {
+                isFull = windowEle.classList.contains("full_window");
+                localStorage.setItem("fullWindow", isFull);
+                setConvValue();
+                toggleFull.title = isFull ? translations[locale]["winedWin"] : translations[locale]["fullWin"];
+                toggleFull.children[0].children[0].setAttributeNS("http://www.w3.org/1999/xlink", "href", isFull ? "#collapseFullIcon" : "#expandFullIcon");
+            }
+            toggleFull.onclick = () => {
+                windowEle.classList.toggle("full_window");
+                fullFunc();
+            }
+            let localFull = localStorage.getItem("fullWindow");
+            if (localFull && localFull === "true") {
+                if (!windowEle.classList.contains("full_window")) {
+                    windowEle.classList.add("full_window");
+                    fullFunc();
+                }
+            } else if (windowEle.classList.contains("full_window")) {
+                windowEle.classList.remove("full_window");
+                fullFunc();
+            } else {
+                setConvValue();
+            }
+            const speedEle = document.getElementById("textSpeed");
+            let localSpeed = localStorage.getItem("textSpeed");
+            speedEle.value = localSpeed || speedEle.getAttribute("value");
+            textSpeed = parseFloat(speedEle.min) + (speedEle.max - speedEle.value);
+            speedEle.oninput = () => {
+                speedEle.style.backgroundSize = (speedEle.value - speedEle.min) * 100 / (speedEle.max - speedEle.min) + "% 100%";
+                textSpeed = parseFloat(speedEle.min) + (speedEle.max - speedEle.value);
+                localStorage.setItem("textSpeed", speedEle.value);
+            }
+            speedEle.dispatchEvent(new Event("input"));
+            if (localStorage.getItem("enableCont") != null) { // fallback old cont
+                if (localStorage.getItem("enableCont") === "false") localStorage.setItem("contLength", 0);
+                localStorage.removeItem("enableCont");
+            }
+            const contLenEle = document.getElementById("contLength");
+            let localContLen = localStorage.getItem("contLength");
+            contLenEle.value = contLen = parseInt(localContLen || contLenEle.getAttribute("value"));
+            contLenEle.oninput = () => {
+                contLenEle.style.backgroundSize = (contLenEle.value - contLenEle.min) * 100 / (contLenEle.max - contLenEle.min) + "% 100%";
+                contLen = parseInt(contLenEle.value);
+                contLenWrap.textContent = contLen;
+                localStorage.setItem("contLength", contLenEle.value);
+            }
+            contLenEle.dispatchEvent(new Event("input"));
+            const longEle = document.getElementById("enableLongReply");
+            let localLong = localStorage.getItem("enableLongReply");
+            longEle.checked = enableLongReply = (localLong || longEle.getAttribute("checked")) === "true";
+            longEle.onchange = () => {
+                enableLongReply = longEle.checked;
+                localStorage.setItem("enableLongReply", enableLongReply);
+            }
+            longEle.dispatchEvent(new Event("change"));
+            let localPin = localStorage.getItem("pinNav");
+            if (window.innerWidth > 800 && !(localPin && localPin === "false")) {
+                document.body.classList.add("show-nav");
+            };
+            const setDarkTheme = (is) => {
+                let cssEle = document.body.getElementsByTagName("link")[0];
+                cssEle.href = cssEle.href.replace(is ? "light" : "dark", is ? "dark" : "light");
+                let hlCssEle = document.body.getElementsByTagName("link")[1];
+                hlCssEle.href = hlCssEle.href.replace(is ? "github" : "github-dark", is ? "github-dark" : "github");
+                justDarkTheme(is);
+            }
+            const handleAutoMode = (ele) => {
+                if (ele.checked) {
+                    autoThemeMode = parseInt(ele.value);
+                    localStorage.setItem("autoThemeMode", autoThemeMode);
+                    initAutoTime();
+                    if (autoThemeMode) {
+                        if (customDarkOut !== void 0) {
+                            clearTimeout(customDarkOut);
+                            customDarkOut = void 0;
+                        }
+                        setDarkTheme(darkMedia.matches);
+                    } else {
+                        checkCustomTheme();
+                    }
+                }
+            }
+            autoTheme0.onchange = autoTheme1.onchange = function () { handleAutoMode(this) };
+            const handleAutoTime = (ele, idx) => {
+                let otherIdx = 1 - idx;
+                if (ele.value !== customDarkTime[otherIdx]) {
+                    customDarkTime[idx] = ele.value;
+                    localStorage.setItem("customDarkTime", JSON.stringify(customDarkTime));
+                    checkCustomTheme();
+                } else {
+                    ele.value = customDarkTime[idx];
+                    notyf.error(translations[locale]["customDarkTip"]);
+                }
+            }
+            customStart.onchange = function () { handleAutoTime(this, 0) };
+            customEnd.onchange = function () { handleAutoTime(this, 1) };
+            const initAutoTime = () => {
+                customAutoSet.style.display = autoThemeMode === 0 ? "block" : "none";
+                if (autoThemeMode === 0) {
+                    customStart.value = customDarkTime[0];
+                    customEnd.value = customDarkTime[1];
+                }
+            }
+            const initAutoThemeEle = () => {
+                autoThemeEle.querySelector("#autoTheme" + autoThemeMode).checked = true;
+                initAutoTime();
+            }
+            const checkCustomTheme = () => {
+                if (customDarkOut !== void 0) clearTimeout(customDarkOut);
+                let date = new Date();
+                let nowTime = date.getTime();
+                let start = customDarkTime[0].split(":");
+                let startTime = new Date().setHours(start[0], start[1], 0, 0);
+                let end = customDarkTime[1].split(":");
+                let endTime = new Date().setHours(end[0], end[1], 0, 0);
+                let order = endTime > startTime;
+                let isDark = order ? (nowTime > startTime && endTime > nowTime) : !(nowTime > endTime && startTime > nowTime);
+                let nextChange = isDark ? endTime - nowTime : startTime - nowTime;
+                if (nextChange < 0) nextChange += dayMs;
+                setDarkTheme(isDark);
+                customDarkOut = setTimeout(() => {
+                    checkCustomTheme();
+                }, nextChange);
+            }
+            const setDarkMode = () => {
+                if (customDarkOut !== void 0) {
+                    clearTimeout(customDarkOut);
+                    customDarkOut = void 0;
+                }
+                autoThemeEle.style.display = "none";
+                let themeClass, title;
+                if (themeMode === 2) {
+                    autoThemeEle.style.display = "block";
+                    if (autoThemeMode) {
+                        setDarkTheme(darkMedia.matches);
+                    } else {
+                        checkCustomTheme();
+                        initAutoThemeEle();
+                    }
+                    themeClass = "autoTheme";
+                    title = translations[locale]["autoWord"];
+                } else if (themeMode === 1) {
+                    setDarkTheme(false);
+                    themeClass = "lightTheme";
+                    title = translations[locale]["lightTheme"];
+                } else {
+                    setDarkTheme(true);
+                    themeClass = "darkTheme";
+                    title = translations[locale]["darkTheme"];
+                }
+                localStorage.setItem("themeMode", themeMode);
+                setLightEle.className = "setDetail themeDetail " + themeClass;
+                lightEle.children[0].children[0].setAttributeNS("http://www.w3.org/1999/xlink", "href", "#" + themeClass + "Icon");
+                lightEle.title = title;
+            }
+            lightEle.onclick = () => {
+                themeMode = themeMode - 1;
+                if (themeMode === -1) themeMode = 2;
+                setDarkMode();
+            }
+            setLightEle.onclick = (ev) => {
+                let idx = Array.prototype.indexOf.call(setLightEle.children, ev.target);
+                if (themeMode !== idx) {
+                    themeMode = idx;
+                    setDarkMode();
+                }
+            }
+            let localTheme = localStorage.getItem("themeMode");
+            themeMode = parseInt(localTheme || "1");
+            let localAutoTheme = localStorage.getItem("autoThemeMode");
+            autoThemeMode = parseInt(localAutoTheme || "1");
+            let localCustomDark = localStorage.getItem("customDarkTime");
+            customDarkTime = JSON.parse(localCustomDark || '["21:00", "07:00"]');
+            setDarkMode();
+            darkMedia.onchange = e => {
+                if (themeMode === 2 && autoThemeMode) setDarkTheme(e.matches);
+            };
+            const caseSearchEle = document.getElementById("matchCaseSearch");
+            let localSearchFlag = localStorage.getItem("searchFlag") || "0";
+            isCaseSearch = Boolean(localSearchFlag & 1);
+            caseSearchEle.classList.toggle("seledSearch", isCaseSearch);
+            caseSearchEle.onclick = () => {
+                isCaseSearch = caseSearchEle.classList.toggle("seledSearch");
+                localStorage.setItem("searchFlag", ~~isCaseSearch);
+                if (searchChatEle.value.length) toSearchChats();
+            }
+        };
+        // Run the settings initialisation once, then hide the element with id "loadMask".
+        initSetting();
+        document.getElementById("loadMask").style.display = "none";
+        const closeEvent = (ev) => {
+            if (settingEle.contains(ev.target)) return;
+            if (!dialogEle.contains(ev.target)) {
+                dialogEle.style.display = "none";
+                document.removeEventListener("mousedown", closeEvent, true);
+                settingEle.classList.remove("showSetting");
+                stopTestVoice();
+            }
+        }
+        settingEle.onmousedown = () => {
+            dialogEle.style.display = dialogEle.style.display === "block" ? "none" : "block";
+            if (dialogEle.style.display === "block") {
+                document.addEventListener("mousedown", closeEvent, true);
+                settingEle.classList.add("showSetting");
+            } else {
+                document.removeEventListener("mousedown", closeEvent, true);
+                settingEle.classList.remove("showSetting");
+            }
+        }
+        const modelCloseEvent = (ev) => {
+            if (selectorEle.contains(ev.target)) return;
+            if (!modelSetEle.contains(ev.target)) {
+                document.removeEventListener("mousedown", modelCloseEvent, true);
+                modelSetEle.style.display = "none";
+                selectorEle.classList.remove("showModels");
+            }
+        }
+        // Mousedown handler for the model selector.
+        // NOTE(review): the first statement returns null, so every line after it is
+        // unreachable and pressing the selector currently has no effect. This looks like
+        // a deliberate way of disabling the model panel - confirm, then either remove
+        // the dead code or drop the early return.
+        selectorEle.onmousedown = () => {
+            return null;
+            // Unreachable: toggles the model panel and (un)registers modelCloseEvent.
+            modelSetEle.style.display = modelSetEle.style.display === "block" ? "none" : "block";
+            if (modelSetEle.style.display === "block") {
+                document.addEventListener("mousedown", modelCloseEvent, true);
+                selectorEle.classList.add("showModels");
+            } else {
+                document.removeEventListener("mousedown", modelCloseEvent, true);
+                selectorEle.classList.remove("showModels");
+            }
+        }
+        let delayId;
+        const delay = () => {
+            return new Promise((resolve) => delayId = setTimeout(resolve, textSpeed)); //打字机时间间隔
+        }
+        const getTime = () => {
+            return existVoice === 3 ? new Date().toISOString() : new Date().toString();
+        }
+        const getWSPre = (date, requestId) => {
+            let osPlatform = (typeof window !== "undefined") ? "Browser" : "Node";
+            osPlatform += "/" + navigator.platform;
+            let osName = navigator.userAgent;
+            let osVersion = navigator.appVersion;
+            return `Path: speech.config\r\nX-RequestId: ${requestId}\r\nX-Timestamp: ${date}\r\nContent-Type: application/json\r\n\r\n{"context":{"system":{"name":"SpeechSDK","version":"1.35.0","build":"JavaScript","lang":"JavaScript"},"os":{"platform":"${osPlatform}","name":"${osName}","version":"${osVersion}"}}}`
+        }
+        const getWSAudio = (date, requestId) => {
+            return existVoice === 3 ? `Path: synthesis.context\r\nX-RequestId: ${requestId}\r\nX-Timestamp: ${date}\r\nContent-Type: application/json\r\n\r\n{"synthesis":{"audio":{"metadataOptions":{"bookmarkEnabled":false,"punctuationBoundaryEnabled":"false","sentenceBoundaryEnabled":"false","sessionEndEnabled":true,"visemeEnabled":false,"wordBoundaryEnabled":"false"},"outputFormat":"${voiceFormat}"},"language":{"autoDetection":false}}}`
+                : `X-Timestamp:${date}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":"${voiceFormat}"}}}}`
+        }
+        const getWSText = (date, requestId, lang, voice, volume, rate, pitch, style, role, msg) => {
+            let fmtVolume = (volume >= 1 ? "+" : "") + (volume * 100 - 100) + "%";
+            let fmtRate = (rate >= 1 ? "+" : "") + (rate * 100 - 100) + "%";
+            let fmtPitch = (pitch >= 1 ? "+" : "") + (pitch - 1) + "Hz";
+            msg = getEscape(msg);
+            if (existVoice === 3) {
+                let fmtStyle = style ? ` style="${style}"` : ` style="Default"`;
+                let fmtRole = role ? ` role="${role}"` : "";
+                let fmtExpress = fmtStyle + fmtRole;
+                return `Path: ssml\r\nX-RequestId: ${requestId}\r\nX-Timestamp: ${date}\r\nContent-Type: application/ssml+xml\r\n\r\n<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" xml:lang="${lang}"><voice name="${voice}"><lang xml:lang="${lang}"><s /><mstts:express-as${fmtExpress}><prosody pitch="${fmtPitch}" rate="${fmtRate}" volume="${fmtVolume}">${msg}</prosody></mstts:express-as><s /></lang></voice></speak>`;
+            } else {
+                return `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${date}Z\r\nPath:ssml\r\n\r\n<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${lang}"><voice name="${voice}"><prosody pitch="${fmtPitch}" rate="${fmtRate}" volume="${fmtVolume}">${msg}</prosody></voice></speak>`;
+            }
+        }
+        const getAzureWSURL = () => {
+            return `wss://${azureRegion}.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Ocp-Apim-Subscription-Key=${azureKey}`
+        }
+        const edgeTTSURL = "wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4";
+        const resetSpeakIcon = () => {
+            if (currentVoiceIdx !== void 0) {
+                chatlog.children[systemRole ? currentVoiceIdx - 1 : currentVoiceIdx].classList.remove("showVoiceCls");
+                chatlog.children[systemRole ? currentVoiceIdx - 1 : currentVoiceIdx].lastChild.lastChild.className = "voiceCls readyVoice";
+            }
+        }
+        const endSpeak = () => {
+            resetSpeakIcon();
+            currentVoiceIdx = void 0;
+            if (voiceIns && voiceIns instanceof Audio) {
+                voiceIns.pause();
+                voiceIns.currentTime = 0;
+                URL.revokeObjectURL(voiceIns.src);
+                voiceIns.removeAttribute("src");
+                voiceIns.onended = voiceIns.onerror = null;
+                sourceBuffer = void 0;
+                speechPushing = false;
+                if (voiceSocket && voiceSocket["pending"]) {
+                    voiceSocket.close()
+                }
+                if (autoVoiceSocket && autoVoiceSocket["pending"]) {
+                    autoVoiceSocket.close()
+                }
+                speechQuene.length = 0;
+                autoPlayingIdx = 0;
+                autoMediaSource = void 0;
+                voiceContentQuene = [];
+                voiceEndFlagQuene = [];
+                voiceBlobURLQuene = [];
+                autoOnlineVoiceFlag = false;
+            } else if (supportSpe) {
+                speechSynthesis.cancel();
+            }
+        }
+        const speakEvent = (ins, force = true, end = false) => {
+            return new Promise((res, rej) => {
+                ins.onerror = () => {
+                    if (end) {
+                        endSpeak();
+                    } else if (force) {
+                        resetSpeakIcon();
+                    }
+                    res();
+                }
+                if (ins instanceof Audio) {
+                    ins.onended = ins.onerror;
+                    ins.play();
+                } else {
+                    ins.onend = ins.onerror;
+                    speechSynthesis.speak(voiceIns);
+                }
+            })
+        };
+        let voiceData = {};
+        let voiceSocket;
+        let speechPushing = false;
+        let speechQuene = [];
+        let sourceBuffer;
+        speechQuene.push = function (buffer) {
+            if (!speechPushing && (sourceBuffer && !sourceBuffer.updating)) {
+                speechPushing = true;
+                sourceBuffer.appendBuffer(buffer);
+            } else {
+                Array.prototype.push.call(this, buffer)
+            }
+        }
+        const initSocket = () => {
+            return new Promise((res, rej) => {
+                let url = existVoice === 3 ? getAzureWSURL() : edgeTTSURL;
+                if (!voiceSocket || voiceSocket.readyState > 1 || voiceSocket.url !== url) {
+                    if (voiceSocket && voiceSocket.readyState === 1) voiceSocket.close(1000);
+                    voiceSocket = new WebSocket(url);
+                    voiceSocket.binaryType = "arraybuffer";
+                    voiceSocket.onopen = () => {
+                        res();
+                    };
+                    voiceSocket.onerror = () => {
+                        rej();
+                    }
+                } else {
+                    return res();
+                }
+            })
+        }
+        const initStreamVoice = (mediaSource) => {
+            return new Promise((r, j) => {
+                Promise.all([initSocket(), new Promise(res => {
+                    mediaSource.onsourceopen = () => {
+                        res();
+                    };
+                })]).then(() => {
+                    r();
+                })
+            })
+        }
+        let downQuene = {};
+        let downSocket;
+        const downBlob = (blob, name) => {
+            let a = document.createElement("a");
+            a.download = name;
+            let url = URL.createObjectURL(blob);
+            a.href = url;
+            a.click();
+            setTimeout(() => {
+                URL.revokeObjectURL(url)
+            }, 1000);
+            a = null;
+        }
+        const initDownSocket = () => {
+            return new Promise((res, rej) => {
+                let url = existVoice === 3 ? getAzureWSURL() : edgeTTSURL;
+                if (!downSocket || downSocket.readyState > 1 || downSocket.url !== url) {
+                    if (downSocket && downSocket.readyState === 1) downSocket.close(1000);
+                    downSocket = new WebSocket(url);
+                    downSocket.binaryType = "arraybuffer";
+                    downSocket.onopen = () => {
+                        res();
+                    };
+                    downSocket.onmessage = (e) => {
+                        if (e.data instanceof ArrayBuffer) {
+                            let text = new TextDecoder().decode(e.data.slice(0, voicePreLen));
+                            let reqIdx = text.indexOf(":");
+                            let uuid = text.slice(reqIdx + 1, reqIdx + 33);
+                            downQuene[uuid]["blob"].push(e.data.slice(voicePreLen));
+                        } else if (e.data.indexOf("Path:turn.end") !== -1) {
+                            let reqIdx = e.data.indexOf(":");
+                            let uuid = e.data.slice(reqIdx + 1, reqIdx + 33);
+                            let blob = new Blob(downQuene[uuid]["blob"], { type: voiceMIME });
+                            let key = downQuene[uuid]["key"];
+                            let name = downQuene[uuid]["name"];
+                            if (blob.size === 0) {
+                                notyf.open({
+                                    type: "warning",
+                                    message: translations[locale]["cantSpeechTip"]
+                                });
+                                return;
+                            }
+                            voiceData[key] = blob;
+                            if (downQuene[uuid]["isTest"]) {
+                                testVoiceBlob = blob;
+                                playTestAudio();
+                            } else {
+                                downBlob(blob, name.slice(0, 16) + voiceSuffix);
+                            }
+                        }
+                    }
+                    downSocket.onerror = () => {
+                        rej();
+                    }
+                } else {
+                    return res();
+                }
+            })
+        }
+        const getOpenAIVoice = async (input, voice, speed) => {
+            let url = apiHost + ((apiHost.length && !apiHost.endsWith("/")) ? "/" : "") + "v1/audio/speech";
+            let headers = { "Content-Type": "application/json" };
+            if (customAPIKey) headers["Authorization"] = "Bearer " + customAPIKey;
+            let body = JSON.stringify({
+                model: "tts-1",
+                input,
+                voice,
+                response_format: "aac",
+                speed
+            });
+            let controller = new AbortController();
+            let controllerId = setTimeout(() => {
+                notyf.error(translations[locale]["timeoutTip"]);
+                controller.abort();
+            }, 20000);
+            try {
+                const res = await fetch(url, {
+                    method: "POST",
+                    headers,
+                    body,
+                    signal: controller.signal
+                });
+                clearTimeout(controllerId);
+                if (res.status === 200) {
+                    return await res.blob()
+                } else {
+                    notyf.open({ type: "warning", message: translations[locale]["cantSpeechTip"] })
+                }
+            } catch (e) { }
+        }
+        let testVoiceBlob;
+        let testVoiceIns;
+        const playTestAudio = () => {
+            if (existVoice >= 2) {
+                if (!testVoiceIns || testVoiceIns instanceof Audio === false) {
+                    testVoiceIns = new Audio();
+                    testVoiceIns.onended = testVoiceIns.onerror = () => {
+                        stopTestVoice();
+                    }
+                }
+                testVoiceIns.src = URL.createObjectURL(testVoiceBlob);
+                testVoiceIns.play();
+            } else if (supportSpe) {
+                speechSynthesis.speak(testVoiceIns);
+            }
+        }
+        const pauseTestVoice = () => {
+            if (testVoiceIns) {
+                if (testVoiceIns && testVoiceIns instanceof Audio) {
+                    testVoiceIns.pause();
+                } else if (supportSpe) {
+                    speechSynthesis.pause();
+                }
+            }
+            testVoiceBtn.className = "justSetLine resumeTestVoice";
+        }
+        const resumeTestVoice = () => {
+            if (testVoiceIns) {
+                if (testVoiceIns && testVoiceIns instanceof Audio) {
+                    testVoiceIns.play();
+                } else if (supportSpe) {
+                    speechSynthesis.resume();
+                }
+            }
+            testVoiceBtn.className = "justSetLine pauseTestVoice";
+        }
+        const stopTestVoice = () => {
+            if (testVoiceIns) {
+                if (testVoiceIns instanceof Audio) {
+                    testVoiceIns.pause();
+                    testVoiceIns.currentTime = 0;
+                    URL.revokeObjectURL(testVoiceIns.src);
+                    testVoiceIns.removeAttribute("src");
+                } else if (supportSpe) {
+                    speechSynthesis.cancel();
+                }
+            }
+            testVoiceBtn.className = "justSetLine readyTestVoice";
+        }
+        // Synthesize and play the sample text `voiceTestText` with the settings of the
+        // currently selected `voiceType`. Three paths, chosen by `existVoice`:
+        //   4    - HTTP request via getOpenAIVoice()
+        //   >= 2 - WebSocket synthesis via `downSocket` (playback is started later by
+        //          the socket's message handler)
+        //   else - a SpeechSynthesisUtterance
+        // Audio obtained on the first two paths is cached in `voiceData` under a key
+        // built from the text and the voice settings, and reused when present.
+        // NOTE(review): nothing here resets the test button if getOpenAIVoice() yields
+        // no blob or initDownSocket() rejects - confirm how callers handle failure.
+        const startTestVoice = async () => {
+            testVoiceBtn.className = "justSetLine pauseTestVoice";
+            let volume = voiceVolume[voiceType];
+            let rate = voiceRate[voiceType];
+            let pitch = voicePitch[voiceType];
+            let content = voiceTestText;
+            if (existVoice === 4) {
+                let voice = voiceRole[voiceType].name;
+                let key = content + voice + rate;
+                let blob = voiceData[key];
+                if (blob) {
+                    testVoiceBlob = blob;
+                    playTestAudio();
+                } else {
+                    testVoiceBlob = await getOpenAIVoice(content, voice, rate);
+                    if (testVoiceBlob) {
+                        voiceData[key] = testVoiceBlob;
+                        playTestAudio();
+                    }
+                }
+            } else if (existVoice >= 2) {
+                let voice = existVoice === 3 ? voiceRole[voiceType].ShortName : voiceRole[voiceType].Name;
+                let style = azureStyle[voiceType];
+                let role = azureRole[voiceType];
+                let key = content + voice + volume + rate + pitch + (style ? style : "") + (role ? role : "");
+                let blob = voiceData[key];
+                if (blob) {
+                    testVoiceBlob = blob;
+                    playTestAudio();
+                } else {
+                    await initDownSocket();
+                    let currDate = getTime();
+                    let lang = voiceRole[voiceType].lang;
+                    let uuid = uuidv4(existVoice === 3);
+                    // Frame order: optional speech.config (existVoice 3 only), audio
+                    // configuration, then the SSML text.
+                    if (existVoice === 3) {
+                        downSocket.send(getWSPre(currDate, uuid));
+                    }
+                    downSocket.send(getWSAudio(currDate, uuid));
+                    downSocket.send(getWSText(currDate, uuid, lang, voice, volume, rate, pitch, style, role, content));
+                    downSocket["pending"] = true;
+                    // Register the request so the socket's message handler can collect
+                    // its audio frames and play the result (isTest) when it ends.
+                    downQuene[uuid] = {};
+                    downQuene[uuid]["name"] = content;
+                    downQuene[uuid]["key"] = key;
+                    downQuene[uuid]["isTest"] = true;
+                    downQuene[uuid]["blob"] = [];
+                }
+            } else {
+                testVoiceIns = new SpeechSynthesisUtterance();
+                // Reset the test button when the utterance finishes or fails.
+                testVoiceIns.onend = testVoiceIns.onerror = () => {
+                    stopTestVoice();
+                }
+                testVoiceIns.voice = voiceRole[voiceType];
+                testVoiceIns.volume = volume;
+                testVoiceIns.rate = rate;
+                testVoiceIns.pitch = pitch;
+                testVoiceIns.text = content;
+                playTestAudio();
+            }
+        }
+        const downloadAudio = async (idx) => {
+            if (existVoice < 2) {
+                return;
+            }
+            let type = data[idx].role === "user" ? 0 : 1;
+            let content = chatlog.children[systemRole ? idx - 1 : idx].children[1].innerText.trim();
+            let rate = voiceRate[type];
+            if (existVoice === 4) {
+                let voice = voiceRole[type].name;
+                let key = content + voice + rate;
+                let blob = voiceData[key];
+                if (blob) {
+                    downBlob(blob, content.slice(0, 16) + openAIVoiceSuffix);
+                } else {
+                    let resBlob = await getOpenAIVoice(content, voice, rate);
+                    if (resBlob) {
+                        voiceData[key] = resBlob;
+                        downBlob(voiceData[key], content.slice(0, 16) + openAIVoiceSuffix);
+                    }
+                }
+            } else {
+                let voice = existVoice === 3 ? voiceRole[type].ShortName : voiceRole[type].Name;
+                let volume = voiceVolume[type];
+                let pitch = voicePitch[type];
+                let style = azureStyle[type];
+                let role = azureRole[type];
+                let key = content + voice + volume + rate + pitch + (style ? style : "") + (role ? role : "");
+                let blob = voiceData[key];
+                if (blob) {
+                    downBlob(blob, content.slice(0, 16) + voiceSuffix);
+                } else {
+                    await initDownSocket();
+                    let currDate = getTime();
+                    let lang = voiceRole[type].lang;
+                    let uuid = uuidv4(existVoice === 3);
+                    if (existVoice === 3) {
+                        downSocket.send(getWSPre(currDate, uuid));
+                    }
+                    downSocket.send(getWSAudio(currDate, uuid));
+                    downSocket.send(getWSText(currDate, uuid, lang, voice, volume, rate, pitch, style, role, content));
+                    downSocket["pending"] = true;
+                    downQuene[uuid] = {};
+                    downQuene[uuid]["name"] = content;
+                    downQuene[uuid]["key"] = key;
+                    downQuene[uuid]["blob"] = [];
+                }
+            }
+        }
+        const NoMSEPending = (key) => {
+            return new Promise((res, rej) => {
+                let bufArray = [];
+                voiceSocket.onmessage = (e) => {
+                    if (e.data instanceof ArrayBuffer) {
+                        bufArray.push(e.data.slice(voicePreLen));
+                    } else if (e.data.indexOf("Path:turn.end") !== -1) {
+                        voiceSocket["pending"] = false;
+                        if (!(bufArray.length === 1 && bufArray[0].byteLength === 0)) {
+                            voiceData[key] = new Blob(bufArray, { type: voiceMIME });
+                            res(voiceData[key]);
+                        } else {
+                            res(new Blob());
+                        }
+                    }
+                }
+            })
+        }
+        const pauseEv = () => {
+            if (voiceIns.src) {
+                let ele = chatlog.children[systemRole ? currentVoiceIdx - 1 : currentVoiceIdx].lastChild.lastChild;
+                ele.classList.remove("readyVoice");
+                ele.classList.remove("pauseVoice");
+                ele.classList.add("resumeVoice");
+            }
+        }
+        const resumeEv = () => {
+            if (voiceIns.src) {
+                let ele = chatlog.children[systemRole ? currentVoiceIdx - 1 : currentVoiceIdx].lastChild.lastChild;
+                ele.classList.remove("readyVoice");
+                ele.classList.remove("resumeVoice");
+                ele.classList.add("pauseVoice");
+            }
+        }
+        const speechEvent = async (idx) => { // Speaks message `idx`; with enableContVoice it then continues with the following messages.
+            if (!data[idx]) return;
+            endSpeak(); // presumably stops whatever is currently playing - confirm in endSpeak
+            currentVoiceIdx = idx;
+            if (!data[idx].content && enableContVoice) { // empty message in continuous mode: skip ahead or finish
+                if (currentVoiceIdx !== data.length - 1) { return speechEvent(currentVoiceIdx + 1) }
+                else { return endSpeak() }
+            };
+            let type = data[idx].role === "user" ? 0 : 1; // 0 = user's voice settings, 1 = the other role's
+            chatlog.children[systemRole ? idx - 1 : idx].classList.add("showVoiceCls");
+            let voiceIconEle = chatlog.children[systemRole ? idx - 1 : idx].lastChild.lastChild;
+            voiceIconEle.className = "voiceCls pauseVoice";
+            let content = chatlog.children[systemRole ? idx - 1 : idx].children[1].innerText.trim(); // the rendered text is what gets spoken
+            let volume = voiceVolume[type];
+            let rate = voiceRate[type];
+            let pitch = voicePitch[type];
+            let style = azureStyle[type];
+            let role = azureRole[type];
+            if (existVoice >= 2) { // existVoice >= 2 plays through an Audio element; lower values use SpeechSynthesisUtterance
+                if (!voiceIns || voiceIns instanceof Audio === false) {
+                    voiceIns = new Audio();
+                    voiceIns.onpause = pauseEv;
+                    voiceIns.onplay = resumeEv;
+                }
+                if (existVoice === 4) { // existVoice 4: the whole clip is fetched via getOpenAIVoice
+                    let voice = voiceRole[type].name;
+                    let key = content + voice + rate; // cache key: text plus the settings sent with the request
+                    let currData = voiceData[key];
+                    if (currData) {
+                        voiceIns.src = URL.createObjectURL(currData);
+                    } else {
+                        let blob = await getOpenAIVoice(content, voice, rate);
+                        if (blob) {
+                            voiceData[key] = blob;
+                            voiceIns.src = URL.createObjectURL(blob);
+                        } else return; // no audio returned: stop without speaking
+                    }
+                } else {
+                    let voice = existVoice === 3 ? voiceRole[type].ShortName : voiceRole[type].Name;
+                    let key = content + voice + volume + rate + pitch + (style ? style : "") + (role ? role : "");
+                    let currData = voiceData[key];
+                    if (currData) {
+                        voiceIns.src = URL.createObjectURL(currData);
+                    } else {
+                        let mediaSource;
+                        if (supportMSE) { // MSE path: append chunks to a SourceBuffer as they arrive
+                            mediaSource = new MediaSource;
+                            voiceIns.src = URL.createObjectURL(mediaSource);
+                            await initStreamVoice(mediaSource);
+                            if (!sourceBuffer) {
+                                sourceBuffer = mediaSource.addSourceBuffer(voiceMIME);
+                            }
+                            sourceBuffer.onupdateend = function () { // after each append finishes, take the next queued buffer
+                                speechPushing = false;
+                                if (speechQuene.length) {
+                                    let buf = speechQuene.shift();
+                                    if (buf["end"]) { // a buffer tagged "end" closes the stream
+                                        if (!sourceBuffer.buffered.length) notyf.open({ type: "warning", message: translations[locale]["cantSpeechTip"] });
+                                        mediaSource.endOfStream();
+                                    } else {
+                                        speechPushing = true;
+                                        sourceBuffer.appendBuffer(buf);
+                                    }
+                                }
+                            };
+                            let bufArray = [];
+                            voiceSocket.onmessage = (e) => {
+                                if (e.data instanceof ArrayBuffer) {
+                                    let buf = e.data.slice(voicePreLen); // drop the first voicePreLen bytes of each binary frame
+                                    bufArray.push(buf);
+                                    speechQuene.push(buf);
+                                } else if (e.data.indexOf("Path:turn.end") !== -1) {
+                                    voiceSocket["pending"] = false;
+                                    if (!(bufArray.length === 1 && bufArray[0].byteLength === 0)) voiceData[key] = new Blob(bufArray, { type: voiceMIME }); // cache the clip unless the only chunk was empty
+                                    if (!speechQuene.length && !speechPushing) {
+                                        mediaSource.endOfStream();
+                                    } else {
+                                        let buf = new ArrayBuffer(); // appends still pending: queue an "end" marker for onupdateend
+                                        buf["end"] = true;
+                                        speechQuene.push(buf);
+                                    }
+                                }
+                            }
+                        } else {
+                            await initSocket();
+                        }
+                        let currDate = getTime();
+                        let lang = voiceRole[type].lang;
+                        let uuid = uuidv4(existVoice === 3);
+                        if (existVoice === 3) { // only existVoice 3 sends the getWSPre message first
+                            voiceSocket.send(getWSPre(currDate, uuid));
+                        }
+                        voiceSocket.send(getWSAudio(currDate, uuid));
+                        voiceSocket.send(getWSText(currDate, uuid, lang, voice, volume, rate, pitch, style, role, content));
+                        voiceSocket["pending"] = true;
+                        if (!supportMSE) { // without MSE: wait for the whole clip, then play it from a blob URL
+                            let blob = await NoMSEPending(key);
+                            if (blob.size === 0) notyf.open({ type: "warning", message: translations[locale]["cantSpeechTip"] });
+                            voiceIns.src = URL.createObjectURL(blob);
+                        }
+                    }
+                }
+            } else {
+                voiceIns = new SpeechSynthesisUtterance();
+                voiceIns.voice = voiceRole[type];
+                voiceIns.volume = volume;
+                voiceIns.rate = rate;
+                voiceIns.pitch = pitch;
+                voiceIns.text = content;
+            }
+            await speakEvent(voiceIns);
+            if (enableContVoice) { // continuous mode: go on to the next message, or finish after the last one
+                if (currentVoiceIdx !== data.length - 1) { return speechEvent(currentVoiceIdx + 1) }
+                else { endSpeak() }
+            }
+        };
+        let autoVoiceSocket; // WebSocket created and used by autoSpeechEvent / autoAddQuene
+        let autoMediaSource; // MediaSource created by autoSpeechEvent when supportMSE is set
+        let voiceContentQuene = []; // text segments waiting to be synthesized
+        let voiceEndFlagQuene = []; // per-segment "end" flags, pushed together with each text segment
+        let voiceBlobURLQuene = []; // blob URLs waiting to be played; also holds "end" and null markers
+        let autoOnlineVoiceFlag = false; // set by autoAddQuene after sending; cleared on "turn.end" or socket error
+        const autoAddQuene = () => {
+            if (voiceContentQuene.length) {
+                let content = getUnescape(md.render(voiceContentQuene.shift()));
+                let currDate = getTime();
+                let uuid = uuidv4(existVoice === 3);
+                let voice = voiceRole[1].Name;
+                if (existVoice === 3) {
+                    autoVoiceSocket.send(getWSPre(currDate, uuid));
+                    voice = voiceRole[1].ShortName;
+                }
+                autoVoiceSocket.send(getWSAudio(currDate, uuid));
+                autoVoiceSocket.send(getWSText(currDate, uuid, voiceRole[1].lang, voice, voiceVolume[1], voiceRate[1], voicePitch[1], azureStyle[1], azureRole[1], content));
+                autoVoiceSocket["pending"] = true;
+                autoOnlineVoiceFlag = true;
+            }
+        }
+        let autoPlayingIdx = 0; // slot index autoPlayNext / autoDirectAddQuene treat as the one currently due to play
+        const autoDirectAddQuene = async (index) => { // Fetches audio for the newest queued segment, then plays it or parks it at slot `index`.
+            if (voiceContentQuene.length) {
+                let content = getUnescape(md.render(voiceContentQuene[voiceContentQuene.length - 1])); // reads the most recently pushed segment (not shifted here)
+                let voice = voiceRole[1].name;
+                let rate = voiceRate[1];
+                let blob;
+                if (content !== "" && (blob = await getOpenAIVoice(content, voice, rate))) {
+                    let blobURL = URL.createObjectURL(blob);
+                    if (!voiceIns.src && autoPlayingIdx === index) { // player has no source and it is this segment's turn: play now
+                        voiceIns.src = blobURL;
+                        voiceIns.play();
+                    } else voiceBlobURLQuene[index] = blobURL; // otherwise park the URL in its slot for autoPlayNext
+                } else { // empty text, or getOpenAIVoice returned nothing
+                    if (!voiceIns.src && autoPlayingIdx === index) autoPlayNext();
+                    else voiceBlobURLQuene[index] = null; // null marks a slot that autoPlayNext skips
+                }
+                if (voiceEndFlagQuene.shift()) { // oldest pending flag says "last segment": finish now or leave an "end" marker
+                    if (!voiceIns.src) endSpeak();
+                    else voiceBlobURLQuene.push("end");
+                }
+            }
+        }
+        const autoPlayNext = () => {
+            autoPlayingIdx += 1;
+            if (voiceBlobURLQuene.length) {
+                let src = voiceBlobURLQuene[autoPlayingIdx];
+                if (src === "end") {
+                    endSpeak();
+                } else if (src === null) {
+                    autoPlayNext();
+                } else if (src) {
+                    voiceIns.src = src;
+                    voiceIns.currentTime = 0;
+                    voiceIns.play();
+                } else {
+                    voiceIns.currentTime = 0;
+                    voiceIns.removeAttribute("src");
+                }
+            } else {
+                voiceIns.currentTime = 0;
+                voiceIns.removeAttribute("src");
+            }
+        }
+        const autoSpeechEvent = (content, ele, force = false, end = false) => { // Queues one text segment for speech on message element `ele`; `end` flags the last segment.
+            if (ele.lastChild.lastChild.classList.contains("readyVoice")) { // switch the voice icon from "readyVoice" to "pauseVoice"
+                ele.classList.add("showVoiceCls");
+                ele.lastChild.lastChild.className = "voiceCls pauseVoice";
+            }
+            if (existVoice >= 2) { // existVoice >= 2 plays through an Audio element; lower values use SpeechSynthesisUtterance
+                voiceContentQuene.push(content);
+                voiceEndFlagQuene.push(end);
+                if (!voiceIns || voiceIns instanceof Audio === false) {
+                    voiceIns = new Audio();
+                    voiceIns.onpause = pauseEv;
+                    voiceIns.onplay = resumeEv;
+                }
+                if (existVoice === 4) { // existVoice 4: each segment is fetched by autoDirectAddQuene and chained through autoPlayNext
+                    autoDirectAddQuene(voiceContentQuene.length - 1);
+                    voiceIns.onended = voiceIns.onerror = () => { autoPlayNext() };
+                } else {
+                    let url = existVoice === 3 ? getAzureWSURL() : edgeTTSURL;
+                    if (!autoVoiceSocket || autoVoiceSocket.readyState > 1 || autoVoiceSocket.url !== url) { // (re)create the socket when missing, closing/closed, or opened for another URL
+                        if (autoVoiceSocket && autoVoiceSocket.readyState === 1) autoVoiceSocket.close(1000); // still open, so it must be for a different URL
+                        autoVoiceSocket = new WebSocket(url);
+                        autoVoiceSocket.binaryType = "arraybuffer";
+                        autoVoiceSocket.onopen = () => {
+                            autoAddQuene();
+                        };
+                        autoVoiceSocket.onerror = () => {
+                            autoOnlineVoiceFlag = false;
+                        };
+                    };
+                    let bufArray = []; // NOTE(review): a fresh array and handler are installed on every call - confirm no chunks of an in-flight segment are lost
+                    autoVoiceSocket.onmessage = (e) => {
+                        if (e.data instanceof ArrayBuffer) {
+                            (supportMSE ? speechQuene : bufArray).push(e.data.slice(voicePreLen)); // MSE: feed speechQuene; otherwise collect until "turn.end"
+                        } else {
+                            if (e.data.indexOf("Path:turn.end") !== -1) {
+                                autoVoiceSocket["pending"] = false;
+                                autoOnlineVoiceFlag = false;
+                                if (!supportMSE) { // non-MSE: turn the collected chunks into one blob URL
+                                    let blob = new Blob(bufArray, { type: voiceMIME });
+                                    bufArray = [];
+                                    if (blob.size) {
+                                        let blobURL = URL.createObjectURL(blob);
+                                        if (!voiceIns.src) {
+                                            voiceIns.src = blobURL;
+                                            voiceIns.play();
+                                        } else {
+                                            voiceBlobURLQuene.push(blobURL);
+                                        }
+                                    } else {
+                                        notyf.open({ type: "warning", message: translations[locale]["cantSpeechTip"] });
+                                    }
+                                    autoAddQuene();
+                                }
+                                if (voiceEndFlagQuene.shift()) { // the finished segment was flagged as the last one
+                                    if (supportMSE) {
+                                        if (!speechQuene.length && !speechPushing) {
+                                            autoMediaSource.endOfStream();
+                                        } else {
+                                            let buf = new ArrayBuffer(); // appends still pending: queue an "end" marker for onupdateend
+                                            buf["end"] = true;
+                                            speechQuene.push(buf);
+                                        }
+                                    } else {
+                                        if (!voiceBlobURLQuene.length && !voiceIns.src) {
+                                            endSpeak();
+                                        } else {
+                                            voiceBlobURLQuene.push("end");
+                                        }
+                                    }
+                                };
+                                if (supportMSE) {
+                                    autoAddQuene();
+                                }
+                            }
+                        }
+                    };
+                    if (!autoOnlineVoiceFlag && autoVoiceSocket.readyState) { // nothing in flight and readyState is non-zero (not CONNECTING): send right away
+                        autoAddQuene();
+                    }
+                    if (supportMSE) {
+                        if (!autoMediaSource) {
+                            autoMediaSource = new MediaSource();
+                            autoMediaSource.onsourceopen = () => {
+                                if (!sourceBuffer) {
+                                    sourceBuffer = autoMediaSource.addSourceBuffer(voiceMIME);
+                                    sourceBuffer.onupdateend = () => { // after each append finishes, take the next queued buffer
+                                        speechPushing = false;
+                                        if (speechQuene.length) {
+                                            let buf = speechQuene.shift();
+                                            if (buf["end"]) {
+                                                if (!sourceBuffer.buffered.length) notyf.open({ type: "warning", message: translations[locale]["cantSpeechTip"] });
+                                                autoMediaSource.endOfStream();
+                                            } else {
+                                                speechPushing = true;
+                                                sourceBuffer.appendBuffer(buf);
+                                            }
+                                        }
+                                    };
+                                }
+                            }
+                        }
+                        if (!voiceIns.src) { // player has no source yet: attach the MediaSource and start playback
+                            voiceIns.src = URL.createObjectURL(autoMediaSource);
+                            voiceIns.play();
+                            voiceIns.onended = voiceIns.onerror = () => {
+                                endSpeak();
+                            }
+                        }
+                    } else {
+                        voiceIns.onended = voiceIns.onerror = () => { // non-MSE: play the queued blob URLs one after another
+                            if (voiceBlobURLQuene.length) {
+                                let src = voiceBlobURLQuene.shift();
+                                if (src === "end") {
+                                    endSpeak();
+                                } else {
+                                    voiceIns.src = src;
+                                    voiceIns.currentTime = 0;
+                                    voiceIns.play();
+                                }
+                            } else {
+                                voiceIns.currentTime = 0;
+                                voiceIns.removeAttribute("src");
+                            }
+                        }
+                    }
+                }
+            } else {
+                voiceIns = new SpeechSynthesisUtterance(content);
+                voiceIns.volume = voiceVolume[1];
+                voiceIns.rate = voiceRate[1];
+                voiceIns.pitch = voicePitch[1];
+                voiceIns.voice = voiceRole[1];
+                speakEvent(voiceIns, force, end); // `force` is only used on this path
+            }
+        };
+        const confirmAction = (prompt) => {
+            if (window.confirm(prompt)) { return true }
+            else { return false }
+        };
+        const findLastSpecialCharIndex = (text) => {
+            const specialChars = new Set(['.', '?', '!', '~', '。', '？', '！', '\n']);
+            for (let i = text.length - 1; i >= 0; i--) {
+                if (specialChars.has(text[i])) {
+                    return i;
+                }
+            }
+            return -1;
+        };
+        let currentModelName; // model name captured by streamGen (from getModelName) at the start of a request
+        const getModelName = () => { return (modelType === 1 ? customAPIModel : modelType === 2 ? geminiAPIModel : claudeAPIModel) || modelVersion }; // model chosen by modelType, falling back to modelVersion when that value is empty
+        let autoVoiceIdx = 0; // offset into progressData up to which text has already been passed to autoSpeechEvent
+        let autoVoiceDataIdx; // message index set by streamGen when auto voice starts; compared with currentVoiceIdx while streaming
+        let refreshIdx; // index of the message being regenerated; undefined when streamGen produces a new reply
+        let currentResEle; // chat element that receives the streamed response
+        let progressData = ""; // response text accumulated so far during streaming
+        const streamGen = async (long) => {
+            currentModelName = getModelName();
+            controller = new AbortController();
+            controllerId = setTimeout(() => {
+                notyf.error(translations[locale]["timeoutTip"]);
+                stopLoading();
+            }, 200000);
+            let isRefresh = refreshIdx !== void 0;
+            if (isRefresh) {
+                currentResEle = chatlog.children[systemRole ? refreshIdx - 1 : refreshIdx];
+                if (outOfMsgWindow(currentResEle)) messagesEle.scrollTo(0, currentResEle.offsetTop)
+            } else if (!currentResEle) {
+                currentResEle = createConvEle("response", true, modelVersion);
+                currentResEle.children[1].innerHTML = "<p class='cursorCls'><br /></p>";
+                currentResEle.dataset.loading = true;
+                scrollToBottom();
+            }
+            let idx = isRefresh ? refreshIdx : data.length;
+            if (existVoice && enableAutoVoice && !long) {
+                if (isRefresh) {
+                    endSpeak();
+                    autoVoiceDataIdx = currentVoiceIdx = idx;
+                } else if (currentVoiceIdx !== data.length) {
+                    endSpeak();
+                    autoVoiceDataIdx = currentVoiceIdx = idx;
+                }
+            };
+            try {
+                let dataSlice;
+                if (long) {
+                    idx = isRefresh ? refreshIdx : data.length - 1;
+                    dataSlice = [data[idx - 1], data[idx]];
+                    if (systemRole) dataSlice.unshift(data[0]);
+                } else {
+                    let startIdx = idx > contLen ? idx - contLen - 1 : 0;
+                    dataSlice = data.slice(startIdx, idx);
+                    if (systemRole && startIdx > 0) dataSlice.unshift(data[0]);
+                }
+                let headers = { "Content-Type": "application/json" };
+                let url, body;
+                if (modelType === 1) {
+                    dataSlice = dataSlice.map(item => {
+                        if (item.role === "assistant") return { role: item.role, content: item.content };
+                        else return item;
+                    })
+                    url = apiHost + ((apiHost.length && !apiHost.endsWith("/")) ? "/" : "") + API_URL;
+                    if (customAPIKey) headers["Authorization"] = "Bearer " + customAPIKey;
+                    body = JSON.stringify({
+                        messages: dataSlice,
+                        model: currentModelName,
+                        stream: true,
+                        temperature: roleTemp,
+                        top_p: roleNature
+                    });
+                } else if (modelType === 2) {
+                    dataSlice = dataSlice.map(item => {
+                        return { role: item.role === "assistant" ? "model" : "user", parts: [{ text: item.content }] };
+                    })
+                    url = geminiApiHost + ((geminiApiHost.length && !geminiApiHost.endsWith("/")) ? "/" : "") + Gemini_API_URL + currentModelName + `:streamGenerateContent?key=${geminiAPIKey}`;
+                    body = JSON.stringify({
+                        contents: dataSlice,
+                        generationConfig: {
+                            temperature: roleTemp,
+                            topP: roleNature
+                        }
+                    });
+                } else {
+                    let system;
+                    if (systemRole) {
+                        system = dataSlice.shift().content;
+                    }
+                    dataSlice = dataSlice.map(item => {
+                        if (item.role === "assistant") return { role: item.role, content: item.content };
+                        else return item;
+                    })
+                    url = claudeApiHost + ((claudeApiHost.length && !claudeApiHost.endsWith("/")) ? "/" : "") + Claude_API_URL;
+                    if (claudeAPIKey) headers["x-api-key"] = claudeAPIKey;
+                    headers["anthropic-version"] = "2023-06-01";
+                    body = JSON.stringify({
+                        model: currentModelName,
+                        messages: dataSlice,
+                        max_tokens: 4096,
+                        stream: true,
+                        temperature: roleTemp,
+                        top_p: roleNature,
+                        ...(system ? { system } : {})
+                    });
+                }
+                const res = await fetch(url, {
+                    method: "POST",
+                    headers,
+                    body,
+                    signal: controller.signal
+                });
+                clearTimeout(controllerId);
+                controllerId = void 0;
+                if (res.status !== 200) {
+                    if (res.status === 401) {
+                        notyf.error(translations[locale]["errorAiKeyTip"])
+                    } else if (res.status === 400 || res.status === 413) {
+                        notyf.error(translations[locale]["largeReqTip"]);
+                    } else if (res.status === 404) {
+                        notyf.error(translations[locale]["noModelPerTip"]);
+                    } else if (res.status === 429) {
+                        notyf.error(res.statusText ? translations[locale]["apiRateTip"] : translations[locale]["exceedLimitTip"]);
+                    } else {
+                        notyf.error(translations[locale]["badGateTip"]);
+                    }
+                    stopLoading();
+                    return;
+                }
+                let container = document.createElement("div");
+                const decoder = new TextDecoder();
+                const reader = res.body.getReader();
+                let readChunk;
+                if (modelType === 1) {
+                    readChunk = async () => {
+                        return reader.read().then(async ({ value, done }) => {
+                            if (!done) {
+                                value = decoder.decode(value);
+                                let chunks = value.match(/[^\n]+/g);
+                                if (!chunks) return readChunk();
+                                let payload;
+                                for (let i = 0; i < chunks.length; i++) {
+                                    let chunk = chunks[i];
+                                    if (chunk) {
+                                        try {
+                                            payload = JSON.parse(chunk.slice(5));
+                                        } catch (e) {
+                                            break;
+                                        }
+                                        if (!payload.choices.length) continue;
+                                        if (payload.choices[0].finish_reason) {
+                                            let lenStop = payload.choices[0].finish_reason === "length";
+                                            longReplyFlag = enableLongReply && lenStop;
+                                            let ele = currentResEle.lastChild.children[0].children[0];
+                                            if (!enableLongReply && lenStop) { ele.className = "halfRefReq optionItem"; ele.title = translations[locale]["continue"] }
+                                            else { ele.className = "refreshReq optionItem"; ele.title = translations[locale]["refresh"] };
+                                            if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx) {
+                                                let voiceText = longReplyFlag ? "" : progressData.slice(autoVoiceIdx), stop = !longReplyFlag;
+                                                autoSpeechEvent(voiceText, currentResEle, false, stop);
+                                            }
+                                            break;
+                                        } else {
+                                            let content = payload.choices[0].delta.content;
+                                            if (content) {
+                                                if (!progressData && !content.trim()) continue;
+                                                if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx) {
+                                                    let spliter = content.match(/\.|\?|!|~|。|？|！|\n/);
+                                                    if (spliter) {
+                                                        let voiceText = progressData.slice(autoVoiceIdx) + content.slice(0, spliter.index + 1);
+                                                        autoVoiceIdx += voiceText.length;
+                                                        autoSpeechEvent(voiceText, currentResEle);
+                                                    }
+                                                }
+                                                if (progressData && textSpeed) await delay();
+                                                progressData += content;
+                                                container.innerHTML = md.render(progressData);
+                                                if (container.children.length > 1 && currentResEle.children[1].children.length === container.children.length) {
+                                                    morphdom(currentResEle.children[1].lastElementChild, container.lastElementChild);
+                                                } else {
+                                                    morphdom(currentResEle.children[1], container, { childrenOnly: true });
+                                                }
+                                                scrollToBottom();
+                                            }
+                                        }
+                                    }
+                                }
+                                return readChunk();
+                            } else {
+                                if (isRefresh) {
+                                    data[refreshIdx].content = progressData;
+                                    data[refreshIdx].model = currentModelName;
+                                } else {
+                                    if (long) { data[data.length - 1].content = progressData }
+                                    else { data.push({ role: "assistant", content: progressData, model: currentModelName }) }
+                                }
+                                if (longReplyFlag) return streamGen(true);
+                                stopLoading(false);
+                            }
+                        });
+                    };
+                } else if (modelType === 2) {
+                    let stop_reason;
+                    let endGeminiFunc = () => {
+                        let lenStop = stop_reason === "MAX_TOKENS";
+                        longReplyFlag = enableLongReply && lenStop;
+                        let ele = currentResEle.lastChild.children[0].children[0];
+                        if (!enableLongReply && lenStop) { ele.className = "halfRefReq optionItem"; ele.title = translations[locale]["continue"] }
+                        else { ele.className = "refreshReq optionItem"; ele.title = translations[locale]["refresh"] };
+                        if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx) {
+                            let voiceText = longReplyFlag ? "" : progressData.slice(autoVoiceIdx), stop = !longReplyFlag;
+                            autoSpeechEvent(voiceText, currentResEle, false, stop);
+                        }
+                    };
+                    readChunk = async () => {
+                        return reader.read().then(async ({ value, done }) => {
+                            if (!done) {
+                                value = decoder.decode(value);
+                                let payload;
+                                try {
+                                    let startIdx = value.indexOf("{");
+                                    let endIdx = value.lastIndexOf("}");
+                                    payload = JSON.parse(value.slice(startIdx, endIdx + 1));
+                                } catch (e) {
+                                    if (value === "]") endGeminiFunc();
+                                    return readChunk();
+                                }
+                                let content = payload.candidates[0].content.parts[0].text;
+                                if (content) {
+                                    if (!progressData && !content.trim()) return readChunk();
+                                    if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx) {
+                                        let spliter = findLastSpecialCharIndex(content);
+                                        if (spliter != -1) {
+                                            let voiceText = progressData.slice(autoVoiceIdx) + content.slice(0, spliter + 1);
+                                            autoVoiceIdx += voiceText.length;
+                                            autoSpeechEvent(voiceText, currentResEle);
+                                        }
+                                    }
+                                    if (progressData && textSpeed) await delay();
+                                    progressData += content;
+                                    container.innerHTML = md.render(progressData);
+                                    if (container.children.length > 1 && currentResEle.children[1].children.length === container.children.length) {
+                                        morphdom(currentResEle.children[1].lastElementChild, container.lastElementChild);
+                                    } else {
+                                        morphdom(currentResEle.children[1], container, { childrenOnly: true });
+                                    }
+                                    scrollToBottom();
+                                    stop_reason = payload.candidates[0].finishReason;
+                                    if (value[value.length - 1] === "]") endGeminiFunc();
+                                }
+                                return readChunk();
+                            } else {
+                                if (isRefresh) {
+                                    data[refreshIdx].content = progressData;
+                                    data[refreshIdx].model = currentModelName;
+                                } else {
+                                    if (long) { data[data.length - 1].content = progressData }
+                                    else { data.push({ role: "assistant", content: progressData, model: currentModelName }) }
+                                }
+                                if (longReplyFlag) return streamGen(true);
+                                stopLoading(false);
+                            }
+                        });
+                    };
+                } else {
+                    readChunk = async () => {
+                        return reader.read().then(async ({ value, done }) => {
+                            if (!done) {
+                                value = decoder.decode(value);
+                                let chunks = value.match(/[^\n]+/g);
+                                if (!chunks) return readChunk();
+                                let event;
+                                let payload;
+                                for (let i = 0; i < chunks.length; i++) {
+                                    let chunk = chunks[i];
+                                    if (chunk) {
+                                        try {
+                                            if (chunk.startsWith("event")) {
+                                                event = chunk.slice(7);
+                                            } else {
+                                                payload = JSON.parse(chunk.slice(5));
+                                            }
+                                        } catch (e) {
+                                            break;
+                                        }
+                                        if (chunk.startsWith("event")) continue;
+                                        if (event === "message_delta") {
+                                            let lenStop = payload.delta.stop_reason === "max_tokens";
+                                            longReplyFlag = enableLongReply && lenStop;
+                                            let ele = currentResEle.lastChild.children[0].children[0];
+                                            if (!enableLongReply && lenStop) { ele.className = "halfRefReq optionItem"; ele.title = translations[locale]["continue"] }
+                                            else { ele.className = "refreshReq optionItem"; ele.title = translations[locale]["refresh"] };
+                                            if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx) {
+                                                let voiceText = longReplyFlag ? "" : progressData.slice(autoVoiceIdx), stop = !longReplyFlag;
+                                                autoSpeechEvent(voiceText, currentResEle, false, stop);
+                                            }
+                                            break;
+                                        } else if (event === "content_block_delta") {
+                                            if (payload.delta.type !== "text_delta") continue;
+                                            let content = payload.delta.text;
+                                            if (content) {
+                                                if (!progressData && !content.trim()) continue;
+                                                if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx) {
+                                                    let spliter = content.match(/\.|\?|!|~|。|？|！|\n/);
+                                                    if (spliter) {
+                                                        let voiceText = progressData.slice(autoVoiceIdx) + content.slice(0, spliter.index + 1);
+                                                        autoVoiceIdx += voiceText.length;
+                                                        autoSpeechEvent(voiceText, currentResEle);
+                                                    }
+                                                }
+                                                if (progressData && textSpeed) await delay();
+                                                progressData += content;
+                                                container.innerHTML = md.render(progressData);
+                                                if (container.children.length > 1 && currentResEle.children[1].children.length === container.children.length) {
+                                                    morphdom(currentResEle.children[1].lastElementChild, container.lastElementChild);
+                                                } else {
+                                                    morphdom(currentResEle.children[1], container, { childrenOnly: true });
+                                                }
+                                                scrollToBottom();
+                                            }
+                                        }
+                                    }
+                                }
+                                return readChunk();
+                            } else {
+                                if (isRefresh) {
+                                    data[refreshIdx].content = progressData;
+                                    data[refreshIdx].model = currentModelName;
+                                } else {
+                                    if (long) { data[data.length - 1].content = progressData }
+                                    else { data.push({ role: "assistant", content: progressData, model: currentModelName }) }
+                                }
+                                if (longReplyFlag) return streamGen(true);
+                                stopLoading(false);
+                            }
+                        });
+                    };
+                }
+                await readChunk();
+                container = null;
+            } catch (e) {
+                if (e.message.indexOf("aborted") === -1) {
+                    notyf.error(translations[locale]["badEndpointTip"])
+                    stopLoading();
+                }
+            }
+        };
+        const loadAction = (bool) => {
+            loading = bool;
+            sendBtnEle.disabled = bool;
+            sendBtnEle.className = bool ? " loading" : "loaded";
+            stopEle.style.display = bool ? "flex" : "none";
+            textInputEvent();
+        };
+        const updateChatPre = () => {
+            let ele = activeChatEle.children[1].children[1];
+            let first = data.find(item => { return item.role === "assistant" });
+            ele.textContent = first ? first.content.slice(0, 30) : "";
+            forceRepaint(ele.parentElement)
+        }
+        const stopLoading = (abort = true) => { // the stream readers call stopLoading(false) once the stream is done
+            stopEle.style.display = "none";
+            if (currentResEle.children[1].querySelector(".cursorCls")) currentResEle.children[1].innerHTML = "<br />"; // cursor placeholder still present: swap it for a bare line break
+            if (abort) {
+                controller.abort();
+                if (controllerId) clearTimeout(controllerId);
+                if (delayId) clearTimeout(delayId);
+                if (refreshIdx !== void 0) { // keep the partial reply: in the entry at refreshIdx if one is set...
+                    data[refreshIdx].content = progressData;
+                    data[refreshIdx].model = currentModelName;
+                }
+                else if (data[data.length - 1].role === "assistant") { // ...else in a trailing assistant entry...
+                    data[data.length - 1].content = progressData;
+                    data[data.length - 1].model = currentModelName;
+                }
+                else { data.push({ role: "assistant", content: progressData, model: currentModelName }) } // ...else as a new entry
+                if (existVoice && enableAutoVoice && currentVoiceIdx === autoVoiceDataIdx && progressData.length) {
+                    let voiceText = progressData.slice(autoVoiceIdx); // text from autoVoiceIdx onward
+                    autoSpeechEvent(voiceText, currentResEle, false, true);
+                }
+            }
+            if (activeChatEle.children[1].children[1].textContent === "") updateChatPre(); // fill the preview only while it is empty
+            updateChats();
+            controllerId = delayId = refreshIdx = autoVoiceDataIdx = void 0; // reset per-request state
+            autoVoiceIdx = 0;
+            currentResEle.dataset.loading = false;
+            currentResEle = null;
+            progressData = "";
+            loadAction(false);
+        };
+        const generateText = (message) => { // store `message` as a new or edited user entry, then call streamGen()
+            loadAction(true);
+            let requestEle;
+            let isBottom = isContentBottom();
+            if (editingIdx !== void 0) { // an existing message is being edited
+                let idx = editingIdx;
+                let eleIdx = systemRole ? idx - 1 : idx; // NOTE(review): presumably data[0] has no chatlog element when systemRole is set - confirm
+                requestEle = chatlog.children[eleIdx];
+                data[idx].content = message;
+                resumeSend();
+                if (idx !== data.length - 1) { // edited entry is not the last one: regenerate the entry right after it
+                    requestEle.children[1].textContent = message;
+                    if (data[idx + 1].role !== "assistant") { // no assistant entry follows: insert an empty one
+                        if (currentVoiceIdx !== void 0) {
+                            if (currentVoiceIdx > idx) { currentVoiceIdx++ } // an entry is inserted at idx + 1, so later indices shift by one
+                        }
+                        data.splice(idx + 1, 0, { role: "assistant", content: "", model: modelVersion });
+                        chatlog.insertBefore(createConvEle("response", false, modelVersion), chatlog.children[eleIdx + 1]);
+                    } else formatAvatarEle(chatlog.children[eleIdx + 1].children[0], modelVersion);
+                    chatlog.children[eleIdx + 1].children[1].innerHTML = "<p class='cursorCls'><br /></p>"; // placeholder that stopLoading looks for via .cursorCls
+                    chatlog.children[eleIdx + 1].dataset.loading = true;
+                    idx = idx + 1;
+                    data[idx].content = "";
+                    if (idx === currentVoiceIdx) { endSpeak() };
+                    refreshIdx = idx; // data[refreshIdx] is the entry stopLoading overwrites on abort
+                    updateChats();
+                    streamGen();
+                    return;
+                }
+            } else {
+                requestEle = createConvEle("request");
+                data.push({ role: "user", content: message });
+            }
+            requestEle.children[1].textContent = message;
+            if (chatsData[activeChatIdx].name === translations[locale]["newChatName"]) { // chat still has the default name: name it after the message
+                if (message.length > 20) message = message.slice(0, 17) + "..."; // cap the name at 20 characters
+                chatsData[activeChatIdx].name = message;
+                activeChatEle.children[1].children[0].textContent = message;
+            }
+            updateChats();
+            if (isBottom) messagesEle.scrollTo(0, messagesEle.scrollHeight); // isBottom was sampled before the new entry was added
+            streamGen();
+        };
+        inputAreaEle.onkeydown = (e) => {
+            if (e.keyCode === 13 && !e.shiftKey) {
+                e.preventDefault();
+                genFunc();
+            } else if (keepListenMic && recing) {
+                resetRecRes();
+            }
+        };
+        const genFunc = async function () {
+            clearAutoSendTimer();
+            if (recing) {
+                if (existRec === 3) await toggleRecEv(false);
+                else if (existRec === 2) toggleRecEv(false);
+                else if (!keepListenMic) toggleRecEv();
+            }
+            let message = inputAreaEle.value.trim();
+            if (message.length !== 0 && noLoading()) {
+                inputAreaEle.value = "";
+                inputAreaEle.style.height = "47px";
+                if (keepListenMic && recing) resetRecRes();
+                generateText(message);
+            }
+        };
+        sendBtnEle.onclick = genFunc;
+        stopEle.onclick = stopLoading; // the click event arrives as `abort`; being truthy, it takes the abort path
+        clearEle.onclick = () => {
+            if (editingIdx === void 0) {
+                if (noLoading() && confirmAction(translations[locale]["clearChatTip"])) {
+                    endSpeak();
+                    if (systemRole) { data.length = 1 }
+                    else { data.length = 0 }
+                    chatlog.innerHTML = "";
+                    updateChatPre();
+                    updateChats();
+                }
+            } else {
+                resumeSend();
+            }
+        }
+    </script>
+    <link crossorigin="anonymous" href="https://fastly.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css"
+        rel="stylesheet">
+    <script defer>
+        let downRoleController = new AbortController();
+        const loadPrompt = () => {
+            downRoleController.abort();
+            downRoleController = new AbortController();
+            setTimeout(() => {
+                downRoleController.abort();
+            }, 10000);
+            preEle.options.length = 5;
+            if (locale === "zh") {
+                fetch("https://fastly.jsdelivr.net/gh/PlexPt/awesome-chatgpt-prompts-zh/prompts-zh.json", {
+                    signal: downRoleController.signal
+                }).then(async (response) => {
+                    let res = await response.json();
+                    for (let i = 0; i < res.length; i++) {
+                        let key = "act" + i;
+                        presetRoleData[key] = res[i].prompt.trim();
+                        let optionEle = document.createElement("option");
+                        optionEle.text = res[i].act;
+                        optionEle.value = key;
+                        preEle.options.add(optionEle);
+                    }
+                }).catch(e => { })
+            } else {
+                fetch("https://fastly.jsdelivr.net/gh/f/awesome-chatgpt-prompts/prompts.csv", {
+                    signal: downRoleController.signal
+                }).then(async (response) => {
+                    let res = await response.text();
+                    let arr = res.split("\n");
+                    for (let i = 1; i < arr.length - 1; i++) {
+                        let key = "act" + i;
+                        let index = arr[i].indexOf(",");
+                        presetRoleData[key] = arr[i].slice(index + 2, -1);
+                        let optionEle = document.createElement("option");
+                        optionEle.text = arr[i].slice(1, index - 1);
+                        optionEle.value = key;
+                        preEle.options.add(optionEle);
+                    }
+                }).catch(e => { })
+            }
+        }
+        loadPrompt();
+    </script>
+</body>
+
+</html>
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d7fb61812..4cd43137f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -27,6 +27,8 @@
 #include "theme-snowstorm.css.hpp"
 #include "index.html.hpp"
 #include "index-new.html.hpp"
+#include "index-yx.html.hpp"
+#include "avatar.jpg.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
 #include "system-prompts.js.hpp"
@@ -3076,6 +3078,7 @@ int main(int argc, char ** argv) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
 
+
         const int id_task = ctx_server.queue_tasks.get_new_id();
 
         ctx_server.queue_results.add_waiting_task_id(id_task);
@@ -3305,7 +3308,7 @@ int main(int argc, char ** argv) {
     }
 
     // using embedded static files
-    svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+    svr->Get("/",                           handle_static_file(index_yx_html, index_yx_html_len, "text/html; charset=utf-8"));
     svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
     svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
     svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
@@ -3320,6 +3323,8 @@ int main(int argc, char ** argv) {
     svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
     svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
     svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+    svr->Get("/index-yx.html",         handle_static_file(index_yx_html, index_yx_html_len, "text/html; charset=utf-8"));
+    svr->Get("/avatar.jpg",            handle_static_file(avatar_jpg, avatar_jpg_len, "image/jpeg")); // binary JPEG, must not be labelled text/html
     svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
     svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
 

From d9b5678b5b4eab92dbb119c18e5450a544e63761 Mon Sep 17 00:00:00 2001
From: Aliebc <i@axgln.net>
Date: Sat, 15 Jun 2024 10:45:01 +0800
Subject: [PATCH 50/50] Merge with conflict

---
 .github/workflows/docker.yml | 116 ----------------------
 .github/workflows/server.yml | 183 -----------------------------------
 2 files changed, 299 deletions(-)
 delete mode 100644 .github/workflows/docker.yml
 delete mode 100644 .github/workflows/server.yml

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
deleted file mode 100644
index bf94b2024..000000000
--- a/.github/workflows/docker.yml
+++ /dev/null
@@ -1,116 +0,0 @@
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-# GitHub recommends pinning actions to a commit SHA.
-# To get a newer version, you will need to update the SHA.
-# You can also reference a tag or branch, but the action may change without warning.
-
-name: Publish Docker image
-
-on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  push_to_registry:
-    name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
-
-    runs-on: ubuntu-latest
-    env:
-      COMMIT_SHA: ${{ github.sha }}
-    strategy:
-      matrix:
-        config:
-          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
-          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
-    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v4
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Downcase github.repository_owner
-        run: |
-          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
-      - name: Build and push Docker image (versioned)
-        if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
-          file: ${{ matrix.config.dockerfile }}
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
deleted file mode 100644
index 99feb28f2..000000000
--- a/.github/workflows/server.yml
+++ /dev/null
@@ -1,183 +0,0 @@
-# Server build and tests
-name: Server
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  server:
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libcurl4-openssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
-
-      - name: Verify server deps
-        id: verify_server_deps
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server
-          git ls-files --others --modified
-          git status
-          ./deps.sh
-          git status
-          not_ignored_files="$(git ls-files --others --modified)"
-          echo "Modified files: ${not_ignored_files}"
-          if [ -n "${not_ignored_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
-            echo "${not_ignored_files}"
-            exit 1
-          fi
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        run: |
-          cd examples/server/tests
-          PORT=8888 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd examples/server/tests
-          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
-
-
-  server-windows:
-    runs-on: windows-2019
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: libCURL
-        id: get_libcurl
-        env:
-          CURL_VERSION: 8.6.0_6
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
-          mkdir $env:RUNNER_TEMP/libcurl
-          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
-
-      - name: Copy Libcurl
-        id: prepare_libcurl
-        run: |
-          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
-        run: |
-          cd examples/server/tests
-          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd examples/server/tests
-          behave.exe --stop --no-skipped --no-capture --tags slow