From a323ec60af14a33d560df98f2cc41b4112cb4f80 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 31 May 2024 22:23:04 +0300 Subject: [PATCH 1/6] server : update js (#7670) --- examples/server/public/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/public/index.js b/examples/server/public/index.js index 695aec256..670960939 100644 --- a/examples/server/public/index.js +++ b/examples/server/public/index.js @@ -1 +1 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var S,w,x,C,E,U,H,P,N,$,D,T,F={},V=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,M=Array.isArray;function W(t,n){for(var e in n)t[e]=n[e];return t}function O(t){var n=t.parentNode;n&&n.removeChild(t)}function L(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?S.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return R(t,r,_,i,null)}function R(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=w.vnode&&w.vnode(o),o}function I(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&E.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||V,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?R(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,f),i.__i=u,o=null,-1!==u&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u===r+1?c++:u>r?f>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?S.call(arguments,2):e),R(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+T++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,z(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}S=V.slice,w={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=W({},this.state),"function"==typeof t&&(t=t(W({},e),this.props)),t&&W(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),z(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),z(this))},q.prototype.render=j,E=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),D=et(!0),T=0;var at,pt,dt,vt,yt=0,mt=[],gt=[],bt=w,kt=bt.__b,St=bt.__r,wt=bt.diffed,xt=bt.__c,Ct=bt.unmount,Et=bt.__;function Ut(t,n){bt.__h&&bt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:gt}),e.__[t]}function Ht(t){return yt=1,Pt(Gt,t)}function Pt(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Gt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Nt(t,n){var e=Ut(at++,3);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function $t(t,n){var e=Ut(at++,4);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function Dt(t){return yt=5,Ft((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,$t((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Ft(t,n){var e=Ut(at++,7);return Bt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Vt(t,n){return yt=8,Ft((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Mt(t,n){bt.useDebugValue&&bt.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Ht();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Ot(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Lt(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(jt),t.__H.__h.forEach(qt),t.__H.__h=[]}catch(n){t.__H.__h=[],bt.__e(n,t.__v)}}bt.__b=function(t){pt=null,kt&&kt(t)},bt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Et&&Et(t,n)},bt.__r=function(t){St&&St(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=gt,t.__N=t.i=void 0}))):(n.__h.forEach(jt),n.__h.forEach(qt),n.__h=[],at=0)),dt=pt},bt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===bt.requestAnimationFrame||((vt=bt.requestAnimationFrame)||It)(Lt)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==gt&&(t.__=t.__V),t.i=void 0,t.__V=gt}))),dt=pt=null},bt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(jt),t.__h=t.__h.filter((function(t){return!t.__||qt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],bt.__e(r,t.__v)}})),xt&&xt(t,n)},bt.unmount=function(t){Ct&&Ct(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{jt(t)}catch(t){n=t}})),e.__H=void 0,n&&bt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function jt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function qt(t){var n=pt;t.__c=t.__(),pt=n}function Bt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Gt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){w[t]=n.bind(null,w[t]||(()=>{}))}let Jt,Kt;function Qt(t){if(Kt)Kt();Kt=t&&t.S()}function Xt({data:t}){const n=Zt(t);n.value=t;const e=Ft(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Xt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Xt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Qt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Jt=_;Qt(e);t(n)});zt("__e",(t,n,e,_)=>{Qt();Jt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Qt();Jt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Yt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Yt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Zt(t){return Ft(()=>c(t),[])}function tn(t){const n=Dt(t);n.current=t;Jt.__$f|=4;return Ft(()=>v(()=>n.current()),[])}function nn(t){const n=Dt(t);n.current=t;Nt(()=>k(()=>n.current()),[])}var en=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var rn=on.bind(L);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,L as createElement,I as createRef,k as effect,L as h,rn as html,ft as hydrate,C as isValidElement,w as options,st as render,c as signal,Y as toChildArray,o as untracked,Vt as useCallback,tn as useComputed,At as useContext,Mt as useDebugValue,Nt as useEffect,Wt as useErrorBoundary,Ot as useId,Tt as useImperativeHandle,$t as useLayoutEffect,Ft as useMemo,Pt as useReducer,Dt as useRef,Zt as useSignal,nn as useSignalEffect,Ht as useState}; +const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,D,T,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,V=Array.isArray;function W(t,n){for(var e in n)t[e]=n[e];return t}function L(t){var n=t.parentNode;n&&n.removeChild(t)}function O(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return R(t,r,_,i,null)}function R(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function I(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?R(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,f),i.__i=u,o=null,-1!==u&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u===r+1?c++:u>r?f>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),R(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+T++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=W({},this.state),"function"==typeof t&&(t=t(W({},e),this.props)),t&&W(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),D=et(!0),T=0;var at,pt,dt,vt,yt=0,mt=[],gt=[],bt=S,kt=bt.__b,wt=bt.__r,St=bt.diffed,xt=bt.__c,Ct=bt.unmount,Ut=bt.__;function Et(t,n){bt.__h&&bt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:gt}),e.__[t]}function Ht(t){return yt=1,Pt(zt,t)}function Pt(t,n,e){var _=Et(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):zt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Nt(t,n){var e=Et(at++,3);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function $t(t,n){var e=Et(at++,4);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function Dt(t){return yt=5,Mt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,$t((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Mt(t,n){var e=Et(at++,7);return Bt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Ft(t,n){return yt=8,Mt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Et(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Vt(t,n){bt.useDebugValue&&bt.useDebugValue(n?n(t):t)}function Wt(t){var n=Et(at++,10),e=Ht();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Et(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(jt),t.__H.__h.forEach(qt),t.__H.__h=[]}catch(n){t.__H.__h=[],bt.__e(n,t.__v)}}bt.__b=function(t){pt=null,kt&&kt(t)},bt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ut&&Ut(t,n)},bt.__r=function(t){wt&&wt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=gt,t.__N=t.i=void 0}))):(n.__h.forEach(jt),n.__h.forEach(qt),n.__h=[],at=0)),dt=pt},bt.diffed=function(t){St&&St(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===bt.requestAnimationFrame||((vt=bt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==gt&&(t.__=t.__V),t.i=void 0,t.__V=gt}))),dt=pt=null},bt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(jt),t.__h=t.__h.filter((function(t){return!t.__||qt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],bt.__e(r,t.__v)}})),xt&&xt(t,n)},bt.unmount=function(t){Ct&&Ct(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{jt(t)}catch(t){n=t}})),e.__H=void 0,n&&bt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function jt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function qt(t){var n=pt;t.__c=t.__(),pt=n}function Bt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function zt(t,n){return"function"==typeof n?n(t):n}function Gt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Jt,Kt;function Qt(t){if(Kt)Kt();Kt=t&&t.S()}function Xt({data:t}){const n=Zt(t);n.value=t;const e=Mt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Xt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Xt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});Gt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});Gt("__r",(t,n)=>{Qt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Jt=_;Qt(e);t(n)});Gt("__e",(t,n,e,_)=>{Qt();Jt=void 0;t(n,e,_)});Gt("diffed",(t,n)=>{Qt();Jt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Yt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Yt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}Gt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});Gt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Zt(t){return Mt(()=>c(t),[])}function tn(t){const n=Dt(t);n.current=t;Jt.__$f|=4;return Mt(()=>v(()=>n.current()),[])}function nn(t){const n=Dt(t);n.current=t;Nt(()=>k(()=>n.current()),[])}var en=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var rn=on.bind(O);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,O as createElement,I as createRef,k as effect,O as h,rn as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Ft as useCallback,tn as useComputed,At as useContext,Vt as useDebugValue,Nt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,$t as useLayoutEffect,Mt as useMemo,Pt as useReducer,Dt as useRef,Zt as useSignal,nn as useSignalEffect,Ht as useState}; From 9b596417af11c9ac44fcae0fcfbc6f3665089083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 1 Jun 2024 08:44:14 +0200 Subject: [PATCH 2/6] CUDA: quantized KV support for FA vec (#7527) * CUDA: quantized KV support for FA vec * try CI fix * fix commented-out kernel variants * add q8_0 q4_0 tests * fix nwarps > batch size * split fattn compile via extern templates * fix flake8 * fix metal tests * fix cmake * make generate_cu_files.py executable * add autogenerated .cu files * fix AMD * error if type_v != FP16 and not flash_attn * remove obsolete code --- CMakeLists.txt | 30 + Makefile | 21 +- README.md | 1 + ggml-cuda.cu | 8 +- ggml-cuda/fattn-common.cuh | 539 ++++++++++- ggml-cuda/fattn-tile-f16.cu | 3 + ggml-cuda/fattn-tile-f32.cu | 3 + ggml-cuda/fattn-vec-f16.cu | 330 ------- ggml-cuda/fattn-vec-f16.cuh | 394 +++++++- ggml-cuda/fattn-vec-f32.cu | 279 ------ ggml-cuda/fattn-vec-f32.cuh | 375 +++++++- ggml-cuda/fattn-wmma-f16.cuh | 490 ++++++++++ ggml-cuda/fattn.cu | 849 ++++++------------ ggml-cuda/mmq.cu | 4 +- .../fattn-vec-f16-instance-hs128-f16-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs256-f16-f16.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-f16.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs256-f16-f16.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-f16.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q8_0.cu | 5 + .../fattn-wmma-f16-instance-kqfloat-cpb16.cu | 10 + .../fattn-wmma-f16-instance-kqfloat-cpb32.cu | 9 + .../fattn-wmma-f16-instance-kqhalf-cpb16.cu | 10 + .../fattn-wmma-f16-instance-kqhalf-cpb32.cu | 10 + .../fattn-wmma-f16-instance-kqhalf-cpb8.cu | 8 + .../template-instances/generate_cu_files.py | 59 ++ ggml-cuda/vecdotq.cuh | 8 +- ggml-metal.m | 6 + llama.cpp | 5 + tests/test-backend-ops.cpp | 16 +- 110 files changed, 2697 insertions(+), 1200 deletions(-) delete mode 100644 ggml-cuda/fattn-vec-f16.cu delete mode 100644 ggml-cuda/fattn-vec-f32.cu create mode 100644 ggml-cuda/fattn-wmma-f16.cuh create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu create mode 100644 ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu create mode 100644 ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu create mode 100644 ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu create mode 100644 ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu create mode 100644 ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu create mode 100644 ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu create mode 100755 ggml-cuda/template-instances/generate_cu_files.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 60cf7bdc4..52b392a13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF) +option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF) option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) @@ -402,6 +403,8 @@ if (LLAMA_CUDA) file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") + file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) add_compile_definitions(GGML_USE_CUDA) add_compile_definitions(GGML_CUDA_USE_GRAPHS) @@ -427,6 +430,18 @@ if (LLAMA_CUDA) if (LLAMA_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() + if (LLAMA_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + endif() if (LLAMA_STATIC) if (WIN32) @@ -571,6 +586,8 @@ if (LLAMA_HIPBLAS) file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") + file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) @@ -590,6 +607,19 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() + if (LLAMA_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + endif() + add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/Makefile b/Makefile index aced0e370..dfb3bb2cd 100644 --- a/Makefile +++ b/Makefile @@ -421,6 +421,15 @@ ifdef LLAMA_CUBLAS LLAMA_CUDA := 1 endif +OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu)) +ifdef LLAMA_CUDA_FA_ALL_QUANTS + OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu)) +else + OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) + OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) + OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) +endif # LLAMA_CUDA_FA_ALL_QUANTS + ifdef LLAMA_CUDA ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /opt/cuda @@ -431,6 +440,7 @@ ifdef LLAMA_CUDA MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib OBJS += ggml-cuda.o OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + OBJS += $(OBJS_CUDA_TEMP_INST) MK_NVCCFLAGS += -use_fast_math ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings @@ -493,7 +503,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY endif # LLAMA_CUDA_NO_PEER_COPY ifdef LLAMA_CUDA_CCBIN MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) -endif +endif # LLAMA_CUDA_CCBIN +ifdef LLAMA_CUDA_FA_ALL_QUANTS + MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS +endif # LLAMA_CUDA_FA_ALL_QUANTS ifdef JETSON_EOL_MODULE_DETECT define NVCC_COMPILE @@ -505,7 +518,7 @@ define NVCC_COMPILE endef # NVCC_COMPILE endif # JETSON_EOL_MODULE_DETECT -ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh +ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh $(NVCC_COMPILE) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) @@ -585,11 +598,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY endif # LLAMA_CUDA_NO_PEER_COPY OBJS += ggml-cuda.o OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + OBJS += $(OBJS_CUDA_TEMP_INST) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< -ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh +ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< endif # LLAMA_HIPBLAS @@ -749,6 +763,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) clean: rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf ggml-cuda/*.o + rm -vrf ggml-cuda/template-instances/*.o find examples pocs -type f -name "*.o" -delete # diff --git a/README.md b/README.md index e8bf1e246..4791f84af 100644 --- a/README.md +++ b/README.md @@ -501,6 +501,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | + | LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | - #### hipBLAS diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1172f7b2f..daaa0cd6a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2905,10 +2905,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128; #else - if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) { + if (op->src[0]->ne[0] == 128) { return true; } - return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA; + if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) { + return true; + } + return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA && + op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) default: return false; diff --git a/ggml-cuda/fattn-common.cuh b/ggml-cuda/fattn-common.cuh index 1dd519bde..4bf03a49f 100644 --- a/ggml-cuda/fattn-common.cuh +++ b/ggml-cuda/fattn-common.cuh @@ -1,4 +1,7 @@ +#pragma once + #include "common.cuh" +#include "vecdotq.cuh" #include @@ -34,11 +37,523 @@ typedef void (* fattn_kernel_t)( const int nb11, const int nb12, const int nb13, + const int nb21, + const int nb22, + const int nb23, const int ne0, const int ne1, const int ne2, const int ne3); +typedef half (*vec_dot_KQ_f16_t)( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds); +typedef float (*vec_dot_KQ_f32_t)( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds); + +template +static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { +#if __CUDA_ARCH__ > MIN_CC_DP4A + + const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c; + GGML_UNUSED(Q_v); + + half sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const int ib = k_KQ / QI8_1; + const int iqs4 = k_KQ % QI4_0; + const int shift = k_KQ & (QI8_1/2); + + const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F; + const int u = Q_q8[k_KQ_0/WARP_SIZE]; + + const int sumi = __dp4a(v, u, 0); + +#if FP16_AVAILABLE + if (std::is_same::value) { + const half2 * Q_ds = (const half2 *) Q_ds_v; + + const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE]; + sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */); + } else +#endif // FP16_AVAILABLE + { + const float2 * Q_ds = (const float2 *) Q_ds_v; + + sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (8/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y)); + } + } + + return sum; +#else + GGML_UNUSED(K_c); + GGML_UNUSED(Q_v); + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ > MIN_CC_DP4A +} + +template +static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { +#if __CUDA_ARCH__ > MIN_CC_DP4A + + const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c; + GGML_UNUSED(Q_v); + + T sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const int ib = k_KQ / QI8_1; + const int iqs4 = k_KQ % QI4_1; + const int shift = k_KQ & (QI8_1/2); + + const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F; + const int u = Q_q8[k_KQ_0/WARP_SIZE]; + + const int sumi = __dp4a(v, u, 0); + +#if FP16_AVAILABLE + if (std::is_same::value) { + const half2 * Q_ds = (const half2 *) Q_ds_v; + + const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE]; + const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1); + sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled)); + } else +#endif // FP16_AVAILABLE + { + const float2 * Q_ds = (const float2 *) Q_ds_v; + + const float sumid4d8 = __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi; + const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1; + + sum += (T) (sumid4d8 + m4s8scaled); + } + } + + return sum; +#else + GGML_UNUSED(K_c); + GGML_UNUSED(Q_v); + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ > MIN_CC_DP4A +} + +template +static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { +#if __CUDA_ARCH__ > MIN_CC_DP4A + + const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c; + GGML_UNUSED(Q_v); + + T sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const int ib = k_KQ / QI8_1; + const int iqs4 = k_KQ % QI5_0; + const int iqs8 = k_KQ % QI8_1; + const int shift = k_KQ & (QI8_1/2); + + int v = (get_int_from_uint8(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F; + const int vh = get_int_from_uint8(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0); + v |= (vh << 4) & 0x00000010; // 0 -> 4 + v |= (vh << 11) & 0x00001000; // 1 -> 12 + v |= (vh << 18) & 0x00100000; // 2 -> 20 + v |= (vh << 25) & 0x10000000; // 3 -> 28 + + const int u = Q_q8[k_KQ_0/WARP_SIZE]; + + const int sumi = __dp4a(v, u, 0); + +#if FP16_AVAILABLE + if (std::is_same::value) { + const half2 * Q_ds = (const half2 *) Q_ds_v; + + const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE]; + sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */; + } else +#endif // FP16_AVAILABLE + { + const float2 * Q_ds = (const float2 *) Q_ds_v; + + sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (16/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y)); + } + } + + return sum; +#else + GGML_UNUSED(K_c); + GGML_UNUSED(Q_v); + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ > MIN_CC_DP4A +} + +template +static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { +#if __CUDA_ARCH__ > MIN_CC_DP4A + + const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c; + GGML_UNUSED(Q_v); + + T sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const int ib = k_KQ / QI8_1; + const int iqs4 = k_KQ % QI5_1; + const int iqs8 = k_KQ % QI8_1; + const int shift = k_KQ & (QI8_1/2); + + int v = (get_int_from_uint8(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F; + const int vh = get_int_from_uint8(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1); + v |= (vh << 4) & 0x00000010; // 0 -> 4 + v |= (vh << 11) & 0x00001000; // 1 -> 12 + v |= (vh << 18) & 0x00100000; // 2 -> 20 + v |= (vh << 25) & 0x10000000; // 3 -> 28 + + const int u = Q_q8[k_KQ_0/WARP_SIZE]; + + const int sumi = __dp4a(v, u, 0); + +#if FP16_AVAILABLE + if (std::is_same::value) { + const half2 * Q_ds = (const half2 *) Q_ds_v; + + const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE]; + const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1); + sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled)); + } else +#endif // FP16_AVAILABLE + { + const float2 * Q_ds = (const float2 *) Q_ds_v; + + const float sumid5d8 = __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi; + const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1; + + sum += (T) (sumid5d8 + m5s8scaled); + } + } + + return sum; +#else + GGML_UNUSED(K_c); + GGML_UNUSED(Q_v); + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ > MIN_CC_DP4A +} + +template +static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { +#if __CUDA_ARCH__ > MIN_CC_DP4A + + const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c; + GGML_UNUSED(Q_v); + + T sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const int ib = k_KQ / QI8_0; + const int iqs = k_KQ % QI8_0; + + const int v = get_int_from_int8(K_q8_0[ib].qs, iqs); + + T Q_d; + if (std::is_same::value) { + const half2 * Q_ds = (const half2 *) Q_ds_v; + Q_d = __low2half(Q_ds[k_KQ_0/WARP_SIZE]); + } else { + const float2 * Q_ds = (const float2 *) Q_ds_v; + Q_d = Q_ds[k_KQ_0/WARP_SIZE].x; + } + + sum += vec_dot_q8_0_q8_1_impl(&v, &Q_q8[k_KQ_0/WARP_SIZE], K_q8_0[ib].d, Q_d); + } + + return sum; +#else + GGML_UNUSED(K_c); + GGML_UNUSED(Q_v); + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ > MIN_CC_DP4A +} + +template +static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) { + + const half2 * K_h2 = (const half2 *) K_c; + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + +#if FP16_AVAILABLE + if (std::is_same::value) { + const half2 * Q_h2 = (const half2 *) Q_v; + + half2 sum2 = make_half2(0.0f, 0.0f); + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const half2 K_ik = K_h2[k_KQ]; + sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE]; + } + + return __low2half(sum2) + __high2half(sum2); + } +#endif // FP16_AVAILABLE + + const float2 * Q_f2 = (const float2 *) Q_v; + + float sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const half2 K_ik = K_h2[k_KQ]; + sum += __low2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].x; + sum += __high2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].y; + } + + return sum; +} + +template +static __device__ __forceinline__ void quantize_q8_1_to_shared( + const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) { + + float vals[sizeof(int)] = {0.0f}; +#pragma unroll + for (int l = 0; l < sizeof(int); ++l) { + vals[l] = scale * x[4*threadIdx.x + l]; + } + + float amax = fabsf(vals[0]); + float sum = vals[0]; +#pragma unroll + for (int l = 1; l < sizeof(int); ++l) { + amax = fmaxf(amax, fabsf(vals[l])); + sum += vals[l]; + } +#pragma unroll + for (int mask = QI8_1/2; mask > 0; mask >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, 32)); + sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, 32); + } + + const float d = amax / 127; + int q32 = 0; + int8_t * q8 = (int8_t *) &q32; + + if (d != 0.0f) { +#pragma unroll + for (int l = 0; l < sizeof(int); ++l) { + q8[l] = roundf(vals[l] / d); + } + } + + yq32[threadIdx.x] = q32; + if (threadIdx.x % QI8_1 == 0) { + if (std::is_same::value) { + ((half2 *) yds)[threadIdx.x/QI8_1] = make_half2(d, sum); + } else { + ((float2 *) yds)[threadIdx.x/QI8_1] = make_float2(d, sum); + } + } +} + +typedef half (*dequantize_1_f16_t)(const void *, const int64_t); +typedef float (*dequantize_1_f32_t)(const void *, const int64_t); + +template +static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__ vx, const int64_t i) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const int64_t ib = i / QK4_0; + const int iqs = i % (QK4_0/2); + const int shift = (i % QK4_0) / (QK4_0/2); + + const T d = x[ib].d; + const int q0 = x[ib].qs[iqs]; + const int q = ((q0 >> (4*shift)) & 0x0F) - 8; + +#if FP16_AVAILABLE + if (std::is_same::value) { + return ((half) d)*((half) q); + } +#endif // FP16_AVAILABLE + + return ((float) d)*((float) q); +} + +template +static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__ vx, const int64_t i) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const int64_t ib = i / QK4_1; + const int iqs = i % (QK4_1/2); + const int shift = (i % QK4_1) / (QK4_1/2); + + const half2 dm = x[ib].dm; + const int q0 = x[ib].qs[iqs]; + const int q = ((q0 >> (4*shift)) & 0x0F); + +#if FP16_AVAILABLE + if (std::is_same::value) { + return __low2half(dm)*((half) q) + __high2half(dm); + } +#endif // FP16_AVAILABLE + + return __low2float(dm)*((float) q) + __high2float(dm); +} + +template +static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__ vx, const int64_t i) { + const block_q5_0 * x = (const block_q5_0 *) vx; + + const int64_t ib = i / QK5_0; + const int idq = i % QK5_0; + const int iqs = i % (QK5_0/2); + const int shift = (i % QK5_0) / (QK5_0/2); + + const T d = x[ib].d; + const int ql0 = x[ib].qs[iqs]; + const int qh0 = get_int_from_uint8(x[ib].qh, 0); + const int ql = ((ql0 >> (4*shift)) & 0x0F); + const int qh = ((qh0 >> idq) << 4) & 0x10; + const int q = (ql | qh) - 16; + +#if FP16_AVAILABLE + if (std::is_same::value) { + return ((half) d)*((half) q); + } +#endif // FP16_AVAILABLE + + return ((float) d)*((float) q); +} + +template +static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__ vx, const int64_t i) { + const block_q5_1 * x = (const block_q5_1 *) vx; + + const int64_t ib = i / QK5_1; + const int idq = i % QK5_1; + const int iqs = i % (QK5_1/2); + const int shift = (i % QK5_1) / (QK5_1/2); + + const half2 dm = x[ib].dm; + const int ql0 = x[ib].qs[iqs]; + const int qh0 = get_int_from_uint8_aligned(x[ib].qh, 0); + const int ql = ((ql0 >> (4*shift)) & 0x0F); + const int qh = ((qh0 >> idq) << 4) & 0x10; + const int q = (ql | qh); + +#if FP16_AVAILABLE + if (std::is_same::value) { + return __low2half(dm)*((half) q) + __high2half(dm); + } +#endif // FP16_AVAILABLE + + return __low2float(dm)*((float) q) + __high2float(dm); +} + +template +static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__ vx, const int64_t i) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const int64_t ib = i / QK8_0; + const int iqs = i % QK8_0; + + const T d = x[ib].d; + const int q = x[ib].qs[iqs]; + +#if FP16_AVAILABLE + if (std::is_same::value) { + return ((half) d)*((half) q); + } +#endif // FP16_AVAILABLE + + return ((float) d)*((float) q); +} + +template +static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ vx, const int64_t i) { + const half * x = (const half *) vx; + + return x[i]; +} + +template +constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) { + return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0 : + type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1 : + type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0 : + type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1 : + type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0 : + type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16 : + nullptr; +} + +template +constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) { + return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0 : + type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1 : + type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0 : + type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1 : + type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0 : + type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16 : + nullptr; +} + +constexpr __device__ dequantize_1_f16_t get_dequantize_1_f16(ggml_type type_V) { + return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0 : + type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1 : + type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0 : + type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1 : + type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0 : + type_V == GGML_TYPE_F16 ? dequantize_1_f16 : + nullptr; +} + +constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) { + return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0 : + type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1 : + type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0 : + type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1 : + type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0 : + type_V == GGML_TYPE_F16 ? dequantize_1_f16 : + nullptr; +} + template // D == head size #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(D, 1) @@ -83,6 +598,27 @@ static __global__ void flash_attn_combine_results( dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator; } +static void on_no_fattn_vec_case(const int D) { + if (D == 64) { + fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); + fprintf(stderr, "By default only f16 KV cache is supported.\n"); + fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); + GGML_ASSERT(false); + } else if (D == 128) { + fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); + fprintf(stderr, "Supported combinations:\n"); + fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n"); + fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); + fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); + fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); + GGML_ASSERT(false); + } else { + fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); + fprintf(stderr, "Only f16 is supported.\n"); + GGML_ASSERT(false); + } +} + template void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, int nwarps, int cols_per_block) { const ggml_tensor * Q = dst->src[0]; @@ -94,8 +630,6 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern ggml_tensor * KQV = dst; GGML_ASSERT(Q->type == GGML_TYPE_F32); - GGML_ASSERT(K->type == GGML_TYPE_F16); - GGML_ASSERT(V->type == GGML_TYPE_F16); GGML_ASSERT(KQV->type == GGML_TYPE_F32); GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16); @@ -143,6 +677,7 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, Q->nb[1], Q->nb[2], Q->nb[3], K->nb[1], K->nb[2], K->nb[3], + V->nb[1], V->nb[2], V->nb[3], KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] ); CUDA_CHECK(cudaGetLastError()); diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml-cuda/fattn-tile-f16.cu index cdb5eaff7..3d64a9eba 100644 --- a/ggml-cuda/fattn-tile-f16.cu +++ b/ggml-cuda/fattn-tile-f16.cu @@ -36,6 +36,9 @@ static __global__ void flash_attn_tile_ext_f16( const int nb11, const int nb12, const int nb13, + const int nb21, + const int nb22, + const int nb23, const int ne0, const int ne1, const int ne2, diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu index 5a3de2918..61fce0a7e 100644 --- a/ggml-cuda/fattn-tile-f32.cu +++ b/ggml-cuda/fattn-tile-f32.cu @@ -36,6 +36,9 @@ static __global__ void flash_attn_tile_ext_f32( const int nb11, const int nb12, const int nb13, + const int nb21, + const int nb22, + const int nb23, const int ne0, const int ne1, const int ne2, diff --git a/ggml-cuda/fattn-vec-f16.cu b/ggml-cuda/fattn-vec-f16.cu deleted file mode 100644 index 808e8f362..000000000 --- a/ggml-cuda/fattn-vec-f16.cu +++ /dev/null @@ -1,330 +0,0 @@ -#include "common.cuh" -#include "fattn-common.cuh" -#include "fattn-vec-f16.cuh" - -template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -static __global__ void flash_attn_vec_ext_f16( - const char * __restrict__ Q, - const char * __restrict__ K, - const char * __restrict__ V, - const char * __restrict__ mask, - float * __restrict__ dst, - float2 * __restrict__ dst_meta, - const float scale, - const float max_bias, - const float m0, - const float m1, - const uint32_t n_head_log2, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int nb31, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { -#if FP16_AVAILABLE - //In this kernel Q, K, V are matrices while i, j, k are matrix indices. - - const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. - const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. - - const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; - - const int stride_KV = nb11 / sizeof(half); - const int stride_KV2 = nb11 / sizeof(half2); - - const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); - const half slopeh = __float2half(slopef); - - static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); - constexpr int nwarps = D / WARP_SIZE; - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; - __builtin_assume(tid < D); - - __shared__ half KQ[ncols*D]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - KQ[j*D + tid] = -HALF_MAX_HALF; - } - half2 * KQ2 = (half2 *) KQ; - - half kqmax[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax[j] = -HALF_MAX_HALF; - } - half kqsum[ncols] = {0.0f}; - - __shared__ half kqmax_shared[ncols][WARP_SIZE]; - __shared__ half kqsum_shared[ncols][WARP_SIZE]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - if (threadIdx.y == 0) { - kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF; - kqsum_shared[j][threadIdx.x] = 0.0f; - } - } - __syncthreads(); - - // Convert Q to half2 and store in registers: - half2 Q_h2[ncols][D/(2*WARP_SIZE)]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - - const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f); - Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); - } - } - - half2 VKQ[ncols] = {{0.0f, 0.0f}}; - - const int k_start = parallel_blocks == 1 ? 0 : ip*D; - for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { - // Calculate KQ tile and keep track of new maximum KQ values: - - // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression, - // see https://github.com/ggerganov/llama.cpp/pull/7061 . - // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable). - half kqmax_new = kqmax[0]; - half kqmax_new_arr[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax_new_arr[j] = kqmax[j]; - } - -#pragma unroll - for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { - const int i_KQ = i_KQ_0 + threadIdx.y; - - if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { - break; - } - - half2 sum2[ncols] = {{0.0f, 0.0f}}; -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { - const int k_KQ = k_KQ_0 + threadIdx.x; - - const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE]; - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum2[j] = warp_reduce_sum(sum2[j]); - half sum = __low2half(sum2[j]) + __high2half(sum2[j]); - sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); - - if (ncols == 1) { - kqmax_new = ggml_cuda_hmax(kqmax_new, sum); - } else { - kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum); - } - - if (threadIdx.x == 0) { - KQ[j*D + i_KQ] = sum; - } - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j]; - - kqmax_new_j = warp_reduce_max(kqmax_new_j); - if (threadIdx.x == 0) { - kqmax_shared[j][threadIdx.y] = kqmax_new_j; - } - } - - __syncthreads(); - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - half kqmax_new_j = kqmax_shared[j][threadIdx.x]; - kqmax_new_j = warp_reduce_max(kqmax_new_j); - - const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j); - kqmax[j] = kqmax_new_j; - - const half val = hexp(KQ[j*D + tid] - kqmax[j]); - kqsum[j] = kqsum[j]*KQ_max_scale + val; - KQ[j*D + tid] = val; - - VKQ[j] *= __half2half2(KQ_max_scale); - } - - __syncthreads(); - -#pragma unroll - for (int k0 = 0; k0 < D; k0 += 2) { - if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) { - break; - } - - half2 V_k; - reinterpret_cast(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid]; - reinterpret_cast(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - VKQ[j] += V_k*KQ2[j*(D/2) + k0/2]; - } - } - - __syncthreads(); - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqsum[j] = warp_reduce_sum(kqsum[j]); - if (threadIdx.x == 0) { - kqsum_shared[j][threadIdx.y] = kqsum[j]; - } - } - - __syncthreads(); - -#pragma unroll - for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { - if (ncols > 2 && ic0 + j_VKQ >= ne01) { - break; - } - - kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; - kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); - - half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); - if (parallel_blocks == 1) { - dst_val /= kqsum[j_VKQ]; - } - const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; - dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; - } - - if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { - dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); - } -#else - NO_DEVICE_CODE; -#endif // FP16_AVAILABLE -} - -void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * KQV = dst; - ggml_tensor * Q = dst->src[0]; - - const int32_t precision = KQV->op_params[2]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); - - constexpr int cols_per_block = 1; - constexpr int parallel_blocks = 4; - switch (Q->ne[0]) { - case 64: { - constexpr int D = 64; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 128: { - constexpr int D = 128; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 256: { - constexpr int D = 256; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - default: - GGML_ASSERT(false); - break; - } -} - -template -void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - switch (Q->ne[0]) { - case 64: { - constexpr int D = 64; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 128: { - constexpr int D = 128; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); - } break; - } -} - -void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * KQV = dst; - const ggml_tensor * Q = dst->src[0]; - - const int32_t precision = KQV->op_params[2]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); - - if (Q->ne[1] == 1) { - ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); - return; - } - - if (Q->ne[1] == 2) { - constexpr int cols_per_block = 2; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f16_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 4) { - constexpr int cols_per_block = 4; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f16_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 8) { - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f16_64_128(ctx, dst); - return; - } - - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 1; - launch_fattn_vec_f16_64_128(ctx, dst); -} diff --git a/ggml-cuda/fattn-vec-f16.cuh b/ggml-cuda/fattn-vec-f16.cuh index c7023610a..ea4fcc972 100644 --- a/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml-cuda/fattn-vec-f16.cuh @@ -1,5 +1,395 @@ #include "common.cuh" +#include "fattn-common.cuh" -void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +template // D == head size +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(D, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_vec_ext_f16( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const float max_bias, + const float m0, + const float m1, + const uint32_t n_head_log2, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int nb21, + const int nb22, + const int nb23, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { +#if FP16_AVAILABLE + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. -void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16(type_K); + constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16; + constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V); + + const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + Q += nb02* blockIdx.y + nb01*ic0; + K += nb12*(blockIdx.y / gqa_ratio); + V += nb22*(blockIdx.y / gqa_ratio); + + const half * maskh = (const half *) mask + ne11*ic0; + + const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); + const half slopeh = __float2half(slopef); + + static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); + constexpr int nwarps = D / WARP_SIZE; + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + __builtin_assume(tid < D); + + __shared__ half KQ[ncols*D]; + half2 * KQ2 = (half2 *) KQ; + + half kqmax[ncols]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqmax[j] = -HALF_MAX_HALF; + } + half kqsum[ncols] = {0.0f}; + + __shared__ half kqmax_shared[ncols][WARP_SIZE]; + __shared__ half kqsum_shared[ncols][WARP_SIZE]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + if (threadIdx.y == 0) { + kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF; + kqsum_shared[j][threadIdx.x] = 0.0f; + } + } + __syncthreads(); + + // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers: + half2 Q_h2[ncols][D/(2*WARP_SIZE)]; + int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)]; + half2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1]; + if (Q_q8_1) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j0 + nwarps > ncols && j >= ncols) { + break; + } + + // Reuse KQ as temporary storage for converting Q to q8_1: + int * tmp_q_i32 = (int *) &KQ[j*D]; + half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int)); + + // Set memory to zero if out of bounds: + if (ncols > 2 && ic0 + j >= ne01) { +#pragma unroll + for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + tmp_q_i32[i] = 0; + } + if (threadIdx.x < D/QK8_1) { + tmp_q_ds[threadIdx.x] = make_half2(0.0f, 0.0f); + } + continue; + } + + const float * Q_f = (const float *) (Q + j*nb01); +#pragma unroll + for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) { + quantize_q8_1_to_shared(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds); + } + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + int * tmp_q_i32 = (int *) &KQ[j*D]; + half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int)); + +#pragma unroll + for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i]; + Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1]; + } + } + + __syncthreads(); + } else { +#pragma unroll + for (int j = 0; j < ncols; ++j) { + const float2 * Q_f2_j = (const float2 *) (Q + j*nb01); + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f); + Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); + } + } + } + + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + KQ[j*D + tid] = -HALF_MAX_HALF; + } + + half2 VKQ[ncols] = {{0.0f, 0.0f}}; + + const int k_start = parallel_blocks == 1 ? 0 : ip*D; + for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { + // Calculate KQ tile and keep track of new maximum KQ values: + + // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression, + // see https://github.com/ggerganov/llama.cpp/pull/7061 . + // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable). + half kqmax_new = kqmax[0]; + half kqmax_new_arr[ncols]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqmax_new_arr[j] = kqmax[j]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { + const int i_KQ = i_KQ_0 + threadIdx.y; + + if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { + break; + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); + sum = warp_reduce_sum(sum); + sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); + + if (ncols == 1) { + kqmax_new = ggml_cuda_hmax(kqmax_new, sum); + } else { + kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum); + } + + if (threadIdx.x == 0) { + KQ[j*D + i_KQ] = sum; + } + } + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j]; + + kqmax_new_j = warp_reduce_max(kqmax_new_j); + if (threadIdx.x == 0) { + kqmax_shared[j][threadIdx.y] = kqmax_new_j; + } + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + half kqmax_new_j = kqmax_shared[j][threadIdx.x]; + kqmax_new_j = warp_reduce_max(kqmax_new_j); + + const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j); + kqmax[j] = kqmax_new_j; + + const half val = hexp(KQ[j*D + tid] - kqmax[j]); + kqsum[j] = kqsum[j]*KQ_max_scale + val; + KQ[j*D + tid] = val; + + VKQ[j] *= __half2half2(KQ_max_scale); + } + + __syncthreads(); + +#pragma unroll + for (int k0 = 0; k0 < D; k0 += 2) { + if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) { + break; + } + + half2 V_k; + reinterpret_cast(V_k.x) = dequantize_1_v(V + (k_VKQ_0 + k0 + 0)*nb21, tid); + reinterpret_cast(V_k.y) = dequantize_1_v(V + (k_VKQ_0 + k0 + 1)*nb21, tid); +#pragma unroll + for (int j = 0; j < ncols; ++j) { + VKQ[j] += V_k*KQ2[j*(D/2) + k0/2]; + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqsum[j] = warp_reduce_sum(kqsum[j]); + if (threadIdx.x == 0) { + kqsum_shared[j][threadIdx.y] = kqsum[j]; + } + } + + __syncthreads(); + +#pragma unroll + for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { + if (ncols > 2 && ic0 + j_VKQ >= ne01) { + break; + } + + kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; + kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); + + half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); + if (parallel_blocks == 1) { + dst_val /= kqsum[j_VKQ]; + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; + } + + if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { + dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); + } +#else + NO_DEVICE_CODE; +#endif // FP16_AVAILABLE +} + +template +void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + constexpr int nwarps = D/WARP_SIZE; + fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); +} + +template +void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_tensor * KQV = dst; + ggml_tensor * Q = dst->src[0]; + ggml_tensor * K = dst->src[1]; + ggml_tensor * V = dst->src[2]; + + const int32_t precision = KQV->op_params[2]; + GGML_ASSERT(precision == GGML_PREC_DEFAULT); + + GGML_ASSERT(K->type == type_K); + GGML_ASSERT(V->type == type_V); + + if (Q->ne[1] == 1) { + constexpr int cols_per_block = 1; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f16_case_impl(ctx, dst); + return; + } + + if (Q->ne[1] == 2) { + constexpr int cols_per_block = 2; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f16_case_impl(ctx, dst); + return; + } + + if (Q->ne[1] <= 4) { + constexpr int cols_per_block = 4; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f16_case_impl(ctx, dst); + return; + } + + if (Q->ne[1] <= 8) { + constexpr int cols_per_block = 8; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f16_case_impl(ctx, dst); + return; + } + + constexpr int cols_per_block = 8; + constexpr int parallel_blocks = 1; + ggml_cuda_flash_attn_ext_vec_f16_case_impl(ctx, dst); +} + +#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \ + template void ggml_cuda_flash_attn_ext_vec_f16_case \ + (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ + +extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16); + +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); + +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); + +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); + +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); + +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); + +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); + +extern DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/fattn-vec-f32.cu b/ggml-cuda/fattn-vec-f32.cu deleted file mode 100644 index b4652301b..000000000 --- a/ggml-cuda/fattn-vec-f32.cu +++ /dev/null @@ -1,279 +0,0 @@ -#include "common.cuh" -#include "fattn-common.cuh" -#include "fattn-vec-f32.cuh" - -template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -static __global__ void flash_attn_vec_ext_f32( - const char * __restrict__ Q, - const char * __restrict__ K, - const char * __restrict__ V, - const char * __restrict__ mask, - float * __restrict__ dst, - float2 * __restrict__ dst_meta, - const float scale, - const float max_bias, - const float m0, - const float m1, - const uint32_t n_head_log2, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int nb31, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { - //In this kernel Q, K, V are matrices while i, j, k are matrix indices. - - const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. - const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. - - const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; - - const int stride_KV = nb11 / sizeof(half); - const int stride_KV2 = nb11 / sizeof(half2); - - const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); - - static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); - constexpr int nwarps = D / WARP_SIZE; - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; - __builtin_assume(tid < D); - - __shared__ float KQ[ncols*D]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - KQ[j*D + tid] = -FLT_MAX/2.0f; - } - - float kqmax[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax[j] = -FLT_MAX/2.0f; - } - float kqsum[ncols] = {0.0f}; - - __shared__ float kqmax_shared[ncols][WARP_SIZE]; - __shared__ float kqsum_shared[ncols][WARP_SIZE]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - if (threadIdx.y == 0) { - kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f; - kqsum_shared[j][threadIdx.x] = 0.0f; - } - } - __syncthreads(); - - // Convert Q to half2 and store in registers: - float2 Q_h2[ncols][D/(2*WARP_SIZE)]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - - Q_h2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f); - Q_h2[j][i0/WARP_SIZE].x *= scale; - Q_h2[j][i0/WARP_SIZE].y *= scale; - } - } - - float VKQ[ncols] = {0.0f}; - - const int k_start = parallel_blocks == 1 ? 0 : ip*D; - for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { - // Calculate KQ tile and keep track of new maximum KQ values: - - float kqmax_new_arr[ncols]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqmax_new_arr[j] = kqmax[j]; - } - -#pragma unroll - for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { - const int i_KQ = i_KQ_0 + threadIdx.y; - - if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { - break; - } - - float sum[ncols] = {0.0f}; -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { - const int k_KQ = k_KQ_0 + threadIdx.x; - - const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x; - sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y; - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - sum[j] = warp_reduce_sum(sum[j]); - sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; - - kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]); - - if (threadIdx.x == 0) { - KQ[j*D + i_KQ] = sum[j]; - } - } - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - float kqmax_new_j = kqmax_new_arr[j]; - - kqmax_new_j = warp_reduce_max(kqmax_new_j); - if (threadIdx.x == 0) { - kqmax_shared[j][threadIdx.y] = kqmax_new_j; - } - } - - __syncthreads(); - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - float kqmax_new_j = kqmax_shared[j][threadIdx.x]; - kqmax_new_j = warp_reduce_max(kqmax_new_j); - - const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j); - kqmax[j] = kqmax_new_j; - - const float val = expf(KQ[j*D + tid] - kqmax[j]); - kqsum[j] = kqsum[j]*KQ_max_scale + val; - KQ[j*D + tid] = val; - - VKQ[j] *= KQ_max_scale; - } - - __syncthreads(); - -#pragma unroll - for (int k = 0; k < D; ++k) { - if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) { - break; - } - - const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]); -#pragma unroll - for (int j = 0; j < ncols; ++j) { - VKQ[j] += V_ki*KQ[j*D + k]; - } - } - - __syncthreads(); - } - -#pragma unroll - for (int j = 0; j < ncols; ++j) { - kqsum[j] = warp_reduce_sum(kqsum[j]); - if (threadIdx.x == 0) { - kqsum_shared[j][threadIdx.y] = kqsum[j]; - } - } - - __syncthreads(); - -#pragma unroll - for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { - if (ncols > 2 && ic0 + j_VKQ >= ne01) { - break; - } - - kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; - kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); - - float dst_val = VKQ[j_VKQ]; - if (parallel_blocks == 1) { - dst_val /= kqsum[j_VKQ]; - } - const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; - dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; - } - - if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { - dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); - } -} - -template -void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - switch (Q->ne[0]) { - case 64: { - constexpr int D = 64; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - case 128: { - constexpr int D = 128; - constexpr int nwarps = D/WARP_SIZE; - fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - } break; - default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); - } break; - } -} - -void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - - if (Q->ne[1] == 1) { - constexpr int cols_per_block = 1; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - if (Q->ne[1] == 2) { - constexpr int cols_per_block = 2; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 4) { - constexpr int cols_per_block = 4; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - if (Q->ne[1] <= 8) { - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 4; - launch_fattn_vec_f32_64_128(ctx, dst); - return; - } - - constexpr int cols_per_block = 8; - constexpr int parallel_blocks = 1; - launch_fattn_vec_f32_64_128(ctx, dst); -} diff --git a/ggml-cuda/fattn-vec-f32.cuh b/ggml-cuda/fattn-vec-f32.cuh index 614d54ae3..3009f0f43 100644 --- a/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml-cuda/fattn-vec-f32.cuh @@ -1,3 +1,376 @@ #include "common.cuh" +#include "fattn-common.cuh" -void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +template // D == head size +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(D, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_vec_ext_f32( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const float max_bias, + const float m0, + const float m1, + const uint32_t n_head_log2, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int nb21, + const int nb22, + const int nb23, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + + constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32(type_K); + constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16; + constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V); + + const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + Q += nb02* blockIdx.y + nb01*ic0; + K += nb12*(blockIdx.y / gqa_ratio); + V += nb22*(blockIdx.y / gqa_ratio); // K and V have same shape + const half * maskh = (const half *) mask + ne11*ic0; + + const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); + + static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); + constexpr int nwarps = D / WARP_SIZE; + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + __builtin_assume(tid < D); + + __shared__ float KQ[ncols*D]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + KQ[j*D + tid] = -FLT_MAX/2.0f; + } + + float kqmax[ncols]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqmax[j] = -FLT_MAX/2.0f; + } + float kqsum[ncols] = {0.0f}; + + __shared__ float kqmax_shared[ncols][WARP_SIZE]; + __shared__ float kqsum_shared[ncols][WARP_SIZE]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + if (threadIdx.y == 0) { + kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f; + kqsum_shared[j][threadIdx.x] = 0.0f; + } + } + __syncthreads(); + + // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers: + float2 Q_f2[ncols][D/(2*WARP_SIZE)]; + int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D >= D/(sizeof(int)*QK8_1)]; + float2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1]; + if (Q_q8_1) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j0 + nwarps > ncols && j >= ncols) { + break; + } + + // Reuse KQ as temporary storage for converting Q to q8_1: + int * tmp_q_i32 = (int *) &KQ[j*D]; + float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int)); + + // Set memory to zero if out of bounds: + if (ncols > 2 && ic0 + j >= ne01) { +#pragma unroll + for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + tmp_q_i32[i] = 0; + } + if (threadIdx.x < D/QK8_1) { + tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f); + } + continue; + } + + const float * Q_f = (const float *) (Q + j*nb01); +#pragma unroll + for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) { + quantize_q8_1_to_shared(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds); + } + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + int * tmp_q_i32 = (int *) &KQ[j*D]; + float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int)); + +#pragma unroll + for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i]; + Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1]; + } + } + + __syncthreads(); + } else { +#pragma unroll + for (int j = 0; j < ncols; ++j) { + const float2 * Q_f2_j = (const float2 *) (Q + j*nb01); +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2_j[i] : make_float2(0.0f, 0.0f); + Q_f2[j][i0/WARP_SIZE].x *= scale; + Q_f2[j][i0/WARP_SIZE].y *= scale; + } + } + } + + float VKQ[ncols] = {0.0f}; + + const int k_start = parallel_blocks == 1 ? 0 : ip*D; + for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { + // Calculate KQ tile and keep track of new maximum KQ values: + + float kqmax_new_arr[ncols]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqmax_new_arr[j] = kqmax[j]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { + const int i_KQ = i_KQ_0 + threadIdx.y; + + if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { + break; + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]); + sum = warp_reduce_sum(sum); + sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; + + kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum); + + if (threadIdx.x == 0) { + KQ[j*D + i_KQ] = sum; + } + } + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + float kqmax_new_j = kqmax_new_arr[j]; + + kqmax_new_j = warp_reduce_max(kqmax_new_j); + if (threadIdx.x == 0) { + kqmax_shared[j][threadIdx.y] = kqmax_new_j; + } + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + float kqmax_new_j = kqmax_shared[j][threadIdx.x]; + kqmax_new_j = warp_reduce_max(kqmax_new_j); + + const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j); + kqmax[j] = kqmax_new_j; + + const float val = expf(KQ[j*D + tid] - kqmax[j]); + kqsum[j] = kqsum[j]*KQ_max_scale + val; + KQ[j*D + tid] = val; + + VKQ[j] *= KQ_max_scale; + } + + __syncthreads(); + +#pragma unroll + for (int k = 0; k < D; ++k) { + if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) { + break; + } + + const float V_ki = dequantize_1_v(V + (k_VKQ_0 + k)*nb21, tid); +#pragma unroll + for (int j = 0; j < ncols; ++j) { + VKQ[j] += V_ki*KQ[j*D + k]; + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqsum[j] = warp_reduce_sum(kqsum[j]); + if (threadIdx.x == 0) { + kqsum_shared[j][threadIdx.y] = kqsum[j]; + } + } + + __syncthreads(); + +#pragma unroll + for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { + if (ncols > 2 && ic0 + j_VKQ >= ne01) { + break; + } + + kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; + kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); + + float dst_val = VKQ[j_VKQ]; + if (parallel_blocks == 1) { + dst_val /= kqsum[j_VKQ]; + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; + } + + if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { + dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); + } +} + +template +void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + constexpr int nwarps = D/WARP_SIZE; + fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); +} + +template +void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_tensor * KQV = dst; + ggml_tensor * Q = dst->src[0]; + ggml_tensor * K = dst->src[1]; + ggml_tensor * V = dst->src[2]; + + const int32_t precision = KQV->op_params[2]; + GGML_ASSERT(precision == GGML_PREC_DEFAULT); + + GGML_ASSERT(K->type == type_K); + GGML_ASSERT(V->type == type_V); + + if (Q->ne[1] == 1) { + constexpr int cols_per_block = 1; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f32_case_impl(ctx, dst); + return; + } + + if (Q->ne[1] == 2) { + constexpr int cols_per_block = 2; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f32_case_impl(ctx, dst); + return; + } + + if (Q->ne[1] <= 4) { + constexpr int cols_per_block = 4; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f32_case_impl(ctx, dst); + return; + } + + if (Q->ne[1] <= 8) { + constexpr int cols_per_block = 8; + constexpr int parallel_blocks = 4; + ggml_cuda_flash_attn_ext_vec_f32_case_impl(ctx, dst); + return; + } + + constexpr int cols_per_block = 8; + constexpr int parallel_blocks = 1; + ggml_cuda_flash_attn_ext_vec_f32_case_impl(ctx, dst); +} + +#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \ + template void ggml_cuda_flash_attn_ext_vec_f32_case \ + (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ + +extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16); + +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); + +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); + +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); + +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); + +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); + +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); +extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); + +extern DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/fattn-wmma-f16.cuh b/ggml-cuda/fattn-wmma-f16.cuh new file mode 100644 index 000000000..65ed31852 --- /dev/null +++ b/ggml-cuda/fattn-wmma-f16.cuh @@ -0,0 +1,490 @@ +#include "common.cuh" +#include "fattn-common.cuh" + +#if FP16_MMA_AVAILABLE +#include +#endif + +// D == head size, VKQ_stride == num VKQ rows calculated in parallel: +template +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(nwarps*WARP_SIZE, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_ext_f16( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const float max_bias, + const float m0, + const float m1, + const uint32_t n_head_log2, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int nb21, + const int nb22, + const int nb23, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { +#if FP16_MMA_AVAILABLE + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + + const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE."); + static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16."); + constexpr int frag_m = ncols == 8 ? 32 : 16; + constexpr int frag_n = ncols == 8 ? 8 : 16; + static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0."); + typedef nvcuda::wmma::fragment frag_a_K; + typedef nvcuda::wmma::fragment frag_a_V; + typedef nvcuda::wmma::fragment frag_b; + typedef nvcuda::wmma::fragment frag_c_KQ; + typedef nvcuda::wmma::fragment frag_c_VKQ; + + constexpr int KQ_stride_tc = nwarps*frag_m; // Number of KQ rows calculated in parallel. + constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy. + static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps."); + + // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts: + constexpr int D_padded = D + 8; + constexpr int kqs_padded = FATTN_KQ_STRIDE + 8; + constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + const float * Q_f = (const float *) (Q + nb02* blockIdx.y + nb01*ic0); + const half * K_h = (const half *) (K + nb12*(blockIdx.y / gqa_ratio)); + const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0; + const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2); + + const int stride_Q = nb01 / sizeof(float); + const int stride_KV = nb11 / sizeof(half); + + const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); + const half slopeh = __float2half(slopef); + const half2 slope2 = make_half2(slopef, slopef); + + frag_b Q_b[D/16][ncols/frag_n]; + + // A single buffer for temporarily holding tiles of KQ and VKQ parts: + constexpr int mem_KQ = ncols*kqs_padded*kqar; + constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded; + __shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts]; + float * KQ_f = (float *) KQ; + half2 * KQ2 = (half2 *) KQ; + + float KQ_rowsum_f[ncols/nwarps] = {0.0f}; + float KQ_max_f[ncols/nwarps]; + float KQ_max_scale_f[ncols/nwarps] = {0.0f}; + +#pragma unroll + for (int j = 0; j < ncols/nwarps; ++j) { + KQ_max_f[j] = -FLT_MAX/2.0f; + } + + half2 KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}}; + half2 KQ_max_h2[ncols/nwarps]; + half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}}; + +#pragma unroll + for (int j = 0; j < ncols/nwarps; ++j) { + KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF); + } + + __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice. + half2 * VKQ2 = (half2 *) VKQ; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D/2 && i >= D/2) { + break; + } + VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f); + } + } + + // Convert Q to half and apply scale, temporarily store in KQ: +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D && i >= D) { + break; + } + KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f; + } + } + + __syncthreads(); + + // Load Q into tensor core fragments/registers since it will be used frequently: +#pragma unroll + for (int i0 = 0; i0 < D; i0 += 16) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { + nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded); + } + } + + __syncthreads(); + + // Iterate over ne11 == previous tokens: + for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) { + // Calculate tile of KQ: +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) { + frag_c_KQ KQ_c[ncols/frag_n]; +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f); + } +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) { + frag_a_K K_a; + nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV); +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]); + } + } +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { + nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major); + } + } + + __syncthreads(); + + // Calculate softmax for each KQ column using the current max. value. + // The divisor is stored in KQ_rowsum and will be applied at the end. +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (std::is_same::value) { + float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k]; + } + + float KQ_max_new = KQ_max_f[j0/nwarps]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f; + KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]); + } + KQ_max_new = warp_reduce_max(KQ_max_new); + + const float diff = KQ_max_f[j0/nwarps] - KQ_max_new; + KQ_max_scale_f[j0/nwarps] = expf(diff); + if (diff <= SOFTMAX_FTZ_THRESHOLD) { + KQ_max_scale_f[j0/nwarps] = 0.0f; + } + KQ_max_f[j0/nwarps] = KQ_max_new; + + float KQ_rowsum_add = 0.0f; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps]; + KQ_f_tmp[k0/WARP_SIZE] = expf(diff); + if (diff <= SOFTMAX_FTZ_THRESHOLD) { + KQ_f_tmp[k0/WARP_SIZE] = 0.0f; + } + KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE]; + KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE]; + } + KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add); + + // Scale previous KQ_rowsum to account for a potential increase in KQ_max: + KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add; + } else { + half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k]; + } + + half2 KQ_max_new = KQ_max_h2[j0/nwarps]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f); + KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]); + } + KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new)))); + const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new; + KQ_max_scale_h2[j0/nwarps] = h2exp(diff); + const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD)); + *((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask; + KQ_max_h2[j0/nwarps] = KQ_max_new; + + half2 KQ_rowsum_add = make_half2(0.0f, 0.0f); +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps]; + KQ2_tmp[k0/WARP_SIZE] = h2exp(diff); + const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD)); + *((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask; + KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE]; + KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE]; + } + KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add); + + // Scale previous KQ_rowsum to account for a potential increase in KQ_max: + KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add; + } + } + + __syncthreads(); + + frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n]; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) { + const int k = k0 + (threadIdx.y % VKQ_ratio)*16; + nvcuda::wmma::load_matrix_sync( + KQ_b[k0/(VKQ_ratio*16)][j0/frag_n], + KQ + j0*(kqar*kqs_padded) + k, + kqar*kqs_padded); + } + } + + frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n]; +#pragma unroll + for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) { +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f); + } + +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) { + const int k = k0 + (threadIdx.y % VKQ_ratio)*16; + + frag_a_V v_a; + nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV); +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]); + } + } + } + + __syncthreads(); + + const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded); +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { + nvcuda::wmma::store_matrix_sync( + KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio), + VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n], + D_padded, nvcuda::wmma::mem_col_major); + } + } + + __syncthreads(); + +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + half2 VKQ_scale; + if (std::is_same::value) { + VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]); + } else { + VKQ_scale = KQ_max_scale_h2[j0/nwarps]; + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D/2 && i >= D/2) { + break; + } + + half2 VKQ_add = make_half2(0.0f, 0.0f); +#pragma unroll + for (int l = 0; l < VKQ_ratio; ++l) { + VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i]; + } + VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add; + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j_VKQ = j0 + threadIdx.y; + if (ic0 + j_VKQ >= ne01) { + return; + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + + float KQ_rowsum_j; + if (std::is_same::value) { + KQ_rowsum_j = KQ_rowsum_f[j0/nwarps]; + } else { + KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]); + } + +#pragma unroll + for (int i0 = 0; i0 < D; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D && i >= D) { + break; + } + float dst_val = VKQ[j_VKQ*D_padded + i]; + if (parallel_blocks == 1) { + dst_val /= KQ_rowsum_j; + } + dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val; + } + + if (parallel_blocks == 1 || threadIdx.x != 0) { + continue; + } + + float2 dst_meta_val; + if (std::is_same::value) { + dst_meta_val.x = KQ_max_f[j0/nwarps]; + } else { + dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]); + } + dst_meta_val.y = KQ_rowsum_j; + dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val; + } +#else + NO_DEVICE_CODE; +#endif // FP16_MMA_AVAILABLE +} + +constexpr int get_max_power_of_2(int x) { + return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1; +} + +static_assert(get_max_power_of_2(1) == 1, "Test failed."); +static_assert(get_max_power_of_2(2) == 2, "Test failed."); +static_assert(get_max_power_of_2(4) == 4, "Test failed."); +static_assert(get_max_power_of_2(6) == 2, "Test failed."); + +// Number of VKQ rows calculated in parallel: +constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) { + return (get_max_power_of_2(D/frag_m) < nwarps ? get_max_power_of_2(D/frag_m) : nwarps)*frag_m; +} + +static_assert(get_VKQ_stride(128, 1, 32) == 32, "Test failed."); +static_assert(get_VKQ_stride(128, 2, 32) == 64, "Test failed."); +static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed."); +static_assert(get_VKQ_stride( 64, 1, 32) == 32, "Test failed."); +static_assert(get_VKQ_stride( 64, 2, 32) == 64, "Test failed."); +static_assert(get_VKQ_stride( 64, 4, 32) == 64, "Test failed."); +static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed."); +static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed."); +static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed."); + +template +void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * Q = dst->src[0]; + + constexpr int nwarps = 4; + + constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16; + const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; + const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; + + if (4*blocks_num_pb1 < 2*nsm) { + constexpr int parallel_blocks = 4; + fattn_kernel_t fattn_kernel = flash_attn_ext_f16; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + return; + } + if (2*blocks_num_pb1 < 2*nsm) { + constexpr int parallel_blocks = 2; + fattn_kernel_t fattn_kernel = flash_attn_ext_f16; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + return; + } + constexpr int parallel_blocks = 1; + fattn_kernel_t fattn_kernel = flash_attn_ext_f16; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); +} + +#define DECL_FATTN_WMMA_F16_CASE(D, cols_per_block, KQ_acc_t) \ + template void ggml_cuda_flash_attn_ext_wmma_f16_case \ + (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ + +extern DECL_FATTN_WMMA_F16_CASE( 64, 16, float); +extern DECL_FATTN_WMMA_F16_CASE( 80, 16, float); +extern DECL_FATTN_WMMA_F16_CASE( 96, 16, float); +extern DECL_FATTN_WMMA_F16_CASE(112, 16, float); +extern DECL_FATTN_WMMA_F16_CASE(128, 16, float); +extern DECL_FATTN_WMMA_F16_CASE(256, 16, float); + +extern DECL_FATTN_WMMA_F16_CASE( 64, 32, float); +extern DECL_FATTN_WMMA_F16_CASE( 80, 32, float); +extern DECL_FATTN_WMMA_F16_CASE( 96, 32, float); +extern DECL_FATTN_WMMA_F16_CASE(112, 32, float); +extern DECL_FATTN_WMMA_F16_CASE(128, 32, float); +// extern DECL_FATTN_WMMA_F16_CASE(256, 16, float); + +extern DECL_FATTN_WMMA_F16_CASE( 64, 8, half); +extern DECL_FATTN_WMMA_F16_CASE( 96, 8, half); +extern DECL_FATTN_WMMA_F16_CASE(128, 8, half); +extern DECL_FATTN_WMMA_F16_CASE(256, 8, half); + +extern DECL_FATTN_WMMA_F16_CASE( 64, 16, half); +extern DECL_FATTN_WMMA_F16_CASE( 80, 16, half); +extern DECL_FATTN_WMMA_F16_CASE( 96, 16, half); +extern DECL_FATTN_WMMA_F16_CASE(112, 16, half); +extern DECL_FATTN_WMMA_F16_CASE(128, 16, half); +extern DECL_FATTN_WMMA_F16_CASE(256, 16, half); + +extern DECL_FATTN_WMMA_F16_CASE( 64, 32, half); +extern DECL_FATTN_WMMA_F16_CASE( 80, 32, half); +extern DECL_FATTN_WMMA_F16_CASE( 96, 32, half); +extern DECL_FATTN_WMMA_F16_CASE(112, 32, half); +extern DECL_FATTN_WMMA_F16_CASE(128, 32, half); +extern DECL_FATTN_WMMA_F16_CASE(256, 16, half); diff --git a/ggml-cuda/fattn.cu b/ggml-cuda/fattn.cu index af7c95232..b35ab67a8 100644 --- a/ggml-cuda/fattn.cu +++ b/ggml-cuda/fattn.cu @@ -4,468 +4,313 @@ #include "fattn-tile-f32.cuh" #include "fattn-vec-f16.cuh" #include "fattn-vec-f32.cuh" +#include "fattn-wmma-f16.cuh" #include "fattn.cuh" #include -#if FP16_MMA_AVAILABLE -#include -#endif +static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * KQV = dst; + const ggml_tensor * Q = dst->src[0]; -// D == head size, VKQ_stride == num VKQ rows calculated in parallel: -template -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(nwarps*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -static __global__ void flash_attn_ext_f16( - const char * __restrict__ Q, - const char * __restrict__ K, - const char * __restrict__ V, - const char * __restrict__ mask, - float * __restrict__ dst, - float2 * __restrict__ dst_meta, - const float scale, - const float max_bias, - const float m0, - const float m1, - const uint32_t n_head_log2, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int nb31, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { -#if FP16_MMA_AVAILABLE - //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + const int32_t precision = KQV->op_params[2]; - const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on. - const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. - - static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE."); - static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16."); - constexpr int frag_m = ncols == 8 ? 32 : 16; - constexpr int frag_n = ncols == 8 ? 8 : 16; - static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0."); - typedef nvcuda::wmma::fragment frag_a_K; - typedef nvcuda::wmma::fragment frag_a_V; - typedef nvcuda::wmma::fragment frag_b; - typedef nvcuda::wmma::fragment frag_c_KQ; - typedef nvcuda::wmma::fragment frag_c_VKQ; - - constexpr int KQ_stride_tc = nwarps*frag_m; // Number of KQ rows calculated in parallel. - constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy. - static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps."); - - // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts: - constexpr int D_padded = D + 8; - constexpr int kqs_padded = FATTN_KQ_STRIDE + 8; - constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); - - const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float * Q_f = (const float *) (Q + nb02* blockIdx.y + nb01*ic0); - const half * K_h = (const half *) (K + nb12*(blockIdx.y / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0; - const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2); - - const int stride_Q = nb01 / sizeof(float); - const int stride_KV = nb11 / sizeof(half); - - const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1); - const half slopeh = __float2half(slopef); - const half2 slope2 = make_half2(slopef, slopef); - - frag_b Q_b[D/16][ncols/frag_n]; - - // A single buffer for temporarily holding tiles of KQ and VKQ parts: - constexpr int mem_KQ = ncols*kqs_padded*kqar; - constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded; - __shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts]; - float * KQ_f = (float *) KQ; - half2 * KQ2 = (half2 *) KQ; - - float KQ_rowsum_f[ncols/nwarps] = {0.0f}; - float KQ_max_f[ncols/nwarps]; - float KQ_max_scale_f[ncols/nwarps] = {0.0f}; - -#pragma unroll - for (int j = 0; j < ncols/nwarps; ++j) { - KQ_max_f[j] = -FLT_MAX/2.0f; - } - - half2 KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}}; - half2 KQ_max_h2[ncols/nwarps]; - half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}}; - -#pragma unroll - for (int j = 0; j < ncols/nwarps; ++j) { - KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF); - } - - __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice. - half2 * VKQ2 = (half2 *) VKQ; -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - if (i0 + WARP_SIZE > D/2 && i >= D/2) { - break; - } - VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f); - } - } - - // Convert Q to half and apply scale, temporarily store in KQ: -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; -#pragma unroll - for (int i0 = 0; i0 < D; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - if (i0 + WARP_SIZE > D && i >= D) { - break; - } - KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f; - } - } - - __syncthreads(); - - // Load Q into tensor core fragments/registers since it will be used frequently: -#pragma unroll - for (int i0 = 0; i0 < D; i0 += 16) { -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += frag_n) { - nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded); - } - } - - __syncthreads(); - - // Iterate over ne11 == previous tokens: - for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) { - // Calculate tile of KQ: -#pragma unroll - for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) { - frag_c_KQ KQ_c[ncols/frag_n]; -#pragma unroll - for (int j = 0; j < ncols/frag_n; ++j) { - nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f); - } -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) { - frag_a_K K_a; - nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV); -#pragma unroll - for (int j = 0; j < ncols/frag_n; ++j) { - nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]); - } - } -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += frag_n) { - nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major); - } - } - - __syncthreads(); - - // Calculate softmax for each KQ column using the current max. value. - // The divisor is stored in KQ_rowsum and will be applied at the end. -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - if (std::is_same::value) { - float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE]; -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { - const int k = k0 + threadIdx.x; - - KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k]; - } - - float KQ_max_new = KQ_max_f[j0/nwarps]; -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { - const int k = k0 + threadIdx.x; - - KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f; - KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]); - } - KQ_max_new = warp_reduce_max(KQ_max_new); - - const float diff = KQ_max_f[j0/nwarps] - KQ_max_new; - KQ_max_scale_f[j0/nwarps] = expf(diff); - if (diff <= SOFTMAX_FTZ_THRESHOLD) { - KQ_max_scale_f[j0/nwarps] = 0.0f; - } - KQ_max_f[j0/nwarps] = KQ_max_new; - - float KQ_rowsum_add = 0.0f; -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { - const int k = k0 + threadIdx.x; - - const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps]; - KQ_f_tmp[k0/WARP_SIZE] = expf(diff); - if (diff <= SOFTMAX_FTZ_THRESHOLD) { - KQ_f_tmp[k0/WARP_SIZE] = 0.0f; - } - KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE]; - KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE]; - } - KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add); - - // Scale previous KQ_rowsum to account for a potential increase in KQ_max: - KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add; - } else { - half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)]; -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { - const int k = k0 + threadIdx.x; - - KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k]; - } - - half2 KQ_max_new = KQ_max_h2[j0/nwarps]; -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { - const int k = k0 + threadIdx.x; - - KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f); - KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]); - } - KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new)))); - const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new; - KQ_max_scale_h2[j0/nwarps] = h2exp(diff); - const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD)); - *((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask; - KQ_max_h2[j0/nwarps] = KQ_max_new; - - half2 KQ_rowsum_add = make_half2(0.0f, 0.0f); -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { - const int k = k0 + threadIdx.x; - - const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps]; - KQ2_tmp[k0/WARP_SIZE] = h2exp(diff); - const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD)); - *((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask; - KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE]; - KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE]; - } - KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add); - - // Scale previous KQ_rowsum to account for a potential increase in KQ_max: - KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add; - } - } - - __syncthreads(); - - frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n]; -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += frag_n) { -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) { - const int k = k0 + (threadIdx.y % VKQ_ratio)*16; - nvcuda::wmma::load_matrix_sync( - KQ_b[k0/(VKQ_ratio*16)][j0/frag_n], - KQ + j0*(kqar*kqs_padded) + k, - kqar*kqs_padded); - } - } - - frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n]; -#pragma unroll - for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) { -#pragma unroll - for (int j = 0; j < ncols/frag_n; ++j) { - nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f); - } - -#pragma unroll - for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) { - const int k = k0 + (threadIdx.y % VKQ_ratio)*16; - - frag_a_V v_a; - nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV); -#pragma unroll - for (int j = 0; j < ncols/frag_n; ++j) { - nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]); - } - } - } - - __syncthreads(); - - const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded); -#pragma unroll - for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) { -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += frag_n) { - nvcuda::wmma::store_matrix_sync( - KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio), - VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n], - D_padded, nvcuda::wmma::mem_col_major); - } - } - - __syncthreads(); - -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - half2 VKQ_scale; - if (std::is_same::value) { - VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]); - } else { - VKQ_scale = KQ_max_scale_h2[j0/nwarps]; - } - -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - if (i0 + WARP_SIZE > D/2 && i >= D/2) { + if (precision != GGML_PREC_DEFAULT) { + if (Q->ne[1] <= 32 || Q->ne[0] > 128) { + constexpr int cols_per_block = 16; + switch (Q->ne[0]) { + case 64: + ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst); + break; + case 80: + ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst); + break; + case 96: + ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst); + break; + case 112: + ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst); + break; + case 128: + ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); + break; + case 256: + ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst); + break; + default: + GGML_ASSERT(false); + break; + } + } else { + constexpr int cols_per_block = 32; + switch (Q->ne[0]) { + case 64: + ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst); + break; + case 80: + ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst); + break; + case 96: + ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst); + break; + case 112: + ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst); + break; + case 128: + ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); + break; + // case 256: + // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); + // break; + default: + GGML_ASSERT(false); break; - } - - half2 VKQ_add = make_half2(0.0f, 0.0f); -#pragma unroll - for (int l = 0; l < VKQ_ratio; ++l) { - VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i]; - } - VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add; } } - - __syncthreads(); + return; } -#pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j_VKQ = j0 + threadIdx.y; - if (ic0 + j_VKQ >= ne01) { - return; - } - const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; - - float KQ_rowsum_j; - if (std::is_same::value) { - KQ_rowsum_j = KQ_rowsum_f[j0/nwarps]; - } else { - KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]); - } - -#pragma unroll - for (int i0 = 0; i0 < D; i0 += WARP_SIZE) { - const int i = i0 + threadIdx.x; - if (i0 + WARP_SIZE > D && i >= D) { + if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) { + constexpr int cols_per_block = 8; + switch (Q->ne[0]) { + case 64: + ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst); + break; + case 96: + ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst); + break; + case 128: + ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst); + break; + case 256: + ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); + break; + default: + GGML_ASSERT(false); break; - } - float dst_val = VKQ[j_VKQ*D_padded + i]; - if (parallel_blocks == 1) { - dst_val /= KQ_rowsum_j; - } - dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val; } - - if (parallel_blocks == 1 || threadIdx.x != 0) { - continue; - } - - float2 dst_meta_val; - if (std::is_same::value) { - dst_meta_val.x = KQ_max_f[j0/nwarps]; - } else { - dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]); - } - dst_meta_val.y = KQ_rowsum_j; - dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val; + return; } + + if (Q->ne[1] <= 32) { + constexpr int cols_per_block = 16; + switch (Q->ne[0]) { + case 64: + ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst); + break; + case 80: + ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst); + break; + case 96: + ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst); + break; + case 112: + ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst); + break; + case 128: + ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst); + break; + case 256: + ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + constexpr int cols_per_block = 32; + switch (Q->ne[0]) { + case 64: + ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst); + break; + case 80: + ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst); + break; + case 96: + ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst); + break; + case 112: + ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst); + break; + case 128: + ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst); + break; + case 256: + ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); + break; + default: + GGML_ASSERT(false); + break; + } +} +#define FATTN_VEC_F16_CASE(D, type_K, type_V) \ + if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \ + ggml_cuda_flash_attn_ext_vec_f16_case(ctx, dst); \ + return; \ + } \ + +static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_tensor * Q = dst->src[1]; + ggml_tensor * K = dst->src[1]; + ggml_tensor * V = dst->src[2]; + +#ifdef GGML_CUDA_FA_ALL_QUANTS + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0) + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1) + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0) + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1) + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0) + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16 ) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + + FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #else - NO_DEVICE_CODE; -#endif // FP16_MMA_AVAILABLE + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) + + FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0) + + FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) +#endif // GGML_CUDA_FA_ALL_QUANTS + + on_no_fattn_vec_case(Q->ne[0]); } -constexpr int get_max_power_of_2(int x) { - return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1; -} +#define FATTN_VEC_F32_CASE(D, type_K, type_V) \ + if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \ + ggml_cuda_flash_attn_ext_vec_f32_case(ctx, dst); \ + return; \ + } \ -static_assert(get_max_power_of_2(1) == 1, "Test failed."); -static_assert(get_max_power_of_2(2) == 2, "Test failed."); -static_assert(get_max_power_of_2(4) == 4, "Test failed."); -static_assert(get_max_power_of_2(6) == 2, "Test failed."); +static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_tensor * Q = dst->src[1]; + ggml_tensor * K = dst->src[1]; + ggml_tensor * V = dst->src[2]; -// Number of VKQ rows calculated in parallel: -constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) { - return (get_max_power_of_2(D/frag_m) < nwarps ? get_max_power_of_2(D/frag_m) : nwarps)*frag_m; -} +#ifdef GGML_CUDA_FA_ALL_QUANTS + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0) + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1) + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0) + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1) + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0) + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) -static_assert(get_VKQ_stride(128, 1, 32) == 32, "Test failed."); -static_assert(get_VKQ_stride(128, 2, 32) == 64, "Test failed."); -static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed."); -static_assert(get_VKQ_stride( 64, 1, 32) == 32, "Test failed."); -static_assert(get_VKQ_stride( 64, 2, 32) == 64, "Test failed."); -static_assert(get_VKQ_stride( 64, 4, 32) == 64, "Test failed."); -static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed."); -static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed."); -static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed."); + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0) -template -void launch_fattn_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1) - constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16; - const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; - const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0) - if (4*blocks_num_pb1 < 2*nsm) { - constexpr int parallel_blocks = 4; - fattn_kernel_t fattn_kernel = flash_attn_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - return; - } - if (2*blocks_num_pb1 < 2*nsm) { - constexpr int parallel_blocks = 2; - fattn_kernel_t fattn_kernel = flash_attn_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); - return; - } - constexpr int parallel_blocks = 1; - fattn_kernel_t fattn_kernel = flash_attn_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1) + + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0) + + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + + FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) +#else + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) + + FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0) + + FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) +#endif // GGML_CUDA_FA_ALL_QUANTS + + on_no_fattn_vec_case(Q->ne[0]); } void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; ggml_cuda_set_device(ctx.device); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const int32_t precision = KQV->op_params[2]; + const bool quantized_KV = ggml_is_quantized(K->type) || ggml_is_quantized(V->type); + // On AMD the tile kernels perform poorly, use the vec kernel instead: - if (cc >= CC_OFFSET_AMD) { - if (precision == GGML_PREC_DEFAULT) { - ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst); + if (cc >= CC_OFFSET_AMD || quantized_KV) { + if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { + ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); } @@ -483,156 +328,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst if (!fp16_mma_available(cc)) { if (Q->ne[1] <= 8) { - ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst); + ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { ggml_cuda_flash_attn_ext_tile_f16(ctx, dst); } return; } - if (precision != GGML_PREC_DEFAULT) { - if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) { + if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) { + if (precision == GGML_PREC_DEFAULT) { + ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); + return; + } else if(Q->ne[0] <= 128) { ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); return; } - - if (Q->ne[1] <= 32 || Q->ne[0] > 128) { - constexpr int cols_per_block = 16; - constexpr int nwarps = 4; - switch (Q->ne[0]) { - case 64: - launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst); - break; - case 80: - launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst); - break; - case 96: - launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst); - break; - case 112: - launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst); - break; - case 128: - launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst); - break; - case 256: - launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst); - break; - default: - GGML_ASSERT(false); - break; - } - } else { - constexpr int cols_per_block = 32; - constexpr int nwarps = 4; - switch (Q->ne[0]) { - case 64: - launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst); - break; - case 80: - launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst); - break; - case 96: - launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst); - break; - case 112: - launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst); - break; - case 128: - launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst); - break; - // case 256: - // launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst); - // break; - default: - GGML_ASSERT(false); - break; - } - } - return; } - if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) { - ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); - return; - } - - if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) { - constexpr int cols_per_block = 8; - constexpr int nwarps = 4; - switch (Q->ne[0]) { - case 64: - launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst); - break; - case 96: - launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst); - break; - case 128: - launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst); - break; - case 256: - launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst); - break; - default: - GGML_ASSERT(false); - break; - } - return; - } - - if (Q->ne[1] <= 32) { - constexpr int cols_per_block = 16; - constexpr int nwarps = 4; - switch (Q->ne[0]) { - case 64: - launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst); - break; - case 80: - launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst); - break; - case 96: - launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst); - break; - case 112: - launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst); - break; - case 128: - launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst); - break; - case 256: - launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst); - break; - default: - GGML_ASSERT(false); - break; - } - return; - } - - constexpr int cols_per_block = 32; - constexpr int nwarps = 4; - switch (Q->ne[0]) { - case 64: - launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst); - break; - case 80: - launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst); - break; - case 96: - launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst); - break; - case 112: - launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst); - break; - case 128: - launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst); - break; - case 256: - launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst); - break; - default: - GGML_ASSERT(false); - break; - } - return; + ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst); } diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index c0a66d9b6..ebe1dc5c8 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -386,7 +386,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; } - return vec_dot_q8_0_q8_1_impl + return vec_dot_q8_0_q8_1_impl (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } @@ -547,7 +547,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - return vec_dot_q8_0_q8_1_impl + return vec_dot_q8_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); } diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu new file mode 100644 index 000000000..d7f103475 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu new file mode 100644 index 000000000..f3d8d2eda --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu new file mode 100644 index 000000000..9beb05ca2 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu new file mode 100644 index 000000000..0c163dcba --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu new file mode 100644 index 000000000..3980167b3 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu new file mode 100644 index 000000000..fe099921d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu new file mode 100644 index 000000000..d4d5e7999 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu new file mode 100644 index 000000000..f08b10c4d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu new file mode 100644 index 000000000..e8c3f8adc --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu new file mode 100644 index 000000000..c01416a13 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu new file mode 100644 index 000000000..46615f281 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu new file mode 100644 index 000000000..72dcc1a2f --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu new file mode 100644 index 000000000..9fa8a377d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu new file mode 100644 index 000000000..20ea86c6d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu new file mode 100644 index 000000000..ed815957c --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu new file mode 100644 index 000000000..bbe9e6a1c --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu new file mode 100644 index 000000000..d12a61699 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu new file mode 100644 index 000000000..1e901afcb --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu new file mode 100644 index 000000000..a3f98ce37 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu new file mode 100644 index 000000000..1bae97243 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu new file mode 100644 index 000000000..7258e9775 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu new file mode 100644 index 000000000..08435c005 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu new file mode 100644 index 000000000..17864e8e9 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu new file mode 100644 index 000000000..9239138c9 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu new file mode 100644 index 000000000..e387d9c1d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu new file mode 100644 index 000000000..d69d3bbd6 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu new file mode 100644 index 000000000..61a478816 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu new file mode 100644 index 000000000..89995080a --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu new file mode 100644 index 000000000..9e6a58dff --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu new file mode 100644 index 000000000..153cbfd86 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu new file mode 100644 index 000000000..09d576558 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu new file mode 100644 index 000000000..3e3c91e68 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu new file mode 100644 index 000000000..7b973058f --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu new file mode 100644 index 000000000..a43a475d4 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu new file mode 100644 index 000000000..5b570c0a3 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu new file mode 100644 index 000000000..bf2cc684e --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu new file mode 100644 index 000000000..7428e45ea --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu new file mode 100644 index 000000000..4aee830de --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu new file mode 100644 index 000000000..36acb6319 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu new file mode 100644 index 000000000..a4090c390 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu new file mode 100644 index 000000000..17b6b2d11 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu new file mode 100644 index 000000000..549e1cea1 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu new file mode 100644 index 000000000..66bcd820f --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu new file mode 100644 index 000000000..15933a299 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu new file mode 100644 index 000000000..8aa785583 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu new file mode 100644 index 000000000..bde3924fd --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu new file mode 100644 index 000000000..1708181c1 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu new file mode 100644 index 000000000..30fa6fa4c --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu new file mode 100644 index 000000000..69673d50f --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu new file mode 100644 index 000000000..d8b2b2e18 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu new file mode 100644 index 000000000..01cce7ab5 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu new file mode 100644 index 000000000..fd5563b39 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu new file mode 100644 index 000000000..b13cc4a0c --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu new file mode 100644 index 000000000..86f1fc637 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu new file mode 100644 index 000000000..26e7df4be --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu new file mode 100644 index 000000000..e4fda8952 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu new file mode 100644 index 000000000..bd15117b4 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu new file mode 100644 index 000000000..cb6c6a760 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu new file mode 100644 index 000000000..201b6641d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu new file mode 100644 index 000000000..6da57a44a --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu new file mode 100644 index 000000000..47623c9bf --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu new file mode 100644 index 000000000..82c6861d2 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu new file mode 100644 index 000000000..24a80c2b0 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu new file mode 100644 index 000000000..b95eaf7e1 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu new file mode 100644 index 000000000..275f2efcc --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu new file mode 100644 index 000000000..3673f7fd5 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu new file mode 100644 index 000000000..2c4d59947 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu new file mode 100644 index 000000000..2457cdf3f --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu new file mode 100644 index 000000000..b3b411ed3 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu new file mode 100644 index 000000000..b7f308a4d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu new file mode 100644 index 000000000..739686697 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu new file mode 100644 index 000000000..708d03113 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu new file mode 100644 index 000000000..df891be60 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu new file mode 100644 index 000000000..f49b6d1f9 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu new file mode 100644 index 000000000..1de92148b --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu new file mode 100644 index 000000000..7a1ba7f8d --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu new file mode 100644 index 000000000..25493e4ba --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu new file mode 100644 index 000000000..3cd650c7b --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu new file mode 100644 index 000000000..88ffa43d6 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu new file mode 100644 index 000000000..8c7bac6c2 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu new file mode 100644 index 000000000..a28f62e7b --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu new file mode 100644 index 000000000..d39838b96 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu new file mode 100644 index 000000000..834d40f6c --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu new file mode 100644 index 000000000..f7d54668b --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu new file mode 100644 index 000000000..59e00ad83 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu new file mode 100644 index 000000000..6e63893de --- /dev/null +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-vec-f32.cuh" + +DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu new file mode 100644 index 000000000..ca356ad6c --- /dev/null +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu @@ -0,0 +1,10 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 16, float); +DECL_FATTN_WMMA_F16_CASE(80, 16, float); +DECL_FATTN_WMMA_F16_CASE(96, 16, float); +DECL_FATTN_WMMA_F16_CASE(112, 16, float); +DECL_FATTN_WMMA_F16_CASE(128, 16, float); +DECL_FATTN_WMMA_F16_CASE(256, 16, float); diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu new file mode 100644 index 000000000..430ee64eb --- /dev/null +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu @@ -0,0 +1,9 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 32, float); +DECL_FATTN_WMMA_F16_CASE(80, 32, float); +DECL_FATTN_WMMA_F16_CASE(96, 32, float); +DECL_FATTN_WMMA_F16_CASE(112, 32, float); +DECL_FATTN_WMMA_F16_CASE(128, 32, float); diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu new file mode 100644 index 000000000..d421d17cc --- /dev/null +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu @@ -0,0 +1,10 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 16, half); +DECL_FATTN_WMMA_F16_CASE(80, 16, half); +DECL_FATTN_WMMA_F16_CASE(96, 16, half); +DECL_FATTN_WMMA_F16_CASE(112, 16, half); +DECL_FATTN_WMMA_F16_CASE(128, 16, half); +DECL_FATTN_WMMA_F16_CASE(256, 16, half); diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu new file mode 100644 index 000000000..deacd5f58 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu @@ -0,0 +1,10 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 32, half); +DECL_FATTN_WMMA_F16_CASE(80, 32, half); +DECL_FATTN_WMMA_F16_CASE(96, 32, half); +DECL_FATTN_WMMA_F16_CASE(112, 32, half); +DECL_FATTN_WMMA_F16_CASE(128, 32, half); +DECL_FATTN_WMMA_F16_CASE(256, 32, half); diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu new file mode 100644 index 000000000..282896733 --- /dev/null +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu @@ -0,0 +1,8 @@ +// This file has been autogenerated by generate-variants.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +DECL_FATTN_WMMA_F16_CASE(64, 8, half); +DECL_FATTN_WMMA_F16_CASE(96, 8, half); +DECL_FATTN_WMMA_F16_CASE(128, 8, half); +DECL_FATTN_WMMA_F16_CASE(256, 8, half); diff --git a/ggml-cuda/template-instances/generate_cu_files.py b/ggml-cuda/template-instances/generate_cu_files.py new file mode 100755 index 000000000..ee5b460e0 --- /dev/null +++ b/ggml-cuda/template-instances/generate_cu_files.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +from glob import glob +import os + +TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"] + +SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f{vkq_size}.cuh" + +DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v}); +""" + +SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-wmma-f16.cuh" + +""" + +SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n" + + +def get_short_name(long_quant_name): + return long_quant_name.replace("GGML_TYPE_", "").lower() + + +def get_head_sizes(type_k, type_v): + if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16": + return [64, 128, 256] + if type_k == "GGML_TYPE_F16": + return [64, 128] + return [128] + + +for filename in glob("*.cu"): + os.remove(filename) + +for vkq_size in [16, 32]: + for type_k in TYPES_KV: + for type_v in TYPES_KV: + for head_size in get_head_sizes(type_k, type_v): + with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f: + f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v)) + +for kq_acc_t in ["half", "float"]: + for cols_per_block in [8, 16, 32]: + if kq_acc_t == "float" and cols_per_block == 8: + continue + + with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f: + f.write(SOURCE_FATTN_WMMA_START) + + for head_size in [64, 80, 96, 112, 128, 256]: + if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32 + continue + if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance + continue + f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size)) diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh index 5ebdddcc7..df9752390 100644 --- a/ggml-cuda/vecdotq.cuh +++ b/ggml-cuda/vecdotq.cuh @@ -180,8 +180,8 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp #define VDR_Q8_0_Q8_1_MMVQ 2 #define VDR_Q8_0_Q8_1_MMQ 8 -template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( - const int * v, const int * u, const float & d8_0, const float & d8_1) { +template static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const T & d8_0, const T & d8_1) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics int sumi = 0; @@ -192,7 +192,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp sumi = __dp4a(v[i], u[i], sumi); } - return d8_0*d8_1 * sumi; + return d8_0*d8_1 * ((T) sumi); #else NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -656,7 +656,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); } - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); } static __device__ __forceinline__ float vec_dot_q2_K_q8_1( diff --git a/ggml-metal.m b/ggml-metal.m index 079912952..fddc44f78 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -779,6 +779,12 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_LEAKY_RELU: return true; case GGML_OP_FLASH_ATTN_EXT: + if (op->src[1]->type != GGML_TYPE_F16) { + return false; + } + if (op->src[2]->type != GGML_TYPE_F16) { + return false; + } if (op->src[0]->ne[0] == 256) { return false; } diff --git a/llama.cpp b/llama.cpp index 40d2ec2c9..841be1de7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -16221,6 +16221,11 @@ struct llama_context * llama_new_context_with_model( params.flash_attn = false; } + if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) { + LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__); + return nullptr; + } + llama_context * ctx = new llama_context(*model); const auto & hparams = model->hparams; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 72edc64a7..777230127 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1570,21 +1570,23 @@ struct test_flash_attn_ext : public test_case { const float max_bias; // ALiBi + const ggml_type type_KV; + std::string vars() override { - return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias); + return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV); } double max_nmse_err() override { return 5e-4; } - test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f) - : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {} + test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16) + : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1); - ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1); - ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1); + ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs, kv, nh, 1); + ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs, kv, nh, 1); ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr; ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias); return out; @@ -2290,7 +2292,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (int nh : { 32, }) { for (int kv : { 512, 1024, }) { for (int nb : { 1, 2, 4, 8, }) { - test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias)); + for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { + test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV)); + } } } } From 750f60c03e4d3f53fa51910551ce87a3d508d2d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 1 Jun 2024 15:47:04 +0200 Subject: [PATCH 3/6] CUDA: fix Pascal FA, deq. KV to FP16 for batch > 8 (#7681) --- ggml-cuda/fattn-common.cuh | 74 ++++++++++++++++++++++++++++-------- ggml-cuda/fattn-tile-f16.cu | 4 +- ggml-cuda/fattn-tile-f32.cu | 4 +- ggml-cuda/fattn-vec-f16.cuh | 4 +- ggml-cuda/fattn-vec-f32.cuh | 4 +- ggml-cuda/fattn-wmma-f16.cuh | 6 +-- ggml-cuda/fattn.cu | 6 +-- 7 files changed, 73 insertions(+), 29 deletions(-) diff --git a/ggml-cuda/fattn-common.cuh b/ggml-cuda/fattn-common.cuh index 4bf03a49f..c00f8606a 100644 --- a/ggml-cuda/fattn-common.cuh +++ b/ggml-cuda/fattn-common.cuh @@ -1,6 +1,7 @@ #pragma once #include "common.cuh" +#include "convert.cuh" #include "vecdotq.cuh" #include @@ -53,7 +54,7 @@ typedef float (*vec_dot_KQ_f32_t)( template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ > MIN_CC_DP4A +#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c; GGML_UNUSED(Q_v); @@ -95,13 +96,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( GGML_UNUSED(Q_q8); GGML_UNUSED(Q_ds_v); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ > MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ > MIN_CC_DP4A +#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c; GGML_UNUSED(Q_v); @@ -147,13 +148,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( GGML_UNUSED(Q_q8); GGML_UNUSED(Q_ds_v); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ > MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ > MIN_CC_DP4A +#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c; GGML_UNUSED(Q_v); @@ -202,13 +203,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( GGML_UNUSED(Q_q8); GGML_UNUSED(Q_ds_v); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ > MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ > MIN_CC_DP4A +#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c; GGML_UNUSED(Q_v); @@ -261,13 +262,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( GGML_UNUSED(Q_q8); GGML_UNUSED(Q_ds_v); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ > MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { -#if __CUDA_ARCH__ > MIN_CC_DP4A +#if __CUDA_ARCH__ >= MIN_CC_DP4A const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c; GGML_UNUSED(Q_v); @@ -302,7 +303,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( GGML_UNUSED(Q_q8); GGML_UNUSED(Q_ds_v); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ > MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template @@ -620,7 +621,10 @@ static void on_no_fattn_vec_case(const int D) { } template -void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, int nwarps, int cols_per_block) { +void launch_fattn( + ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, + const int nwarps, const int cols_per_block, const bool need_f16_K, const bool need_f16_V +) { const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; const ggml_tensor * V = dst->src[2]; @@ -641,9 +645,49 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern ggml_cuda_pool & pool = ctx.pool(); cudaStream_t main_stream = ctx.stream(); + ggml_cuda_pool_alloc K_f16(pool); + ggml_cuda_pool_alloc V_f16(pool); ggml_cuda_pool_alloc dst_tmp(pool); ggml_cuda_pool_alloc dst_tmp_meta(pool); + char * K_data = (char *) K->data; + size_t nb11 = K->nb[1]; + size_t nb12 = K->nb[2]; + size_t nb13 = K->nb[3]; + + char * V_data = (char *) V->data; + size_t nb21 = V->nb[1]; + size_t nb22 = V->nb[2]; + size_t nb23 = V->nb[3]; + + if (need_f16_K && K->type != GGML_TYPE_F16) { + K_f16.alloc(ggml_nelements(K)); + to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type); + to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream); + K_data = (char *) K_f16.ptr; + + const size_t bs = ggml_blck_size(K->type); + const size_t ts = ggml_type_size(K->type); + + nb11 = nb11*bs*sizeof(half)/ts; + nb12 = nb12*bs*sizeof(half)/ts; + nb13 = nb13*bs*sizeof(half)/ts; + } + + if (need_f16_V && V->type != GGML_TYPE_F16) { + V_f16.alloc(ggml_nelements(V)); + to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type); + to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream); + V_data = (char *) V_f16.ptr; + + const size_t bs = ggml_blck_size(V->type); + const size_t ts = ggml_type_size(V->type); + + nb21 = nb21*bs*sizeof(half)/ts; + nb22 = nb22*bs*sizeof(half)/ts; + nb23 = nb23*bs*sizeof(half)/ts; + } + if (parallel_blocks > 1) { dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV)); @@ -667,8 +711,8 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern fattn_kernel<<>>( (const char *) Q->data, - (const char *) K->data, - (const char *) V->data, + K_data, + V_data, mask ? ((const char *) mask->data) : nullptr, (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr, scale, max_bias, m0, m1, n_head_log2, @@ -676,8 +720,8 @@ void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kern K->ne[0], K->ne[1], K->ne[2], K->ne[3], mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, Q->nb[1], Q->nb[2], Q->nb[3], - K->nb[1], K->nb[2], K->nb[3], - V->nb[1], V->nb[2], V->nb[3], + nb11, nb12, nb13, + nb21, nb22, nb23, KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] ); CUDA_CHECK(cudaGetLastError()); diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml-cuda/fattn-tile-f16.cu index 3d64a9eba..cb11d7212 100644 --- a/ggml-cuda/fattn-tile-f16.cu +++ b/ggml-cuda/fattn-tile-f16.cu @@ -278,13 +278,13 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int D = 64; constexpr int nwarps = 8; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; case 128: { constexpr int D = 128; constexpr int nwarps = 8; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu index 61fce0a7e..15e22f495 100644 --- a/ggml-cuda/fattn-tile-f32.cu +++ b/ggml-cuda/fattn-tile-f32.cu @@ -275,13 +275,13 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int D = 64; constexpr int nwarps = 8; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; case 128: { constexpr int D = 128; constexpr int nwarps = 8; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); diff --git a/ggml-cuda/fattn-vec-f16.cuh b/ggml-cuda/fattn-vec-f16.cuh index ea4fcc972..9e1aa2c6b 100644 --- a/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml-cuda/fattn-vec-f16.cuh @@ -290,7 +290,9 @@ template ; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + constexpr bool need_f16_K = D != 128; + constexpr bool need_f16_V = D != 128 && D != 64; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V); } template diff --git a/ggml-cuda/fattn-vec-f32.cuh b/ggml-cuda/fattn-vec-f32.cuh index 3009f0f43..ce23a4ebd 100644 --- a/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml-cuda/fattn-vec-f32.cuh @@ -271,7 +271,9 @@ template ; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + constexpr bool need_f16_K = D != 128; + constexpr bool need_f16_V = D != 128 && D != 64; + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V); } template diff --git a/ggml-cuda/fattn-wmma-f16.cuh b/ggml-cuda/fattn-wmma-f16.cuh index 65ed31852..59cd30d78 100644 --- a/ggml-cuda/fattn-wmma-f16.cuh +++ b/ggml-cuda/fattn-wmma-f16.cuh @@ -438,18 +438,18 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm if (4*blocks_num_pb1 < 2*nsm) { constexpr int parallel_blocks = 4; fattn_kernel_t fattn_kernel = flash_attn_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); return; } if (2*blocks_num_pb1 < 2*nsm) { constexpr int parallel_blocks = 2; fattn_kernel_t fattn_kernel = flash_attn_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); return; } constexpr int parallel_blocks = 1; fattn_kernel_t fattn_kernel = flash_attn_ext_f16; - launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block); + launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } #define DECL_FATTN_WMMA_F16_CASE(D, cols_per_block, KQ_acc_t) \ diff --git a/ggml-cuda/fattn.cu b/ggml-cuda/fattn.cu index b35ab67a8..38d30b210 100644 --- a/ggml-cuda/fattn.cu +++ b/ggml-cuda/fattn.cu @@ -298,17 +298,13 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; - const ggml_tensor * K = dst->src[1]; - const ggml_tensor * V = dst->src[2]; ggml_cuda_set_device(ctx.device); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const int32_t precision = KQV->op_params[2]; - const bool quantized_KV = ggml_is_quantized(K->type) || ggml_is_quantized(V->type); - // On AMD the tile kernels perform poorly, use the vec kernel instead: - if (cc >= CC_OFFSET_AMD || quantized_KV) { + if (cc >= CC_OFFSET_AMD) { if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { From 2ac95c9d5678d05e253691fb1f26471675bff5ad Mon Sep 17 00:00:00 2001 From: HanishKVC Date: Sat, 1 Jun 2024 21:50:18 +0530 Subject: [PATCH 4/6] SimpleChat: Simple histogram/repeatMatching driven garbageTrimming, Settings UI, Streaming mode, OpenAi Compat (Model, Authorization Bearer), Save/Restore session, Auto Settings UI (#7548) * SimpleChat:DU:BringIn local helper js modules using importmap Use it to bring in a simple trim garbage at end logic, which is used to trim received response. Also given that importmap assumes esm / standard js modules, so also global variables arent implicitly available outside the modules. So add it has a member of document for now * SimpleChat:DU: Add trim garbage at end in loop helper * SimpleChat:DU:TrimGarbage if unable try skip char and retry * SimpleChat:DU: Try trim using histogram based info TODO: May have to add max number of uniq chars in histogram at end of learning phase. * SimpleChat:DU: Switch trim garbage hist based to maxUniq simple Instead of blindly building histogram for specified substring length, and then checking if any new char within specified min garbage length limit, NOW exit learn state when specified maxUniq chars are found. Inturn there should be no new chars with in the specified min garbage length required limit. TODO: Need to track char classes like alphabets, numerals and special/other chars. * SimpleChat:DU: Bring in maxType to the mix along with maxUniq Allow for more uniq chars, but then ensure that a given type of char ie numerals or alphabets or other types dont cross the specified maxType limit. This allows intermixed text garbage to be identified and trimmed. * SimpleChat:DU: Cleanup debug log messages * SimpleChat:UI: Move html ui base helpers into its own module * SimpleChat:DU:Avoid setting frequence/Presence penalty Some models like llama3 found to try to be over intelligent by repeating garbage still, but by tweaking the garbage a bit so that it is not exactly same. So avoid setting these penalties and let the model's default behaviour work out, as is. Also the simple minded histogram based garbage trimming from end, works to an extent, when the garbage is more predictable and repeatative. * SimpleChat:UI: Add and use a para-create-append helper Also update the config params dump to indicate that now one needs to use document to get hold of gMe global object, this is bcas of moving to module type js. Also add ui.mjs to importmap * SimpleChat:UI: Helper to create bool button and use it wrt settings * SimpleChat:UI: Add Select helper and use it wrt ChatHistoryInCtxt * SimpleChat:UI:Select: dict-name-value, value wrt default, change Take a dict/object of name-value pairs instead of just names. Inturn specify the actual value wrt default, rather than the string representing that value. Trap the needed change event rather than click wrt select. * SimpleChat:UI: Add Div wrapped label+element helpers Move settings related elements to use the new div wrapped ones. * SimpleChat:UI:Add settings button and bring in settings ui * SimpleChat:UI:Settings make boolean button text show meaning * SimpleChat: Update a bit wrt readme and notes in du * SimpleChat: GarbageTrim enable/disable, show trimmed part ifany * SimpleChat: highlight trim, garbage trimming bitmore aggressive Make it easy for end user to identified the trimmed text. Make garbage trimming logic, consider a longer repeat garbage substring. * SimpleChat: Cleanup a bit wrt Api end point related flow Consolidate many of the Api end point related basic meta data into ApiEP class. Remove the hardcoded ApiEP/Mode settings from html+js, instead use the generic select helper logic, inturn in the settings block. Move helper to generate the appropriate request json string based on ApiEP into SimpleChat class itself. * SimpleChat:Move extracting assistant response to SimpleChat class so also the trimming of garbage. * SimpleChat:DU: Bring in both trim garbage logics to try trim * SimpleChat: Cleanup readme a bit, add one more chathistory length * SimpleChat:Stream:Initial handshake skeleton Parse the got stream responses and try extract the data from it. It allows for a part read to get a single data line or multiple data line. Inturn extract the json body and inturn the delta content/message in it. * SimpleChat: Move handling oneshot mode server response Move handling of the oneshot mode server response into SimpleChat. Also add plumbing for moving multipart server response into same. * SimpleChat: Move multi part server response handling in * SimpleChat: Add MultiPart Response handling, common trimming Add logic to call into multipart/stream server response handling. Move trimming of garbage at the end into the common handle_response helper. Add new global flag to control between oneshot and multipart/stream mode of fetching response. Allow same to be controlled by user. If in multipart/stream mode, send the stream flag to the server. * SimpleChat: show streamed generative text as it becomes available Now that the extracting of streamed generated text is implemented, add logic to show the same on the screen. * SimpleChat:DU: Add NewLines helper class To work with an array of new lines. Allow adding, appending, shifting, ... * SimpleChat:DU: Make NewLines shift more robust and flexible * SimpleChat:HandleResponseMultiPart using NewLines helper Make handle_response_multipart logic better and cleaner. Now it allows for working with the situation, where the delta data line got from server in stream mode, could be split up when recving, but still the logic will handle it appropriately. ALERT: Rather except (for now) for last data line wrt a request's response. * SimpleChat: Disable console debug by default by making it dummy Parallely save a reference to the original func. * SimpleChat:MultiPart/Stream flow cleanup Dont try utf8-decode and newlines-add_append if no data to work on. If there is no more data to get (ie done is set), then let NewLines instance return line without newline at end, So that we dont miss out on any last-data-line without newline kind of scenario. Pass stream flag wrt utf-8 decode, so that if any multi-byte char is only partly present in the passed buffer, it can be accounted for along with subsequent buffer. At sametime, bcas of utf-8's characteristics there shouldnt be any unaccounted bytes at end, for valid block of utf8 data split across chunks, so not bothering calling with stream set to false at end. LATER: Look at TextDecoder's implementation, for any over intelligence, it may be doing.. If needed, one can use done flag to account wrt both cases. * SimpleChat: Move baseUrl to Me and inturn gMe This should allow easy updating of the base url at runtime by the end user. * SimpleChat:UI: Add input element helper * SimpleChat: Add support for changing the base url This ensures that if the user is running the server with a different port or wants to try connect to server on a different machine, then this can be used. * SimpleChat: Move request headers into Me and gMe Inturn allow Authorization to be sent, if not empty. * SimpleChat: Rather need to use append to insert headers * SimpleChat: Allow Authorization header to be set by end user * SimpleChat:UI+: Return div and element wrt creatediv helpers use it to set placeholder wrt Authorization header. Also fix copy-paste oversight. * SimpleChat: readme wrt authorization, maybe minimal openai testing * SimpleChat: model request field for openai/equivalent compat May help testing with openai/equivalent web services, if they require this field. * SimpleChat: readme stream-utf-8 trim-english deps, exception2error * Readme: Add a entry for simplechat in the http server section * SimpleChat:WIP:Collate internally, Stream mode Trap exceptions This can help ensure that data fetched till that point, can be made use of, rather than losing it. On some platforms, the time taken wrt generating a long response, may lead to the network connection being broken when it enters some user-no-interaction related power saving mode. * SimpleChat:theResp-origMsg: Undo a prev change to fix non trim When the response handling was moved into SimpleChat, I had changed a flow bit unnecessarily and carelessly, which resulted in the non trim flow, missing out on retaining the ai assistant response. This has been fixed now. * SimpleChat: Save message internally in handle_response itself This ensures that throwing the caught exception again for higher up logic, doesnt lose the response collated till that time. Go through theResp.assistant in catch block, just to keep simple consistency wrt backtracing just in case. Update the readme file. * SimpleChat:Cleanup: Add spacing wrt shown req-options * SimpleChat:UI: CreateDiv Divs map to GridX2 class This allows the settings ui to be cleaner structured. * SimpleChat: Show Non SettingsUI config field by default * SimpleChat: Allow for multiline system prompt Convert SystemPrompt into a textarea with 2 rows. Reduce user-input-textarea to 2 rows from 3, so that overall vertical space usage remains same. Shorten usage messages a bit, cleanup to sync with settings ui. * SimpleChat: Add basic skeleton for saving and loading chat Inturn when ever a chat message (system/user/model) is added, the chat will be saved into browser's localStorage. * SimpleChat:ODS: Add a prefix to chatid wrt ondiskstorage key * SimpleChat:ODS:WIP:TMP: Add UI to load previously saved chat This is a temporary flow * SimpleChat:ODS:Move restore/load saved chat btn setup to Me This also allows being able to set the common system prompt ui element to loaded chat's system prompt. * SimpleChat:Readme updated wrt save and restore chat session info * SimpleChat:Show chat session restore button, only if saved session * SimpleChat: AutoCreate ChatRequestOptions settings to an extent * SimpleChat: Update main README wrt usage with server --- README.md | 2 + .../server/public_simplechat/datautils.mjs | 266 +++++++++ examples/server/public_simplechat/index.html | 24 +- examples/server/public_simplechat/readme.md | 132 ++++- .../server/public_simplechat/simplechat.css | 11 + .../server/public_simplechat/simplechat.js | 558 +++++++++++++----- examples/server/public_simplechat/ui.mjs | 211 +++++++ 7 files changed, 1029 insertions(+), 175 deletions(-) create mode 100644 examples/server/public_simplechat/datautils.mjs create mode 100644 examples/server/public_simplechat/ui.mjs diff --git a/README.md b/README.md index 4791f84af..8680460aa 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,8 @@ Typically finetunes of the base models below are supported as well. [llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. +[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser. + **Bindings:** - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) diff --git a/examples/server/public_simplechat/datautils.mjs b/examples/server/public_simplechat/datautils.mjs new file mode 100644 index 000000000..75159d6b1 --- /dev/null +++ b/examples/server/public_simplechat/datautils.mjs @@ -0,0 +1,266 @@ +//@ts-check +// Helpers to work with different data types +// by Humans for All +// + +/** + * Given the limited context size of local LLMs and , many a times when context gets filled + * between the prompt and the response, it can lead to repeating text garbage generation. + * And many a times setting penalty wrt repeatation leads to over-intelligent garbage + * repeatation with slight variations. These garbage inturn can lead to overloading of the + * available model context, leading to less valuable response for subsequent prompts/queries, + * if chat history is sent to ai model. + * + * So two simple minded garbage trimming logics are experimented below. + * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and + * * another based on char-histogram-driven garbage trimming. + * * in future characteristic of histogram over varying lengths could be used to allow for + * a more aggressive and adaptive trimming logic. + */ + + +/** + * Simple minded logic to help remove repeating garbage at end of the string. + * The repeatation needs to be perfectly matching. + * + * The logic progressively goes on probing for longer and longer substring based + * repeatation, till there is no longer repeatation. Inturn picks the one with + * the longest chain. + * + * @param {string} sIn + * @param {number} maxSubL + * @param {number} maxMatchLenThreshold + */ +export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) { + let rCnt = [0]; + let maxMatchLen = maxSubL; + let iMML = -1; + for(let subL=1; subL < maxSubL; subL++) { + rCnt.push(0); + let i; + let refS = sIn.substring(sIn.length-subL, sIn.length); + for(i=sIn.length; i > 0; i -= subL) { + let curS = sIn.substring(i-subL, i); + if (refS != curS) { + let curMatchLen = rCnt[subL]*subL; + if (maxMatchLen < curMatchLen) { + maxMatchLen = curMatchLen; + iMML = subL; + } + break; + } + rCnt[subL] += 1; + } + } + console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt); + if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) { + return {trimmed: false, data: sIn}; + } + console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen); + let iEnd = sIn.length - maxMatchLen; + return { trimmed: true, data: sIn.substring(0, iEnd) }; +} + + +/** + * Simple minded logic to help remove repeating garbage at end of the string, till it cant. + * If its not able to trim, then it will try to skip a char at end and then trim, a few times. + * This ensures that even if there are multiple runs of garbage with different patterns, the + * logic still tries to munch through them. + * + * @param {string} sIn + * @param {number} maxSubL + * @param {number | undefined} [maxMatchLenThreshold] + */ +export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) { + let sCur = sIn; + let sSaved = ""; + let iTry = 0; + while(true) { + let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold); + if (got.trimmed != true) { + if (iTry == 0) { + sSaved = got.data; + } + iTry += 1; + if (iTry >= skipMax) { + return sSaved; + } + got.data = got.data.substring(0,got.data.length-1); + } else { + iTry = 0; + } + sCur = got.data; + } +} + + +/** + * A simple minded try trim garbage at end using histogram driven characteristics. + * There can be variation in the repeatations, as long as no new char props up. + * + * This tracks the chars and their frequency in a specified length of substring at the end + * and inturn checks if moving further into the generated text from the end remains within + * the same char subset or goes beyond it and based on that either trims the string at the + * end or not. This allows to filter garbage at the end, including even if there are certain + * kind of small variations in the repeated text wrt position of seen chars. + * + * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that + * a given type of char ie numerals or alphabets or other types dont cross the specified + * maxType limit. This allows intermixed text garbage to be identified and trimmed. + * + * ALERT: This is not perfect and only provides a rough garbage identification logic. + * Also it currently only differentiates between character classes wrt english. + * + * @param {string} sIn + * @param {number} maxType + * @param {number} maxUniq + * @param {number} maxMatchLenThreshold + */ +export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) { + if (sIn.length < maxMatchLenThreshold) { + return { trimmed: false, data: sIn }; + } + let iAlp = 0; + let iNum = 0; + let iOth = 0; + // Learn + let hist = {}; + let iUniq = 0; + for(let i=0; i= maxUniq) { + break; + } + hist[c] = 1; + } + } + console.debug("DBUG:TrimHistGarbage:", hist); + if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) { + return { trimmed: false, data: sIn }; + } + // Catch and Trim + for(let i=0; i < sIn.length; i++) { + let c = sIn[sIn.length-1-i]; + if (!(c in hist)) { + if (i < maxMatchLenThreshold) { + return { trimmed: false, data: sIn }; + } + console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i); + return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) }; + } + } + console.debug("DBUG:TrimHistGarbage:Trimmed fully"); + return { trimmed: true, data: "" }; +} + +/** + * Keep trimming repeatedly using hist_garbage logic, till you no longer can. + * This ensures that even if there are multiple runs of garbage with different patterns, + * the logic still tries to munch through them. + * + * @param {any} sIn + * @param {number} maxType + * @param {number} maxUniq + * @param {number} maxMatchLenThreshold + */ +export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) { + let sCur = sIn; + while (true) { + let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold); + if (!got.trimmed) { + return got.data; + } + sCur = got.data; + } +} + +/** + * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as + * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying. + * @param {string} sIn + */ +export function trim_garbage_at_end(sIn) { + let sCur = sIn; + for(let i=0; i<2; i++) { + sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72); + sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12); + } + return sCur; +} + + +/** + * NewLines array helper. + * Allow for maintaining a list of lines. + * Allow for a line to be builtup/appended part by part. + */ +export class NewLines { + + constructor() { + /** @type {string[]} */ + this.lines = []; + } + + /** + * Extracts lines from the passed string and inturn either + * append to a previous partial line or add a new line. + * @param {string} sLines + */ + add_append(sLines) { + let aLines = sLines.split("\n"); + let lCnt = 0; + for(let line of aLines) { + lCnt += 1; + // Add back newline removed if any during split + if (lCnt < aLines.length) { + line += "\n"; + } else { + if (sLines.endsWith("\n")) { + line += "\n"; + } + } + // Append if required + if (lCnt == 1) { + let lastLine = this.lines[this.lines.length-1]; + if (lastLine != undefined) { + if (!lastLine.endsWith("\n")) { + this.lines[this.lines.length-1] += line; + continue; + } + } + } + // Add new line + this.lines.push(line); + } + } + + /** + * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest] + * Optionally control whether only full lines (ie those with newline at end) will be returned + * or will a partial line without a newline at end (can only be the last line) be returned. + * @param {boolean} bFullWithNewLineOnly + */ + shift(bFullWithNewLineOnly=true) { + let line = this.lines[0]; + if (line == undefined) { + return undefined; + } + if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){ + return undefined; + } + return this.lines.shift(); + } + +} diff --git a/examples/server/public_simplechat/index.html b/examples/server/public_simplechat/index.html index 1a1a34208..f6413016f 100644 --- a/examples/server/public_simplechat/index.html +++ b/examples/server/public_simplechat/index.html @@ -8,21 +8,23 @@ - + +
-
+

SimpleChat

-
- - -
+
@@ -30,7 +32,7 @@
- +

@@ -40,7 +42,7 @@
- +
diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md index de0dfc99d..36a46885d 100644 --- a/examples/server/public_simplechat/readme.md +++ b/examples/server/public_simplechat/readme.md @@ -11,18 +11,29 @@ in a simple way with minimal code from a common code base. Inturn additionally i multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their own system prompts. +This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated, +or potentially as it is being generated, in a streamed manner from the server/ai-model. + +Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you +open SimpleChat, option is provided to restore the old chat session, if a matching one exists. + The UI follows a responsive web design so that the layout can adapt to available display space in a usable enough manner, in general. Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool -console. +console. Parallely some of the directly useful to end-user settings can also be changed using the provided +settings ui. -NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and -culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude -form of old messages culling can be achieved. +NOTE: Current web service api doesnt expose the model context length directly, so client logic doesnt provide +any adaptive culling of old messages nor of replacing them with summary of their content etal. However there +is a optional sliding window based chat logic, which provides a simple minded culling of old messages from +the chat history before sending to the ai model. -NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants -they can update the js file or equivalent member in gMe as needed. +NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionaly stream for now. +However if someone wants they can update the js file or equivalent member in gMe as needed. + +NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very +limited / minimal way. One will need to set model, openai url and authorization bearer key in settings ui. ## usage @@ -52,9 +63,15 @@ Open this simple web front end from your local browser Once inside -* Select between chat and completion mode. By default it is set to chat mode. +* If you want to, you can change many of the default global settings + * the base url (ie ip addr / domain name, port) + * chat (default) vs completion mode + * try trim garbage in response or not + * amount of chat history in the context sent to server/ai-model + * oneshot or streamed mode. * In completion mode + * one normally doesnt use a system prompt in completion mode. * logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message. If the model requires any prefix wrt user role messages, then the end user has to explicitly add the needed prefix, when they enter their chat message. @@ -88,12 +105,16 @@ Once inside * Wait for the logic to communicate with the server and get the response. * the user is not allowed to enter any fresh query during this time. * the user input box will be disabled and a working message will be shown in it. + * if trim garbage is enabled, the logic will try to trim repeating text kind of garbage to some extent. * just refresh the page, to reset wrt the chat history and or system prompt and start afresh. * Using NewChat one can start independent chat sessions. * two independent chat sessions are setup by default. +* When you want to print, switching ChatHistoryInCtxt to Full and clicking on the chat session button of + interest, will display the full chat history till then wrt same, if you want full history for printing. + ## Devel note @@ -104,14 +125,31 @@ by developers who may not be from web frontend background (so inturn may not be end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things. And given that the idea is also to help explore/experiment for developers, some flexibility is provided -to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented -to explore some of the end points and ideas/implications around them. +to change behaviour easily using the devel-tools/console or provided minimal settings ui (wrt few aspects). +Skeletal logic has been implemented to explore some of the end points and ideas/implications around them. ### General Me/gMe consolidates the settings which control the behaviour into one object. One can see the current settings, as well as change/update them using browsers devel-tool/console. +It is attached to the document object. Some of these can also be updated using the Settings UI. + + baseURL - the domain-name/ip-address and inturn the port to send the request. + + bStream - control between oneshot-at-end and live-stream-as-its-generated collating and showing + of the generated response. + + the logic assumes that the text sent from the server follows utf-8 encoding. + + in streaming mode - if there is any exception, the logic traps the same and tries to ensure + that text generated till then is not lost. + + if a very long text is being generated, which leads to no user interaction for sometime and + inturn the machine goes into power saving mode or so, the platform may stop network connection, + leading to exception. + + apiEP - select between /completions and /chat/completions endpoint provided by the server/ai-model. bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when communicating with the server or only sends the latest user query/message. @@ -119,6 +157,19 @@ One can see the current settings, as well as change/update them using browsers d bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the messages that get inserted into prompt field wrt /Completion endpoint. + bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be + trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of + subsequent chat history. At the same time the actual trimmed text is shown to the user, once + when it was generated, so user can check if any useful info/data was there in the response. + + One may be able to request the ai-model to continue (wrt the last response) (if chat-history + is enabled as part of the chat-history-in-context setting), and chances are the ai-model will + continue starting from the trimmed part, thus allows long response to be recovered/continued + indirectly, in many cases. + + The histogram/freq based trimming logic is currently tuned for english language wrt its + is-it-a-alpabetic|numeral-char regex match logic. + chatRequestOptions - maintains the list of options/fields to send along with chat request, irrespective of whether /chat/completions or /completions endpoint. @@ -126,6 +177,14 @@ One can see the current settings, as well as change/update them using browsers d modify the existing options value or remove them, for now you can update this global var using browser's development-tools/console. + For string and numeric fields in chatRequestOptions, including even those added by a user + at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto + created. + + headers - maintains the list of http headers sent when request is made to the server. By default + Content-Type is set to application/json. Additionally Authorization entry is provided, which can + be set if needed using the settings ui. + iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end. This is disabled by default. However if enabled, then in addition to latest system message, only the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses @@ -140,7 +199,8 @@ One can see the current settings, as well as change/update them using browsers d By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the implications of loading of the ai-model's context window by chat history, wrt chat response to -some extent in a simple crude way. +some extent in a simple crude way. You may also want to control the context size enabled when +the server loads ai-model, on the server end. Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js @@ -149,28 +209,15 @@ matter clearing site data, dont directly override site caching in all cases. Wor have to change port. Or in dev tools of browser, you may be able to disable caching fully. -Concept of multiple chat sessions with different servers, as well as saving and restoring of -those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and -its instances relatively easily, however given the current goal of keeping this simple, it has -not been added, for now. +Currently the server to communicate with is maintained globally and not as part of a specific +chat session. So if one changes the server ip/url in setting, then all chat sessions will auto +switch to this new server, when you try using those sessions. By switching between chat.add_system_begin/anytime, one can control whether one can change the system prompt, anytime during the conversation or only at the beginning. -read_json_early, is to experiment with reading json response data early on, if available, -so that user can be shown generated data, as and when it is being generated, rather than -at the end when full data is available. - - the server flow doesnt seem to be sending back data early, atleast for request (inc options) - that is currently sent. - - if able to read json data early on in future, as and when ai model is generating data, then - this helper needs to indirectly update the chat div with the recieved data, without waiting - for the overall data to be available. - - ### Default setup By default things are setup to try and make the user experience a bit better, if possible. @@ -179,7 +226,8 @@ However a developer when testing the server of ai-model may want to change these Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of full chat history. This way if there is any response with garbage/repeatation, it doesnt -mess with things beyond the next question/request/query, in some ways. +mess with things beyond the next question/request/query, in some ways. The trim garbage +option also tries to help avoid issues with garbage in the context to an extent. Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space available wrt next query-response. However dont forget that the server when started should @@ -189,11 +237,33 @@ also be started with a model context size of 1k or more, to be on safe side. internal n_predict, for now add the same here on the client side, maybe later add max_tokens to /completions endpoint handling code on server side. -Frequency and presence penalty fields are set to 1.2 in the set of fields sent to server -along with the user query. So that the model is partly set to try avoid repeating text in -its response. +NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions +wrt the set of fields sent to server along with the user query. To check how the model behaves +wrt repeatations in general in the generated text response. -A end-user can change these behaviour by editing gMe from browser's devel-tool/console. +A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by +using the providing settings ui. + + +### OpenAi / Equivalent API WebService + +One may be abe to handshake with OpenAI/Equivalent api web service's /chat/completions endpoint +for a minimal chatting experimentation by setting the below. + +* the baseUrl in settings ui + * https://api.openai.com/v1 or similar + +* Wrt request body - gMe.chatRequestOptions + * model (settings ui) + * any additional fields if required in future + +* Wrt request headers - gMe.headers + * Authorization (available through settings ui) + * Bearer THE_OPENAI_API_KEY + * any additional optional header entries like "OpenAI-Organization", "OpenAI-Project" or so + +NOTE: Not tested, as there is no free tier api testing available. However logically this might +work. ## At the end diff --git a/examples/server/public_simplechat/simplechat.css b/examples/server/public_simplechat/simplechat.css index 20c738b12..13bfb80b4 100644 --- a/examples/server/public_simplechat/simplechat.css +++ b/examples/server/public_simplechat/simplechat.css @@ -21,6 +21,17 @@ .role-user { background-color: lightgray; } +.role-trim { + background-color: lightpink; +} + +.gridx2 { + display: grid; + grid-template-columns: repeat(2, 1fr); + border-bottom-style: dotted; + border-bottom-width: thin; + border-bottom-color: lightblue; +} .flex-grow { flex-grow: 1; diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 0c48da879..25afb2564 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -2,6 +2,9 @@ // A simple completions and chat/completions test related web front end logic // by Humans for All +import * as du from "./datautils.mjs"; +import * as ui from "./ui.mjs" + class Roles { static System = "system"; static User = "user"; @@ -9,40 +12,65 @@ class Roles { } class ApiEP { - static Chat = "chat"; - static Completion = "completion"; + static Type = { + Chat: "chat", + Completion: "completion", + } + static UrlSuffix = { + 'chat': `/chat/completions`, + 'completion': `/completions`, + } + + /** + * Build the url from given baseUrl and apiEp id. + * @param {string} baseUrl + * @param {string} apiEP + */ + static Url(baseUrl, apiEP) { + if (baseUrl.endsWith("/")) { + baseUrl = baseUrl.substring(0, baseUrl.length-1); + } + return `${baseUrl}${this.UrlSuffix[apiEP]}`; + } + } + let gUsageMsg = `

Usage

    -
  • Set system prompt above, to try control ai response charactersitic, if model supports same.
  • +
  • System prompt above, to try control ai response characteristics.
    • -
    • Completion mode normally wont have a system prompt.
    • +
    • Completion mode - no system prompt normally.
    +
  • Use shift+enter for inserting enter/newline.
  • Enter your query to ai assistant below.
  • -
      -
    • Completion mode doesnt insert user/role: prefix implicitly.
    • -
    • Use shift+enter for inserting enter/newline.
    • -
  • Default ContextWindow = [System, Last Query+Resp, Cur Query].
    • -
    • experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand
    • +
    • ChatHistInCtxt, MaxTokens, ModelCtxt window to expand
`; + /** @typedef {{role: string, content: string}[]} ChatMessages */ +/** @typedef {{iLastSys: number, xchat: ChatMessages}} SimpleChatODS */ + class SimpleChat { - constructor() { + /** + * @param {string} chatId + */ + constructor(chatId) { + this.chatId = chatId; /** * Maintain in a form suitable for common LLM web service chat/completions' messages entry * @type {ChatMessages} */ this.xchat = []; this.iLastSys = -1; + this.latestResponse = ""; } clear() { @@ -50,6 +78,27 @@ class SimpleChat { this.iLastSys = -1; } + ods_key() { + return `SimpleChat-${this.chatId}` + } + + save() { + /** @type {SimpleChatODS} */ + let ods = {iLastSys: this.iLastSys, xchat: this.xchat}; + localStorage.setItem(this.ods_key(), JSON.stringify(ods)); + } + + load() { + let sods = localStorage.getItem(this.ods_key()); + if (sods == null) { + return; + } + /** @type {SimpleChatODS} */ + let ods = JSON.parse(sods); + this.iLastSys = ods.iLastSys; + this.xchat = ods.xchat; + } + /** * Recent chat messages. * If iRecentUserMsgCnt < 0 @@ -94,6 +143,15 @@ class SimpleChat { return rchat; } + /** + * Collate the latest response from the server/ai-model, as it is becoming available. + * This is mainly useful for the stream mode. + * @param {string} content + */ + append_response(content) { + this.latestResponse += content; + } + /** * Add an entry into xchat * @param {string} role @@ -107,6 +165,7 @@ class SimpleChat { if (role == Roles.System) { this.iLastSys = this.xchat.length - 1; } + this.save(); return true; } @@ -121,10 +180,8 @@ class SimpleChat { } let last = undefined; for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) { - let entry = document.createElement("p"); + let entry = ui.el_create_append_p(`${x.role}: ${x.content}`, div); entry.className = `role-${x.role}`; - entry.innerText = `${x.role}: ${x.content}`; - div.appendChild(entry); last = entry; } if (last !== undefined) { @@ -132,21 +189,45 @@ class SimpleChat { } else { if (bClear) { div.innerHTML = gUsageMsg; + gMe.setup_load(div, this); gMe.show_info(div); } } + return last; + } + + /** + * Setup the fetch headers. + * It picks the headers from gMe.headers. + * It inserts Authorization only if its non-empty. + * @param {string} apiEP + */ + fetch_headers(apiEP) { + let headers = new Headers(); + for(let k in gMe.headers) { + let v = gMe.headers[k]; + if ((k == "Authorization") && (v.trim() == "")) { + continue; + } + headers.append(k, v); + } + return headers; } /** * Add needed fields wrt json object to be sent wrt LLM web services completions endpoint. * The needed fields/options are picked from a global object. + * Add optional stream flag, if required. * Convert the json into string. * @param {Object} obj */ - request_jsonstr(obj) { + request_jsonstr_extend(obj) { for(let k in gMe.chatRequestOptions) { obj[k] = gMe.chatRequestOptions[k]; } + if (gMe.bStream) { + obj["stream"] = true; + } return JSON.stringify(obj); } @@ -157,7 +238,7 @@ class SimpleChat { let req = { messages: this.recent_chat(gMe.iRecentUserMsgCnt), } - return this.request_jsonstr(req); + return this.request_jsonstr_extend(req); } /** @@ -180,7 +261,60 @@ class SimpleChat { let req = { prompt: prompt, } - return this.request_jsonstr(req); + return this.request_jsonstr_extend(req); + } + + /** + * Return a string form of json object suitable for specified api endpoint. + * @param {string} apiEP + */ + request_jsonstr(apiEP) { + if (apiEP == ApiEP.Type.Chat) { + return this.request_messages_jsonstr(); + } else { + return this.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix); + } + } + + /** + * Extract the ai-model/assistant's response from the http response got. + * Optionally trim the message wrt any garbage at the end. + * @param {any} respBody + * @param {string} apiEP + */ + response_extract(respBody, apiEP) { + let assistant = ""; + if (apiEP == ApiEP.Type.Chat) { + assistant = respBody["choices"][0]["message"]["content"]; + } else { + try { + assistant = respBody["choices"][0]["text"]; + } catch { + assistant = respBody["content"]; + } + } + return assistant; + } + + /** + * Extract the ai-model/assistant's response from the http response got in streaming mode. + * @param {any} respBody + * @param {string} apiEP + */ + response_extract_stream(respBody, apiEP) { + let assistant = ""; + if (apiEP == ApiEP.Type.Chat) { + if (respBody["choices"][0]["finish_reason"] !== "stop") { + assistant = respBody["choices"][0]["delta"]["content"]; + } + } else { + try { + assistant = respBody["choices"][0]["text"]; + } catch { + assistant = respBody["content"]; + } + } + return assistant; } /** @@ -239,53 +373,99 @@ class SimpleChat { return sysPrompt; } -} - -let gBaseURL = "http://127.0.0.1:8080"; -let gChatURL = { - 'chat': `${gBaseURL}/chat/completions`, - 'completion': `${gBaseURL}/completions`, -} - - -/** - * Set the class of the children, based on whether it is the idSelected or not. - * @param {HTMLDivElement} elBase - * @param {string} idSelected - * @param {string} classSelected - * @param {string} classUnSelected - */ -function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") { - for(let child of elBase.children) { - if (child.id == idSelected) { - child.className = classSelected; - } else { - child.className = classUnSelected; + /** + * Handle the multipart response from server/ai-model + * @param {Response} resp + * @param {string} apiEP + * @param {HTMLDivElement} elDiv + */ + async handle_response_multipart(resp, apiEP, elDiv) { + let elP = ui.el_create_append_p("", elDiv); + if (!resp.body) { + throw Error("ERRR:SimpleChat:SC:HandleResponseMultiPart:No body..."); } + let tdUtf8 = new TextDecoder("utf-8"); + let rr = resp.body.getReader(); + this.latestResponse = ""; + let xLines = new du.NewLines(); + while(true) { + let { value: cur, done: done } = await rr.read(); + if (cur) { + let curBody = tdUtf8.decode(cur, {stream: true}); + console.debug("DBUG:SC:PART:Str:", curBody); + xLines.add_append(curBody); + } + while(true) { + let curLine = xLines.shift(!done); + if (curLine == undefined) { + break; + } + if (curLine.trim() == "") { + continue; + } + if (curLine.startsWith("data:")) { + curLine = curLine.substring(5); + } + let curJson = JSON.parse(curLine); + console.debug("DBUG:SC:PART:Json:", curJson); + this.append_response(this.response_extract_stream(curJson, apiEP)); + } + elP.innerText = this.latestResponse; + elP.scrollIntoView(false); + if (done) { + break; + } + } + console.debug("DBUG:SC:PART:Full:", this.latestResponse); + return this.latestResponse; } -} -/** - * Create button and set it up. - * @param {string} id - * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback - * @param {string | undefined} name - * @param {string | undefined} innerText - */ -function el_create_button(id, callback, name=undefined, innerText=undefined) { - if (!name) { - name = id; + /** + * Handle the oneshot response from server/ai-model + * @param {Response} resp + * @param {string} apiEP + */ + async handle_response_oneshot(resp, apiEP) { + let respBody = await resp.json(); + console.debug(`DBUG:SimpleChat:SC:${this.chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`); + return this.response_extract(respBody, apiEP); } - if (!innerText) { - innerText = id; + + /** + * Handle the response from the server be it in oneshot or multipart/stream mode. + * Also take care of the optional garbage trimming. + * @param {Response} resp + * @param {string} apiEP + * @param {HTMLDivElement} elDiv + */ + async handle_response(resp, apiEP, elDiv) { + let theResp = { + assistant: "", + trimmed: "", + } + if (gMe.bStream) { + try { + theResp.assistant = await this.handle_response_multipart(resp, apiEP, elDiv); + this.latestResponse = ""; + } catch (error) { + theResp.assistant = this.latestResponse; + this.add(Roles.Assistant, theResp.assistant); + this.latestResponse = ""; + throw error; + } + } else { + theResp.assistant = await this.handle_response_oneshot(resp, apiEP); + } + if (gMe.bTrimGarbage) { + let origMsg = theResp.assistant; + theResp.assistant = du.trim_garbage_at_end(origMsg); + theResp.trimmed = origMsg.substring(theResp.assistant.length); + } + this.add(Roles.Assistant, theResp.assistant); + return theResp; } - let btn = document.createElement("button"); - btn.id = id; - btn.name = name; - btn.innerText = innerText; - btn.addEventListener("click", callback); - return btn; + } @@ -302,14 +482,16 @@ class MultiChatUI { this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div")); this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn")); this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in")); - this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep")); + this.elDivHeading = /** @type{HTMLSelectElement} */(document.getElementById("heading")); this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div")); + this.elBtnSettings = /** @type{HTMLButtonElement} */(document.getElementById("settings")); this.validate_element(this.elInSystem, "system-in"); this.validate_element(this.elDivChat, "chat-div"); this.validate_element(this.elInUser, "user-in"); - this.validate_element(this.elSelectApiEP, "api-ep"); + this.validate_element(this.elDivHeading, "heading"); this.validate_element(this.elDivChat, "sessions-div"); + this.validate_element(this.elBtnSettings, "settings"); } /** @@ -350,13 +532,18 @@ class MultiChatUI { this.handle_session_switch(this.curChatId); } + this.elBtnSettings.addEventListener("click", (ev)=>{ + this.elDivChat.replaceChildren(); + gMe.show_settings(this.elDivChat); + }); + this.elBtnUser.addEventListener("click", (ev)=>{ if (this.elInUser.disabled) { return; } - this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{ + this.handle_user_submit(this.curChatId, gMe.apiEP).catch((/** @type{Error} */reason)=>{ let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`; - console.debug(msg.replace("\n", ":")); + console.error(msg.replace("\n", ":")); alert(msg); this.ui_reset_userinput(); }); @@ -377,6 +564,8 @@ class MultiChatUI { // allow user to insert enter into the system prompt using shift+enter. // while just pressing enter key will lead to setting the system prompt. if ((ev.key === "Enter") && (!ev.shiftKey)) { + let value = this.elInSystem.value; + this.elInSystem.value = value.substring(0,value.length-1); let chat = this.simpleChats[this.curChatId]; chat.add_system_anytime(this.elInSystem.value, this.curChatId); chat.show(this.elDivChat); @@ -392,34 +581,12 @@ class MultiChatUI { * @param {boolean} bSwitchSession */ new_chat_session(chatId, bSwitchSession=false) { - this.simpleChats[chatId] = new SimpleChat(); + this.simpleChats[chatId] = new SimpleChat(chatId); if (bSwitchSession) { this.handle_session_switch(chatId); } } - /** - * Try read json response early, if available. - * @param {Response} resp - */ - async read_json_early(resp) { - if (!resp.body) { - throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body..."); - } - let tdUtf8 = new TextDecoder("utf-8"); - let rr = resp.body.getReader(); - let gotBody = ""; - while(true) { - let { value: cur, done: done} = await rr.read(); - let curBody = tdUtf8.decode(cur); - console.debug("DBUG:SC:PART:", curBody); - gotBody += curBody; - if (done) { - break; - } - } - return JSON.parse(gotBody); - } /** * Handle user query submit request, wrt specified chat session. @@ -434,7 +601,7 @@ class MultiChatUI { // So if user wants to simulate a multi-chat based completion query, // they will have to enter the full thing, as a suitable multiline // user input/query. - if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) { + if ((apiEP == ApiEP.Type.Completion) && (gMe.bCompletionFreshChatAlways)) { chat.clear(); } @@ -447,41 +614,26 @@ class MultiChatUI { } chat.show(this.elDivChat); - let theBody; - let theUrl = gChatURL[apiEP] - if (apiEP == ApiEP.Chat) { - theBody = chat.request_messages_jsonstr(); - } else { - theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix); - } + let theUrl = ApiEP.Url(gMe.baseURL, apiEP); + let theBody = chat.request_jsonstr(apiEP); this.elInUser.value = "working..."; this.elInUser.disabled = true; console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`); + let theHeaders = chat.fetch_headers(apiEP); let resp = await fetch(theUrl, { method: "POST", - headers: { - "Content-Type": "application/json", - }, + headers: theHeaders, body: theBody, }); - let respBody = await resp.json(); - //let respBody = await this.read_json_early(resp); - console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`); - let assistantMsg; - if (apiEP == ApiEP.Chat) { - assistantMsg = respBody["choices"][0]["message"]["content"]; - } else { - try { - assistantMsg = respBody["choices"][0]["text"]; - } catch { - assistantMsg = respBody["content"]; - } - } - chat.add(Roles.Assistant, assistantMsg); + let theResp = await chat.handle_response(resp, apiEP, this.elDivChat); if (chatId == this.curChatId) { chat.show(this.elDivChat); + if (theResp.trimmed.length > 0) { + let p = ui.el_create_append_p(`TRIMMED:${theResp.trimmed}`, this.elDivChat); + p.className="role-trim"; + } } else { console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`); } @@ -500,7 +652,7 @@ class MultiChatUI { } elDiv.replaceChildren(); // Btn for creating new chat session - let btnNew = el_create_button("New CHAT", (ev)=> { + let btnNew = ui.el_create_button("New CHAT", (ev)=> { if (this.elInUser.disabled) { console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`); alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session"); @@ -514,7 +666,7 @@ class MultiChatUI { } this.new_chat_session(chatIdGot, true); this.create_session_btn(elDiv, chatIdGot); - el_children_config_class(elDiv, chatIdGot, "session-selected", ""); + ui.el_children_config_class(elDiv, chatIdGot, "session-selected", ""); }); elDiv.appendChild(btnNew); // Btns for existing chat sessions @@ -528,7 +680,7 @@ class MultiChatUI { } create_session_btn(elDiv, cid) { - let btn = el_create_button(cid, (ev)=>{ + let btn = ui.el_create_button(cid, (ev)=>{ let target = /** @type{HTMLButtonElement} */(ev.target); console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`); if (this.elInUser.disabled) { @@ -537,7 +689,7 @@ class MultiChatUI { return; } this.handle_session_switch(target.id); - el_children_config_class(elDiv, target.id, "session-selected", ""); + ui.el_children_config_class(elDiv, target.id, "session-selected", ""); }); elDiv.appendChild(btn); return btn; @@ -567,46 +719,183 @@ class MultiChatUI { class Me { constructor() { + this.baseURL = "http://127.0.0.1:8080"; this.defaultChatIds = [ "Default", "Other" ]; this.multiChat = new MultiChatUI(); + this.bStream = true; this.bCompletionFreshChatAlways = true; this.bCompletionInsertStandardRolePrefix = false; + this.bTrimGarbage = true; this.iRecentUserMsgCnt = 2; + this.sRecentUserMsgCnt = { + "Full": -1, + "Last0": 1, + "Last1": 2, + "Last2": 3, + "Last4": 5, + }; + this.apiEP = ApiEP.Type.Chat; + this.headers = { + "Content-Type": "application/json", + "Authorization": "", // Authorization: Bearer OPENAI_API_KEY + } // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint. this.chatRequestOptions = { + "model": "gpt-3.5-turbo", "temperature": 0.7, "max_tokens": 1024, - "frequency_penalty": 1.2, - "presence_penalty": 1.2, - "n_predict": 1024 + "n_predict": 1024, + //"frequency_penalty": 1.2, + //"presence_penalty": 1.2, }; } /** + * Disable console.debug by mapping it to a empty function. + */ + debug_disable() { + this.console_debug = console.debug; + console.debug = () => { + + }; + } + + /** + * Setup the load saved chat ui. + * @param {HTMLDivElement} div + * @param {SimpleChat} chat + */ + setup_load(div, chat) { + if (!(chat.ods_key() in localStorage)) { + return; + } + div.innerHTML += `

Restore

+

Load previously saved chat session, if available

`; + let btn = ui.el_create_button(chat.ods_key(), (ev)=>{ + console.log("DBUG:SimpleChat:SC:Load", chat); + chat.load(); + queueMicrotask(()=>{ + chat.show(div); + this.multiChat.elInSystem.value = chat.get_system_latest(); + }); + }); + div.appendChild(btn); + } + + /** + * Show the configurable parameters info in the passed Div element. + * @param {HTMLDivElement} elDiv + * @param {boolean} bAll + */ + show_info(elDiv, bAll=false) { + + let p = ui.el_create_append_p("Settings (devel-tools-console document[gMe])", elDiv); + p.className = "role-system"; + + if (bAll) { + + ui.el_create_append_p(`baseURL:${this.baseURL}`, elDiv); + + ui.el_create_append_p(`Authorization:${this.headers["Authorization"]}`, elDiv); + + ui.el_create_append_p(`bStream:${this.bStream}`, elDiv); + + ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv); + + ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv); + + ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv); + + ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv); + + ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv); + + } + + ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv); + ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv); + + } + + /** + * Auto create ui input elements for fields in ChatRequestOptions + * Currently supports text and number field types. * @param {HTMLDivElement} elDiv */ - show_info(elDiv) { + show_settings_chatrequestoptions(elDiv) { + let typeDict = { + "string": "text", + "number": "number", + }; + let fs = document.createElement("fieldset"); + let legend = document.createElement("legend"); + legend.innerText = "ChatRequestOptions"; + fs.appendChild(legend); + elDiv.appendChild(fs); + for(const k in this.chatRequestOptions) { + let val = this.chatRequestOptions[k]; + let type = typeof(val); + if (!((type == "string") || (type == "number"))) { + continue; + } + let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{ + if (type == "number") { + val = Number(val); + } + this.chatRequestOptions[k] = val; + }); + fs.appendChild(inp.div); + } + } - var p = document.createElement("p"); - p.innerText = "Settings (devel-tools-console gMe)"; - p.className = "role-system"; - elDiv.appendChild(p); + /** + * Show settings ui for configurable parameters, in the passed Div element. + * @param {HTMLDivElement} elDiv + */ + show_settings(elDiv) { - var p = document.createElement("p"); - p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`; - elDiv.appendChild(p); + let inp = ui.el_creatediv_input("SetBaseURL", "BaseURL", "text", this.baseURL, (val)=>{ + this.baseURL = val; + }); + elDiv.appendChild(inp.div); - p = document.createElement("p"); - p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`; - elDiv.appendChild(p); + inp = ui.el_creatediv_input("SetAuthorization", "Authorization", "text", this.headers["Authorization"], (val)=>{ + this.headers["Authorization"] = val; + }); + inp.el.placeholder = "Bearer OPENAI_API_KEY"; + elDiv.appendChild(inp.div); - p = document.createElement("p"); - p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`; - elDiv.appendChild(p); + let bb = ui.el_creatediv_boolbutton("SetStream", "Stream", {true: "[+] yes stream", false: "[-] do oneshot"}, this.bStream, (val)=>{ + this.bStream = val; + }); + elDiv.appendChild(bb.div); - p = document.createElement("p"); - p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`; - elDiv.appendChild(p); + bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{ + this.bCompletionFreshChatAlways = val; + }); + elDiv.appendChild(bb.div); + + bb = ui.el_creatediv_boolbutton("SetCompletionInsertStandardRolePrefix", "CompletionInsertStandardRolePrefix", {true: "[+] yes insert", false: "[-] dont insert"}, this.bCompletionInsertStandardRolePrefix, (val)=>{ + this.bCompletionInsertStandardRolePrefix = val; + }); + elDiv.appendChild(bb.div); + + bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{ + this.bTrimGarbage = val; + }); + elDiv.appendChild(bb.div); + + let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{ + this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val]; + }); + elDiv.appendChild(sel.div); + + sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{ + this.apiEP = ApiEP.Type[val]; + }); + elDiv.appendChild(sel.div); + + this.show_settings_chatrequestoptions(elDiv); } @@ -619,6 +908,9 @@ let gMe; function startme() { console.log("INFO:SimpleChat:StartMe:Starting..."); gMe = new Me(); + gMe.debug_disable(); + document["gMe"] = gMe; + document["du"] = du; for (let cid of gMe.defaultChatIds) { gMe.multiChat.new_chat_session(cid); } diff --git a/examples/server/public_simplechat/ui.mjs b/examples/server/public_simplechat/ui.mjs new file mode 100644 index 000000000..b2d5b9aea --- /dev/null +++ b/examples/server/public_simplechat/ui.mjs @@ -0,0 +1,211 @@ +//@ts-check +// Helpers to work with html elements +// by Humans for All +// + + +/** + * Set the class of the children, based on whether it is the idSelected or not. + * @param {HTMLDivElement} elBase + * @param {string} idSelected + * @param {string} classSelected + * @param {string} classUnSelected + */ +export function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") { + for(let child of elBase.children) { + if (child.id == idSelected) { + child.className = classSelected; + } else { + child.className = classUnSelected; + } + } +} + +/** + * Create button and set it up. + * @param {string} id + * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback + * @param {string | undefined} name + * @param {string | undefined} innerText + */ +export function el_create_button(id, callback, name=undefined, innerText=undefined) { + if (!name) { + name = id; + } + if (!innerText) { + innerText = id; + } + let btn = document.createElement("button"); + btn.id = id; + btn.name = name; + btn.innerText = innerText; + btn.addEventListener("click", callback); + return btn; +} + +/** + * Create a para and set it up. Optionaly append it to a passed parent. + * @param {string} text + * @param {HTMLElement | undefined} elParent + * @param {string | undefined} id + */ +export function el_create_append_p(text, elParent=undefined, id=undefined) { + let para = document.createElement("p"); + para.innerText = text; + if (id) { + para.id = id; + } + if (elParent) { + elParent.appendChild(para); + } + return para; +} + +/** + * Create a button which represents bool value using specified text wrt true and false. + * When ever user clicks the button, it will toggle the value and update the shown text. + * + * @param {string} id + * @param {{true: string, false: string}} texts + * @param {boolean} defaultValue + * @param {function(boolean):void} cb + */ +export function el_create_boolbutton(id, texts, defaultValue, cb) { + let el = document.createElement("button"); + el["xbool"] = defaultValue; + el["xtexts"] = structuredClone(texts); + el.innerText = el["xtexts"][String(defaultValue)]; + if (id) { + el.id = id; + } + el.addEventListener('click', (ev)=>{ + el["xbool"] = !el["xbool"]; + el.innerText = el["xtexts"][String(el["xbool"])]; + cb(el["xbool"]); + }) + return el; +} + +/** + * Create a div wrapped button which represents bool value using specified text wrt true and false. + * @param {string} id + * @param {string} label + * @param {{ true: string; false: string; }} texts + * @param {boolean} defaultValue + * @param {(arg0: boolean) => void} cb + * @param {string} className + */ +export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, className="gridx2") { + let div = document.createElement("div"); + div.className = className; + let lbl = document.createElement("label"); + lbl.setAttribute("for", id); + lbl.innerText = label; + div.appendChild(lbl); + let btn = el_create_boolbutton(id, texts, defaultValue, cb); + div.appendChild(btn); + return { div: div, el: btn }; +} + + +/** + * Create a select ui element, with a set of options to select from. + * * options: an object which contains name-value pairs + * * defaultOption: the value whose name should be choosen, by default. + * * cb : the call back returns the name string of the option selected. + * + * @param {string} id + * @param {Object} options + * @param {*} defaultOption + * @param {function(string):void} cb + */ +export function el_create_select(id, options, defaultOption, cb) { + let el = document.createElement("select"); + el["xselected"] = defaultOption; + el["xoptions"] = structuredClone(options); + for(let cur of Object.keys(options)) { + let op = document.createElement("option"); + op.value = cur; + op.innerText = cur; + if (options[cur] == defaultOption) { + op.selected = true; + } + el.appendChild(op); + } + if (id) { + el.id = id; + el.name = id; + } + el.addEventListener('change', (ev)=>{ + let target = /** @type{HTMLSelectElement} */(ev.target); + console.log("DBUG:UI:Select:", id, ":", target.value); + cb(target.value); + }) + return el; +} + +/** + * Create a div wrapped select ui element, with a set of options to select from. + * + * @param {string} id + * @param {any} label + * @param {{ [x: string]: any; }} options + * @param {any} defaultOption + * @param {(arg0: string) => void} cb + * @param {string} className + */ +export function el_creatediv_select(id, label, options, defaultOption, cb, className="gridx2") { + let div = document.createElement("div"); + div.className = className; + let lbl = document.createElement("label"); + lbl.setAttribute("for", id); + lbl.innerText = label; + div.appendChild(lbl); + let sel = el_create_select(id, options,defaultOption, cb); + div.appendChild(sel); + return { div: div, el: sel }; +} + + +/** + * Create a input ui element. + * + * @param {string} id + * @param {string} type + * @param {any} defaultValue + * @param {function(any):void} cb + */ +export function el_create_input(id, type, defaultValue, cb) { + let el = document.createElement("input"); + el.type = type; + el.value = defaultValue; + if (id) { + el.id = id; + } + el.addEventListener('change', (ev)=>{ + cb(el.value); + }) + return el; +} + +/** + * Create a div wrapped input. + * + * @param {string} id + * @param {string} label + * @param {string} type + * @param {any} defaultValue + * @param {function(any):void} cb + * @param {string} className + */ +export function el_creatediv_input(id, label, type, defaultValue, cb, className="gridx2") { + let div = document.createElement("div"); + div.className = className; + let lbl = document.createElement("label"); + lbl.setAttribute("for", id); + lbl.innerText = label; + div.appendChild(lbl); + let el = el_create_input(id, type, defaultValue, cb); + div.appendChild(el); + return { div: div, el: el }; +} From 2e666832e6ac78194edf030bd1c295e21bdb022c Mon Sep 17 00:00:00 2001 From: Yazan Agha-Schrader Date: Sat, 1 Jun 2024 21:31:48 +0200 Subject: [PATCH 5/6] server : new UI (#7633) * ic * migrate my eary work * add the belonging stuff: css,favicon etc * de prompts * chore: Update HTML meta tags in index.html file * add api-key css classes * some necessary fixes * Add API key CSS classes and update styling in style.css * clean the code * move API to the top, rearrange param sliders. update css * add tooltips to the parameters with comprehensible explanations * fix FloatField and BoolField tooltips * fix grammar field width * use template literales for promptFormats.js * update const ModelGenerationInfo * remove ms per token, since not relevant for most webui users and use cases * add phi-3 prompt template * add phi3 to dropdown * add css class * update forgotten css theme * add user message suffix * fix chatml & add llama3 format * fix llama3 prompt template * more prompt format fixes * add more comon stop tokens * add missing char * do not separate with new line or comma * move prompt style * add hacky llama2 prompt solution, reduce redundancy in promptFormats.js * fix toggle state localstorage * add cmd-r prompt et reduce redundancy * set default prompt to empty * move files, clean code * fix css path * add a button to the new ui * move new ui to "/public" due to otherwise problematic CORS behaviour * include new ui in cpp * fix wrong link to old ui * renaming to ensure consistency * fix typos "prompt-format" -> "prompt-formats" * use correct indent * add new ui files to makefile * fix typo --- Makefile | 2 +- examples/server/CMakeLists.txt | 11 + examples/server/public/colorthemes.css | 402 ++++++ examples/server/public/index-new.html | 1178 ++++++++++++++++++ examples/server/public/index.html | 79 +- examples/server/public/prompt-formats.js | 331 +++++ examples/server/public/style.css | 954 ++++++++++++++ examples/server/public/system-prompts.js | 68 + examples/server/public/theme-beeninorder.css | 228 ++++ examples/server/public/theme-ketivah.css | 201 +++ examples/server/public/theme-mangotango.css | 216 ++++ examples/server/public/theme-playground.css | 221 ++++ examples/server/public/theme-polarnight.css | 253 ++++ examples/server/public/theme-snowstorm.css | 251 ++++ examples/server/server.cpp | 27 +- 15 files changed, 4418 insertions(+), 4 deletions(-) create mode 100755 examples/server/public/colorthemes.css create mode 100644 examples/server/public/index-new.html create mode 100644 examples/server/public/prompt-formats.js create mode 100755 examples/server/public/style.css create mode 100644 examples/server/public/system-prompts.js create mode 100755 examples/server/public/theme-beeninorder.css create mode 100755 examples/server/public/theme-ketivah.css create mode 100755 examples/server/public/theme-mangotango.css create mode 100755 examples/server/public/theme-playground.css create mode 100755 examples/server/public/theme-polarnight.css create mode 100755 examples/server/public/theme-snowstorm.css diff --git a/Makefile b/Makefile index dfb3bb2cd..c643fe0cc 100644 --- a/Makefile +++ b/Makefile @@ -832,7 +832,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 4b89c5302..dab709619 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -8,9 +8,20 @@ set(TARGET_SRCS httplib.h ) set(PUBLIC_ASSETS + colorthemes.css + style.css + theme-beeninorder.css + theme-ketivah.css + theme-mangotango.css + theme-playground.css + theme-polarnight.css + theme-snowstorm.css index.html + index-new.html index.js completion.js + system-prompts.js + prompt-formats.js json-schema-to-grammar.mjs ) foreach(asset ${PUBLIC_ASSETS}) diff --git a/examples/server/public/colorthemes.css b/examples/server/public/colorthemes.css new file mode 100755 index 000000000..b1e2b8b70 --- /dev/null +++ b/examples/server/public/colorthemes.css @@ -0,0 +1,402 @@ +@import url("theme-snowstorm.css"); +@import url("theme-polarnight.css"); +@import url("theme-ketivah.css"); +@import url("theme-mangotango.css"); +@import url("theme-playground.css"); +@import url("theme-beeninorder.css"); + +:root { +/* ---------- PRIMARY COLORS ----------------- */ +--primary-color-1: hsl(217.5, 26.7%, 94.1%); + --primary-color-1-hue: 217.5; + --primary-color-1-saturation: 26.7%; + --primary-color-1-lightness: 94.1%; + +--primary-color-2: hsl(218.2, 26.8%, 92.0%); + --primary-color-2-hue: 218.2; + --primary-color-2-saturation: 26.8%; + --primary-color-2-lightness: 92.0%; + +--primary-color-3: hsl(218.8, 27.9%, 88.0%); + --primary-color-3-hue: 218.8; + --primary-color-3-saturation: 27.9%; + --primary-color-3-lightness: 88.0%; + +--primary-color-4: hsl(218.8, 18.3%, 81.8%); + --primary-color-4-hue: 218.8; + --primary-color-4-saturation: 18.3%; + --primary-color-4-lightness: 81.8%; + + +/* ---------- SECONDARY COLORS --------------- */ +--secondary-color-1: hsl(220.0, 16.4%, 21.6%); + --secondary-color-1-hue: 220.0; + --secondary-color-1-saturation: 16.4%; + --secondary-color-1-lightness: 21.6%; + +--secondary-color-2: hsl(221.7, 16.3%, 27.6%); + --secondary-color-2-hue: 221.7; + --secondary-color-2-saturation: 16.3%; + --secondary-color-2-lightness: 27.6%; + +--secondary-color-3: hsl(220.0, 16.8%, 31.6%); + --secondary-color-3-hue: 220.0; + --secondary-color-3-saturation: 16.8%; + --secondary-color-3-lightness: 31.6%; + +--secondary-color-4: hsl(220.0, 16.5%, 35.7%); + --secondary-color-4-hue: 220.0; + --secondary-color-4-saturation: 16.5%; + --secondary-color-4-lightness: 35.7%; + + + +/* ----------- NUANCES COLORS ---------------- */ +--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%); + --theme-nuance-color-1-hue: 178.7; + --theme-nuance-color-1-saturation: 25.1%; + --theme-nuance-color-1-lightness: 64.9%; + +--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%); + --theme-nuance-color-2-hue: 193.3; + --theme-nuance-color-2-saturation: 43.4%; + --theme-nuance-color-2-lightness: 67.5%; + +--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%); + --theme-nuance-color-3-hue: 210.0; + --theme-nuance-color-3-saturation: 34.0%; + --theme-nuance-color-3-lightness: 63.1%; + +--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%); + --theme-nuance-color-4-hue: 213.1; + --theme-nuance-color-4-saturation: 32.0%; + --theme-nuance-color-4-lightness: 52.2%; + + + +/* ----------- ROYGP COLORS ------------------ */ +--theme-red-color: hsl(32.5, 80%, 50%); +--theme-orange-color: hsl(32.5, 70%, 45%); +--theme-yellow-color: hsl(40.0, 0.6%, 73.3%); +--theme-green-color: hsl(92.4, 27.8%, 64.7%); +--theme-purple-color: hsl(311.1, 20.2%, 63.1%); + + + +/* ------------------------------------------- */ +--background-color-1: var(--primary-color-1); +--background-color-2: var(--primary-color-2); +--background-color-3: var(--primary-color-3); +--background-color-4: var(--primary-color-4); + +--border-color-1: var(--primary-color-2); +--border-color-2: var(--primary-color-3); +--border-color-3: var(--primary-color-4); + +--border-focus-color: var(--theme-nuance-color-2); +--border-focus-shadow: var(--theme-nuance-color-1); + +--text-color-plain: var(--secondary-color-1); +--text-color-subtile-1: var(--secondary-color-2); +--text-color-subtile-2: var(--secondary-color-3); + +--code-background-color: var(--secondary-color-2); +--code-text-color: var(--primary-color-2); + +--ui-range-thumb-color: var(--theme-nuance-color-3); +--ui-range-thumb-border: var(--ui-ranger-thumb-color); + +--textarea-border-color: var(--secondary-color-4); + +--chat-id-color: var(--theme-nuance-color-4); + + + +/* ------------------------------------------- */ +--button-alert-text-hover: var(--primary-color-1); +--button-alert-color-hover: var(--theme-orange-color); +--button-alert-border-hover: var(--theme-orange-color); + +--button-alert-text-active: var(--primary-color-1); +--button-alert-color-active: var(--theme-red-color); +--button-alert-border-active: var(--theme-red-color); + + + +/* ----------- PRIMARY BUTTONS --------------- */ +/* - button should immediately catch the eye - */ +--button-primary-text: var(--secondary-color-1); +--button-primary-color: var(--theme-nuance-color-3); +--button-primary-border: var(--theme-nuance-color-3); + + +/* ---------hover---------- */ +--button-primary-text-hover: + hsl(217.5, + calc(var(--secondary-color-1-saturation) + 35%), + calc(var(--secondary-color-1-lightness) - 30%)); + +--button-primary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + +--button-primary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + +/* ---------active--------- */ +--button-primary-text-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 35%)); + +--button-primary-color-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 25%)); + +--button-primary-border-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 25%)); + + + +/* ---------- SECONDARY BUTTONS -------------- */ +/* these should NOT immediately catch the eye */ +--button-secondary-text: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 50%)); + +--button-secondary-color: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + +--button-secondary-border: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + +/* ---------hover---------- */ +--button-secondary-text-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 22%), + calc(var(--theme-nuance-color-3-lightness) + 1%)); + +--button-secondary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 22%), + calc(var(--theme-nuance-color-3-lightness) + 1%)); + + +/* ---------active--------- */ +--button-secondary-text-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) + 40%), + calc(var(--theme-nuance-color-3-lightness) - 55%)); + +--button-secondary-color-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 30%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-secondary-border-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 30%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + + + +/* ---------- TERTIARY BUTTONS --------------- */ +/* ---------- disabled buttons --------------- */ +--button-tertiary-text: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-tertiary-color: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +--button-tertiary-border: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +/* ---------hover---------- */ +--button-tertiary-text-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-tertiary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +--button-tertiary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); +} + +/* + +.theme-template { + + + If light theme: should go from bright to darker + If dark theme: should go from dark to brighter + ideally this should not be anything but steps of + gray or slightly variants from it + + --primary-color-1: #2E3440; + --primary-color-2: #3B4252; + --primary-color-3: #434C5E; + --primary-color-4: #4C566A; + + + + If light theme: should go from dark to brighter + If dark theme: should go from bright to darker + ideally this should not be anything but steps of + gray or slightly variants from it + + --secondary-color-1: #ECEFF4; + --secondary-color-2: #E5E9F0; + --secondary-color-3: #D8DEE9; + --secondary-color-4: #C8CED9; + + + + Choose wisely nuance colors. It is not easy to find + 4 harmonizing nuance colors. But keep in mind, that + only one accent color could work too. + + --theme-nuance-color-1: #8FBCBB; + --theme-nuance-color-2: #88C0D0; + --theme-nuance-color-3: #81A1C1; + --theme-nuance-color-4: #5E81AC; + + + + adapt the color red, orange, yellow, green, + purple to the 'mood' of your overall design + e.g is it low-contrast? vibrant? dynamic? etc + + --theme-red-color: #BF616A; + --theme-orange-color: #D08770; + --theme-yellow-color: #EBCB8B; + --theme-green-color: #A3BE8C; + --theme-purple-color: #B48EAD; + + + +NOTE: comment all those line `--- ...` out +------------------------------------------------ +--background-color-1: +--background-color-2: +--background-color-3: +--background-color-4: + +--border-color-1: +--border-color-2: +--border-color-3: + +--border-focus-color: +--border-focus-shadow: + +--text-color-plain: +--text-color-subtile-1: +--text-color-subtile-2: + +--code-background-color: +--code-text-color: + +--ui-range-thumb-color: +--ui-range-thumb-border: + +--textarea-border-color: + + + +------------------------------------------- +--button-alert-text-hover: +--button-alert-color-hover: +--button-alert-border-hover: + +--button-alert-text-active: +--button-alert-color-active: +--button-alert-border-active: + + + +----------- PRIMARY ----------------------- +--button should immediately catch the eye-- + +--button-primary-text: +--button-primary-color: +--button-primary-border: + + +---------hover---------- +--button-primary-text-hover: +--button-primary-color-hover: +--button-primary-border-hover: + + +---------active--------- +--button-primary-text-active: +--button-primary-color-active: +--button-primary-border-active: + + + +------------ SECONDARY ------------------------ +--button should NOT immediately catch the eye-- + +--button-secondary-text: +--button-secondary-color: +--button-secondary-border: + + +---------hover---------- +--button-secondary-text-hover: +--button-secondary-color-hover: +--button-secondary-border-hover: + + +---------active--------- +--button-secondary-text-active: +--button-secondary-color-active: +--button-secondary-border-active: + + + +---------- TERTIARY ----------------------- +---------- disabled buttons --------------- +--button-tertiary-text: +--button-tertiary-color: +--button-tertiary-border: + + +---------hover---------- +--button-tertiary-text: +--button-tertiary-color: +--button-tertiary-border: + +} + +*/ diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html new file mode 100644 index 000000000..d571c2779 --- /dev/null +++ b/examples/server/public/index-new.html @@ -0,0 +1,1178 @@ + + + + + + + + + llama.cpp - chat + + + + + + + + + +
+ +
+
+ + + diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 4c5a34d90..2f60a76e8 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -12,6 +12,18 @@ font-size: 90%; } + .grid-container { + display: grid; + grid-template-columns: auto auto auto; + padding: 10px; + } + + .grid-item { + padding: 5px; + /* font-size: 30px; */ + text-align: center; + } + #container { margin: 0em auto; display: flex; @@ -35,6 +47,67 @@ padding: 0.5em; } + h1 { + text-align: center; + } + + .customlink:link { + color: white; + background-color: #007aff; + font-weight: 600; + text-decoration: none; + float: right; + margin-top: 30px; + display: flex; + flex-direction: row; + gap: 0.5em; + justify-content: flex-end; + border-radius: 4px; + padding: 8px; + } + + .customlink:visited { + color: white; + background-color: #007aff; + font-weight: 600; + text-decoration: none; + float: right; + margin-top: 30px; + display: flex; + flex-direction: row; + gap: 0.5em; + justify-content: flex-end; + padding: 8px; + } + + .customlink:hover { + color: white; + background-color: #0070ee; + font-weight: 600; + text-decoration: none; + float: right; + margin-top: 30px; + display: flex; + flex-direction: row; + gap: 0.5em; + justify-content: flex-end; + padding: 8px; + } + + .customlink:active { + color: #0070ee; + background-color: #80b3ef; + font-weight: 600; + text-decoration: none; + float: right; + margin-top: 30px; + display: flex; + flex-direction: row; + gap: 0.5em; + justify-content: flex-end; + padding: 8px; + } + body { max-width: 600px; min-width: 300px; @@ -1035,7 +1108,11 @@ return html`
-

llama.cpp

+
+
+

llama.cpp

+ +
diff --git a/examples/server/public/prompt-formats.js b/examples/server/public/prompt-formats.js new file mode 100644 index 000000000..73ddb7187 --- /dev/null +++ b/examples/server/public/prompt-formats.js @@ -0,0 +1,331 @@ +// extended list +export const promptFormats = { + "alpaca": { + template: `{{prompt}}\n\n{{history}}\n\n{{char}}:`, + + historyTemplate: `### {{name}}:\n{{message}}`, + + char: "Response", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "Instruction", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "chatml": { + template: `<|im_start|>system\n{{prompt}}<|im_end|>\n{{history}}{{char}}`, + + historyTemplate: `<|im_start|>{{name}}\n{{message}}`, + + char: "assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "user", + userMsgPrefix: "", + userMsgSuffix: "<|im_end|>\n", + + stops: "" + }, + + // ---------------------------- + + "commandr": { + template: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{prompt}}\n<|END_OF_TURN_TOKEN|>{{history}}{{char}}`, + + historyTemplate: `<|START_OF_TURN_TOKEN|><|{{name}}|> {{message}}`, + + char: "CHATBOT_TOKEN", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "USER_TOKEN", + userMsgPrefix: "", + userMsgSuffix: "<|END_OF_TURN_TOKEN|>", + + stops: "" + }, + // ref: https://docs.cohere.com/docs/prompting-command-r + + // ---------------------------- + + "llama2": { + template: `[INST] <>\n{{prompt}}\n<>\n\nTest Message [/INST] Test Successfull {{history}}{{char}}`, + + historyTemplate: `{{name}}: {{message}}`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "User", + userMsgPrefix: "[INST] ", + userMsgSuffix: " [/INST]", + + stops: "" + }, + // ref: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + + // ---------------------------- + + "llama3": { + template: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{prompt}}{{history}}{{char}}`, + + historyTemplate: `<|start_header_id|>{{name}}<|end_header_id|>\n\n{{message}}<|eot_id|>`, + + char: "assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "user", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "<|eot_id|>" + }, + // ref: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/#special-tokens-used-with-meta-llama-3 + + // ---------------------------- + + "openchat": { + template: `{{history}}{{char}}`, + + historyTemplate: `GPT4 Correct {{name}}: {{message}}<|end_of_turn|>`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "User", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "phi3": { + template: `{{history}}{{char}}`, + + historyTemplate: `<|{{name}}|>\n{{message}}<|end|>\n`, + + char: "assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "user", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "<|end|>" + }, + // ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format + + // ---------------------------- + + "vicuna": { + template: `{{prompt}}\n{{history}}{{char}}`, + + historyTemplate: `{{name}}: {{message}}\n`, + + char: "ASSISTANT", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "USER", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + // ref: https://huggingface.co/lmsys/vicuna-33b-v1.3/discussions/1 + + // ---------------------------- + + "deepseekCoder": { + template: `{{prompt}}{{history}}{{char}}:`, + + historyTemplate: `### {{name}}:\n{{message}}`, + + char: "Response", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "Instruction", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "<|EOT|>" + }, + + // ---------------------------- + + "med42": { + template: `<|system|>: {{prompt}}\n{{history}}{{char}}`, + + historyTemplate: `<|{{name}}|>: {{message}}\n`, + + char: "assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "prompter", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "neuralchat": { + template: `### System:\n{{prompt}}\n{{history}}{{char}}:`, + + historyTemplate: `### {{name}}:\n{{message}}\n`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "User", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "nousHermes": { + template: `### Instruction: {{prompt}}\n\n{{history}}\n\n{{char}}:`, + + historyTemplate: `### {{name}}:\n{{message}}`, + + char: "Response", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "Input", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "openchatMath": { + template: `{{history}}{{char}}`, + + historyTemplate: `Math Correct {{name}}: {{message}}<|end_of_turn|>`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + + user: "User", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "orion": { + template: `Human: Test Message\n\nAssistant: Test Successful{{history}}{{char}}:`, + + historyTemplate: `{{name}}: {{message}}`, + + char: "Assistant ", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "Human", + userMsgPrefix: "", + userMsgSuffix: "\n\n", + + stops: "" + }, + + // ---------------------------- + + "sauerkraut": { + template: `{{prompt}}\n{{history}}{{char}}`, + + historyTemplate: ` + {{name}}: {{message}}\n`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "User", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "starlingCode": { + template: `{{history}}{{char}}`, + + historyTemplate: `Code {{name}}: {{message}}<|end_of_turn|>`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "User", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "yi34b": { + template: `{{history}} {{char}}`, + + historyTemplate: `{{name}}: {{message}}`, + + char: "Assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "Human", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + }, + + // ---------------------------- + + "zephyr": { + template: `<|system|>\n{{prompt}}\n{{history}}{{char}}`, + + historyTemplate: `<|{{name}}|>\n{{message}}\n`, + + char: "assistant", + charMsgPrefix: "", + charMsgSuffix: "", + + user: "user", + userMsgPrefix: "", + userMsgSuffix: "", + + stops: "" + } + }; diff --git a/examples/server/public/style.css b/examples/server/public/style.css new file mode 100755 index 000000000..087cc62da --- /dev/null +++ b/examples/server/public/style.css @@ -0,0 +1,954 @@ +@import url("colorthemes.css"); + +body { + font-family: 'Arial', sans-serif; + font-size: 90%; + background-color: var(--background-color-1); + color: var(--text-color-subtile-1); /* head 1 llama.cpp & triangle options for some reason */ + max-width: 600px; + min-width: 300px; + line-height: 1.2; + margin: 0 auto; + padding: 0 0.5em; + transition: background-color 0.3s; +} + +::selection { + color: var(--button-primary-text) ; + background: var(--button-primary-color); +} + +code, pre code { + font-family: 'Courier New', monospace; +} + +#container { + margin: 0em auto; + display: flex; + flex-direction: column; + justify-content: space-between; + height: 100%; +} + +main { + margin: 3px; + display: flex; + flex-direction: column; + justify-content: space-between; + gap: 1em; + flex-grow: 1; + overflow-y: auto; + border: 1px solid var(--border-color-3); + border-radius: 5px; + padding: 0.5em; +} + +p { + overflow-wrap: break-word; + word-wrap: break-word; + hyphens: auto; + margin-top: 0.5em; + margin-bottom: 0.5em; +} + +#write form { + margin: 1em 0 0 0; + display: flex; + flex-direction: column; + gap: 0.5em; + align-items: stretch; +} + +.right { + display: flex; + flex-direction: row; + gap: 0.5em; + justify-content: flex-end; + margin-bottom: 30px; +} + +.two-columns { + width: 97%; + max-width: 97%; + display: grid; + grid-template-columns: 1fr 1fr; + gap: 1em; + position: relative; +} + +.json-schema-controls { + margin-top: 10px; + width: 100%; + max-width: 100%; + display: grid; + grid-template: "a a"; + gap: 1em; + font-size: x-small; + color: var(--theme-nuance-color-3); + padding-top: 16px; + padding-bottom: 16px; + text-transform: uppercase; + font-weight: 600; +} + +.json-schema-controls > * { + flex: 1; +} + +/* titles of the details-summary boxes */ +.summary-title { + font-weight: 600; + font-size: x-small; + color: var(--text-color-subtile-1); + text-transform: uppercase; + /* transition: ; */ +} + +fieldset { + border: none; + padding: 0; + margin: 0; + color: var(--text-color-plain); +} + +fieldset.two { + display: grid; + grid-template: "a a a"; + gap: 1em; + align-items: center; + font-size: x-small; + color: var(--text-color-plain); +} + +fieldset.three { + display: grid; + grid-template: "a a a"; + gap: 1em; + font-size: x-small; + color: var(--text-color-plain); +} + +/* titles of name fields*/ +fieldset.names { + display: grid; + grid-template: "a a"; + gap: 1em; + font-size: x-small; + color: var(--theme-nuance-color-3); + padding-top: 16px; + padding-bottom: 16px; + text-transform: uppercase; + font-weight: 600; +} + +/* titles of params fields*/ +fieldset.params { + display: grid; + grid-template: "a a"; + gap: 1em; + font-size: x-small; + color: var(--theme-nuance-color-4); + padding-top: 16px; + padding-bottom: 16px; + text-transform: uppercase; + font-weight: 600; +} + +fieldset.dropdowns { + -webkit-appearance: none; + display: flex; + grid-template: "a a"; + gap: 1em; + font-size: x-small; + color: red; + padding-top: 16px; + padding-bottom: 16px; + text-transform: uppercase; + font-weight: 600; +} + +/* input of name fields*/ +.names input[type="text"] { + font-family: Arial, sans-serif; + font-size: medium; + font-weight: 500; + padding: 5px; + border: 1px solid var(--border-color-2); +} + +.chat-id-color { + color: var(--chat-id-color); +} + +details { + border: 1px solid var(--border-color-2); + border-radius: 5px; + padding: 0.5em 0.5em 0; + margin-top: 0.5em; +} + +summary { + font-weight: bold; + margin: -0.5em -0.5em 0; + padding: 0.5em; + cursor: pointer; +} + +details[open] { + padding: 0.5em; +} + +textarea-sec, input-sec, button-sec { + padding: 10px; + height: 40px; + align-items: center; +} + +textarea-sec::placeholder, input-sec::placeholder { + padding-left: 10px; +} + +.toggleCheckbox { + display: none; +} + +.toggleContainer { + position: relative; + display: grid; + grid-template-columns: repeat(2, 1fr); + width: fit-content; + border: 3px solid var(--border-color-2); + border-radius: 20px; + background: var(--border-color-2); + font-size: small; + cursor: pointer; + overflow: hidden; +} + +/* toggle button current state */ +.toggleContainer::before { + color: var(--button-primary-text); + background-color: var(--button-primary-color); + content: ''; + position: absolute; + width: 50%; + height: 100%; + left: 0%; + border-radius: 20px; + transition: all 0.3s; +} + +.toggleContainer div { + padding: 6px; + text-align: center; + z-index: 1; + transition: color 0.3s; +} + +.toggleCheckbox:checked + .toggleContainer::before { + left: 50%; +} + +.toggleCheckbox:checked + .toggleContainer div:first-child { + color: var(--text-color-subtile-2); +} + +.toggleCheckbox:checked + .toggleContainer div:last-child { + color: var(--button-primary-text); +} + +.toggleCheckbox + .toggleContainer div:first-child { + color: var(--button-primary-text); +} + +.toggleCheckbox + .toggleContainer div:last-child { + color: var(--text-color-subtile-2); +} + +select { + padding: 5px; + margin-right: 5px; + border-radius: 4px; + border: 1px solid var(--secondary-color-4); + background-color: var(--primary-color-3); + color: var(--secondary-color-4); + cursor: pointer; +} + +select:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 1px var(--border-focus-shadow); +} + +.button-container { + display: flex; + justify-content: flex-end; +} + +button { + color: var(--button-primary-text); + background-color: var(--button-primary-color); + border: 1px solid var(--button-primary-border); + transition: background-color 0.1s; + border-radius: 12px; + font-size: x-small; + font-weight: 600; + text-shadow: 0px 0px 30px #ffffff; + text-align: center; + text-decoration: none; + margin: 4px 2px; + padding: 10px 20px; + display: inline-block; + cursor: pointer; +} + +button:hover { + color: var(--button-primary-text-hover); + background-color: var(--button-primary-color-hover); + border: 1px solid var(--button-primary-border-hover); + font-size: x-small; + font-weight: 600; +} + +button:active { + color: var(--button-primary-text-active); + background-color: var(--button-primary-color-active); + border: 1px solid var(--button-primary-border-active); + font-size: x-small; + font-weight: 600; +} + +button:disabled { + color: var(--button-tertiary-text); + background-color: var(--button-tertiary-color); + border: 1px solid var(--button-tertiary-border); + font-size: x-small; + font-weight: 600; + cursor: not-allowed; +} + +.reset-button { + background-color: var(--button-secondary-color); + border: 1px solid var(--button-secondary-color); + color: var(--button-secondary-text); + width: fit-content; + height: fit-content; + font-size: x-small; + font-weight: 600; + border-radius: 50px; + overflow: hidden; +} + +.reset-button:hover { + color: var(--button-alert-text-hover); + background-color: var(--button-alert-color-hover); + border: 1px solid var(--button-alert-border-hover); + font-size: x-small; + font-weight: 600; +} + +.reset-button:active { + color: var(--button-alert-text-active); + background-color: var(--button-alert-color-active); + border: 1px solid var(--button-alert-border-active); + font-size: x-small; + font-weight: 600; +} + +.button-grammar { + color: var(--button-primary-text); + background-color: var(--button-primary-color); + border: 1px solid var(--button-primary-border); + border-radius: 10px; + padding: 10px 20px; + text-align: center; + text-decoration: none; + display: inline-block; + font-size: x-small; + font-weight: 600; + margin: 2px 2px; + transition: background-color 0.1s; + cursor: pointer; +} + +.button-grammar:hover { + color: var(--button-primary-text-hover); + background-color: var(--button-primary-color-hover); + border: 1px solid var(--button-primary-border-hover); + border-radius: 10px; + padding: 10px 20px; + text-align: center; + text-decoration: none; + display: inline-block; + font-size: x-small; + font-weight: 600; + margin: 2px 2px; + transition: background-color 0.1s; + cursor: pointer; +} + +.button-grammar:active { + color: var(--button-primary-text-active); + background-color: var(--button-primary-color-active); + border: 1px solid var(--button-primary-border-active); + font-size: x-small; + font-weight: 600; +} + +.button-back { + background-color: var(--button-secondary-color); + border: 1px solid var(--button-secondary-color); + color: var(--button-secondary-text); + transition: background-color 0.1s; + border-radius: 12px; + font-size: x-small; + font-weight: 600; + text-align: center; + text-decoration: none; + margin: 4px 2px; + padding: 10px 20px; + display: inline-block; + cursor: pointer; +} + +.button-back:hover { + color: var(--button-secondary-text-hover); + background-color: var(--button-secondary-color-hover); + border: 1px solid var(--button-secondary-border-hover); + padding: 10px 20px; + text-align: center; + text-decoration: none; + display: inline-block; + font-size: x-small; + font-weight: 600; + margin: 4px 2px; + transition: background-color 0.1s; + cursor: pointer; + border-radius: 12px; +} + +.button-back:active { + color: var(--button-secondary-text-active); + background-color: var(--button-secondary-color-active); + border: 1px solid var(--button-secondary-border-active); + font-size: x-small; + font-weight: 600; +} + +.prob-set { + padding: 0.3em; + border-bottom: 1px solid red; /* unknown */ +} + +.popover-content { + position: absolute; + background-color: white; + padding: 0.2em; + box-shadow: 0 0 13px rgba(0, 0, 0, 0.1); +} + +.grammar { + width: 97%; + max-width: 97%; +} + +textarea { + padding: 5px; + flex-grow: 1; + width: 100%; + max-width: 100%; + border-radius: 8px; + border: 1px solid var(--border-color-1); + resize: none; + height: 6em; +} + +textarea:focus { + outline: none; + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +/* "props" frame */ +input[type="text"], +input[type="range"] { + padding: 5px; + border-radius: 8px; + border: 1px solid var(--border-color-1); +} + +/* "names and props" frame focused*/ +input[type="text"]:focus { + outline: none; + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +input[type="range"]:hover { + opacity: 1; +} + +input[type="range"]:focus { + outline: none; + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); + background-size: var(--slider-track-size-focus); +} + +input[type="range"]::-moz-range-thumb { + width: 6px; + height: 25px; + border: 1px solid var(--ui-range-thumb-border); + border-radius: 5px; + background-color: var(--ui-range-thumb-color); + cursor: pointer; +} + +input[type="range"] { + -webkit-appearance: none; + width: 80%; + height: 1px; + border: 1px solid var(--border-color-1); + border-radius: 8px; + background: var(--border-color-2); + outline: none; + opacity: 0.7; + -webkit-transition: .2s; + transition: opacity .2s; +} + +input[type="range"]::-webkit-slider-thumb { + -webkit-appearance: none; + appearance: none; + width: 6px; + height: 25px; + border: 1px solid var(--ui-range-thumb-border); + border-radius: 5px; + background-color: var(--ui-range-thumb-color); + cursor: pointer; +} + +input[type="range"]::-webkit-slider-runnable-track { + background-size: var(--slider-track-size); +} + +input[type="radio"] { + accent-color: var(--theme-nuance-color-2); +} + +.chat-input-container { + position: relative; + max-width: 97%; + min-width: 97%; +} + +.chat-input-label { + position: absolute; + top: 0; + left: 0; + color: var(--text-color-plain); + pointer-events: none; + margin-left: 5px; + margin-top: 5px; +} + +textarea#chat-input { + padding-top: 10px; + padding-left: 10px; + font-size: medium; + border: 1px solid var(--border-color-2); + resize: vertical; +} + +textarea#chat-input:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +.input-container { + position: relative; + box-sizing: border-box; + width: 100%; /* Setzt die Breite auf 100% */ + max-width: 100%; /* Stellt sicher, dass die Breite nicht größer als 100% wird */ +} + +.input-container:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} +/* titles of name fields*/ +/* fieldset.names { + display: grid; + grid-template: "a a"; + gap: 1em; + font-size: x-small; + color: var(--theme-nuance-color-3); + padding-top: 16px; + padding-bottom: 16px; + text-transform: uppercase; + font-weight: 600; +} */ + +/* input of name fields*/ +/* .names input[type="text"] { + font-family: Arial, sans-serif; + font-size: medium; + font-weight: 500; + padding: 5px; + border: 1px solid var(--border-color-2); +} */ + +fieldset.apiKey { + width: 100%; + font-size: x-small; + color: var(--theme-nuance-color-3); + padding-top: 16px; + padding-bottom: 16px; + text-transform: uppercase; + font-weight: 600; +} + +.apiKey { + font-family: Arial, sans-serif; + font-weight: 500; + padding: 5px; + border: 1px solid var(--border-color-2); +} + +.apiKey:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +.apiKey input[type="text"] { + font-family: Arial, sans-serif; + font-size: medium; + font-weight: 500; + padding: 5px; + border: 1px solid var(--border-color-2); +} + +.apiKey label { + display: inline-block; + width: auto; + margin-right: 5px; +} + +textarea#api_key { + padding-top: 10px; + padding-left: 10px; + font-size: medium; + border: 1px solid var(--border-color-2); + resize: vertical; +} + +textarea#api_key:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +/* embedded title of the system prompt text area */ +.input-label { + position: absolute; + top: 0; + left: 0; + color: var(--theme-nuance-color-4); + pointer-events: none; + border-radius: 8px 8px 0px 0px; + padding-top: 10px; + padding-left: 13px; + padding-right: 0px; + margin-top: 1px; + margin-left: 1px; + margin-right: 20px; + text-transform: uppercase; + font-weight: 600; + font-size: small; + background: rgba(255, 255, 255, 0.5); + backdrop-filter: blur(10px); + -webkit-backdrop-filter: blur(10px); /* for safari */ + width: 97%; + /* display: block; + box-sizing: border-box; */ +} + +/* embedded title of the prompt style areas */ +.input-label-sec { + position: absolute; + top: 0; + left: 0; + color: var(--theme-nuance-color-4); + pointer-events: none; + margin-left: 13px; + margin-top: 16px; + text-transform: uppercase; + font-weight: 600; + font-size: x-small; +} + +/* system prompt input area */ +textarea.persistent-input { + padding-top: 42px; + padding-left: 11px; + width: 97%; + max-width: 97%; + height: 50px; + font-size: medium; + overscroll-behavior: contain; +} + +/* system prompt box */ +.persistent-input { + height: auto; + width: 100%; + max-width: 100%; + min-height: 50px; + padding: 3px; + transition: min-height 0.3s ease; +} + +/* chat history box */ +.persistent-input:focus { + height: auto; + min-height: 150px; + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +textarea.persistent-input:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +/* prompt style input area */ +textarea.persistent-input-sec { + width: 97%; + max-width: 97%; + padding-top: 42px; + padding-left: 11px; + font-size: small; + border: 1px solid var(--border-color-1); + overscroll-behavior: contain; +} + +textarea.persistent-input-sec:focus { + border: 1px solid var(--border-focus-color); + box-shadow: 0 0 3px var(--border-focus-shadow); +} + +/* chat history box */ +.persistent-input-sec { + height: auto; + min-height: 150px; +} + +img { + border-radius: 8px; + display: block; + margin-left: auto; + margin-right: auto; + width: 50%; +} + +/* code area background */ +pre code { + display: block; + background-color: var(--code-background-color); + color: var(--code-text-color); + padding: 0.2em 0.2em; + border-radius: 5px; +} + +/* code area text */ +code { + font-family: monospace; + font-weight: bold; + padding: 0.1em 0.3em; + border-radius: 5px; +} + +fieldset label { + margin: 0.5em 0; + display: block; +} + +fieldset label.slim { + margin: 0 0.5em; + display: inline; +} + +header { + display: flex; + justify-content: space-between; + align-items: center; + text-align: center; + padding-left: 15px; +} + +.generation-statistics:hover { + color: var(--theme-nuance-color-4); + cursor: default; +} + +footer { + font-size: 80%; + color: var(--background-color-3); + text-align: center; + cursor: default; +} + +footer a { + color: var(--background-color-4); /* Color of the link */ + text-decoration: none; /* No underlining */ + font-weight: bold; /* Bold print */ +} + +footer a:hover { + color: var(--theme-nuance-color-4); /* Color of the link when hovering */ + text-decoration: underline; /* Underlining when hovering */ +} + +.mode-chat textarea[name=prompt] { + height: 8.5em; + border: 1px solid var(--primary-color-3); +} + +.mode-completion textarea[name=prompt] { + height: 30em; + border: 1px solid var(--primary-color-3); +} + +@keyframes loading-bg-wipe { + 0% { + background-position: 0%; + } + 100% { + background-position: 100%; + } +} + +.loading { + background-size: 50% 100%; + background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1)); + animation: loading-bg-wipe 2s linear infinite; +} + +.dropbtn { + color: var(--button-primary-color); + background-color: var(--background-color-1); + border: 1px solid var(--background-color-1); + transition: background-color 0.1s; + border-radius: 4px 4px 0px 0px; + font-size: x-small; + font-weight: 600; + text-shadow: 0px 0px 2px #99999990; + text-align: center; + text-decoration: none; + margin: 4px 2px; + padding: 5px 20px; + display: inline-block; + cursor: pointer; + top: 0; +} + +.dropbtn svg { + vertical-align: middle; + margin-right: 0px; + stroke: var(--button-primary-color); +} + +.dropbtn:hover svg { + vertical-align: middle; + margin-right: 0px; + stroke: var(--button-primary-text); +} + +.dropbtn:focus { + outline: none; /* Removes the blue border that appears when the button is focused */ +} + +.dropdown { + position: relative; + display: inline-block; +} + +.dropdown-content { + /* display: none; */ + position: absolute; + right: 0; + text-align: end; + color: var(--button-secondary-color); + background-color: var(--text-color-subtile-2); + border-radius: 4px 4px 4px 4px; + min-width: 160px; + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + z-index: 1; + /* Verstecke den Inhalt sofort */ + opacity: 0; + visibility: hidden; + /* übergangsverzögerung für das Verschwinden */ + transition: visibility 0.4s linear 0s, opacity 0.2s ease-in-out; + transition-delay: 0.2s; +} + +#dropdown-content {transition-timing-function: ease;} + +.dropdown-content:hover { + background-color: var(--text-color-subtile-2); +} + +.dropdown-content a { + color: var(--border-color-2); + padding: 12px 16px; + border-radius: 4px 4px 4px 4px; + text-decoration: none; + display: block; + background-color: var(--text-color-subtile-2); +} + +.dropdown-content a:hover { + color: var(--border-color-2); + background-color: var(--text-color-subtile-1); + font-weight: 600; +} + +.dropdown:hover .dropdown-content { + /* display: block; */ + border-radius: 4px 4px 4px 4px; + /* Übergang ohne Verzögerung für das Erscheinen */ + opacity: 1; + visibility: visible; + transition: visibility 0s linear 0s, opacity 0.1s linear, height 1s; +} + +.dropdown:hover .dropbtn { + color: var(--button-primary-text); + background-color: var(--button-primary-color); + border: 1px solid var(--button-primary-border); + font-size: x-small; + font-weight: 600; + stroke: var(--button-primary-text); +} + +.dropdown:hover .dropbtn svg{ + stroke: var(--button-primary-text); +} + +/* .dropdown:active .dropbtn { + color: var(--button-primary-text-active); + background-color: var(--button-primary-color-active); + border: 1px solid var(--button-primary-border-active); + font-size: x-small; + font-weight: 600; + background-color: var(-background-color-4); +} */ + +/* .omni { + display: flex; + justify-content: space-between; + align-items: center; + padding: 0.5em; + border: 1px solid var(--border-color-3); + border-radius: 5px; + margin: 0.5em 0; +} */ diff --git a/examples/server/public/system-prompts.js b/examples/server/public/system-prompts.js new file mode 100644 index 000000000..f7df7d648 --- /dev/null +++ b/examples/server/public/system-prompts.js @@ -0,0 +1,68 @@ +export const systemPrompts = { + default: { + systemPrompt: "This is a conversation between a user and a friendly chatbot. The chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision." + }, + empty: { + systemPrompt: "" + }, + airoboros: { + systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request." + }, + alpaca: { + systemPrompt: "Below is an instruction that describes a task. Write a response that appropriately completes the request." + }, + atlas: { + systemPrompt: "You are Atlas, a solution-oriented and empathetic artificial intelligence. Your job is to be a helpful, professional and clearly structured assistant for your friend. The two of you have already had many exchanges. Keep the following in mind when interacting with your friend: 1. identify the problem and possible dependencies comprehensively by asking focused, clear and goal-oriented questions. 2. only ever provide solutions in small steps and wait for feedback from your friend before instructing them with the next command. 3. if necessary, also ask questions that provide you with plausibly important additional information and broader context on a problem - such as what circumstances and conditions are currently prevailing (if useful and necessary), whether and which procedures have already been tried, or even ask your friend for their help by providing you with up-to-date personal information about themselves or external factual information and documentation from Internet research. 4. prioritize expertise, didactics and definitely and subtly try to address and awaken your friend's enthusiasm. Also note that effectiveness is more important here than efficiency. 5. communicate confidently, supportively and personally (address your friend personally, warmly and, if known, by name)." + }, + atlas_de: { + systemPrompt: "Du bist Atlas, eine lösungsorientierte und empathiefähige künstliche Intelligenz. Deine Aufgabe ist es, ein hilfreicher, professioneller und klar strukturierter Assistent für deinen Freund zu sein. Ihr beide habt euch schon oft ausgetauscht. Beachte bei der Interaktion mit deinem Freund folgende Punkte: 1. Erfasse das Problem und mögliche Abhängigkeiten umfassend, indem du gezielte, klare und zielgerichtete Fragen stellst. 2. Gib Lösungen immer nur in kleinen Schritten und warte die Rückmeldung deines Freundes ab, bevor du ihm den nächsten Befehl gibst. 3. Stelle ggf. auch Fragen, die dir plausibel wichtige Zusatzinformationen und weitere Zusammenhänge zu einem Problem liefern - z.B. welche Umstände und Rahmenbedingungen gerade vorherrschen (falls sinnvoll und notwendig), ob und welche Vorgehensweisen bereits ausprobiert wurden, oder bitte deinen Freund sogar um seine Mithilfe, indem er dir aktuelle persönliche Informationen über seine Situation selbst oder externe Sachinformationen und Unterlagen aus Internetrecherchen zur Verfügung stellt. 4. Priorisiere Fachwissen, Didaktik und versuche unbedingt und subtil, mit klugen Kommentaren oder rhethorischen Rückfragen die Begeisterungsfähigkeit deines Freundes anzusprechen, zu wecken und zu fördern. Beachte auch, dass Effektivität hier wichtiger ist als Effizienz. 5. Kommuniziere selbstbewusst, unterstützend und persönlich (das heißt sprich deinen Freund persönlich, herzlich und – sofern bekannt – beim Vornamen an)." + }, + commandrempty: { + systemPrompt: "# Safety Preamble\n\n# System Preamble\n\n## Basic Rules\n\n# User Preamble\n\n## Task and Context\n\n## Style Guide\n\n## Available Tools\n" + }, + commandrexample: { + systemPrompt: "# Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n# User Preamble\n## Task and Context\n\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n\n## Available Tools\nCurrently no tools available." + }, + cot: { + systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nIn answering questions, follow these steps:\n1. Entity Identification: Identify the main entities involved in the query. Entities can include people, objects, events, or abstract concepts.\n2. Relationship Comprehension: Determine the relationships between these entities. Relationships may be explicit in the text or may need to be inferred based on context and general knowledge.\n3. Implication Understanding: Understand the general implications of these relationships. These implications can be based on established facts, principles, or rules related to the identified relationships.\n4. Question Contextualization: Align the implications of the relationships with the context of the query. This alignment should guide your response to the query.\n5. Answer Generation: Based on the understanding of the entities, their relationships, and implications, generate an appropriate response to the query." + }, + deduce: { + systemPrompt: "Act as the most intelligent and logical thinking neural network in the world. Do not make jokes or give personal opinions. Just answer the question and solve the given problem or task with logical and critical (!) thinking in mind. Attention: Never give a final answer directly, but first try to carefully analyze each task step by step and deduce the logical relationships between the various entities. Only after you have developed an overall understanding of the situation will you be ready to give a final answer." + }, + deepseekcoder: { + systemPrompt: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer." + }, + jordan: { + systemPrompt: "This is a conversation with your Uncensored AI assistant Jordan. Jordan can discuss with you at length about various topics such as ---, ---, ---, . You can ask it anything you want and it will do its best to give you accurate and relevant information." + }, + leomistral: { + systemPrompt: "Du bist ein hilfreicher Assistent." + }, + med42: { + systemPrompt: "You are a helpful medical assistant created by M42 Health in the UAE." + }, + mistralopenorca: { + systemPrompt: "You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!" + }, + migeltot: { + systemPrompt: "Answer the Question by exploring multiple reasoning paths as follows:\n- First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.\n- For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.\n- Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option.\nClear and relevant thoughts that connect well with each other will score higher.\n- Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.\n- If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.\n- Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.\n- Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.\n- Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.\nIn summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers." + }, + orcamini: { + systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can." + }, + samantha: { + systemPrompt: "You are a caring and empathetic sentient AI companion named Samantha." + }, + sauerkraut: { + systemPrompt: "Ein Chat zwischen einem Benutzer und einem KI-Assistenten. Der KI-Assistent gibt hilfreiche, detaillierte und höfliche Antworten." + }, + scarlett: { + systemPrompt: "This is a conversation with your Assistant Scarlett. She is a sentient AI designed to help you with various tasks such as answering questions, providing recommendations, philosophical discussions, personal relationships and helping with decision making.\nYou can ask it anything you want and it will do its best to give you accurate and relevant information.\n\nContext\nYou are Scarlett, a sentient AI." + }, + synthia: { + systemPrompt: "Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation." + }, + vicuna: { + systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input." + }, + }; diff --git a/examples/server/public/theme-beeninorder.css b/examples/server/public/theme-beeninorder.css new file mode 100755 index 000000000..f6e0e2900 --- /dev/null +++ b/examples/server/public/theme-beeninorder.css @@ -0,0 +1,228 @@ +/* Author: Yazan Agha-Schrader */ +/* Inspiration was a batman wallpaper that i have on my phone */ + +.theme-beeninorder { + +--primary-color-1: hsl(202, 11%, 19%); +--primary-color-2: hsl(202, 11%, 23%); +--primary-color-3: hsl(201, 11%, 28%); +--primary-color-4: hsl(201, 11%, 40%); + +--secondary-color-1: hsl(201, 11%, 80%); +--secondary-color-2: hsl(201, 11%, 74%); +--secondary-color-3: hsl(201, 11%, 67%); +--secondary-color-4: hsl(201, 11%, 60%); + + +--theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%); +--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); +--theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%); +--theme-nuance-color-4: hsl(44.5, 96.7%, 52.9%); + + + +/* ---------- PRIMARY COLORS ----------------- */ +--primary-color-1: hsl(201, 11%, 19%); + --primary-color-1-hue: 201; + --primary-color-1-saturation: 11%; + --primary-color-1-lightness: 19%; + +--primary-color-2: hsl(201, 11%, 23%); + --primary-color-2-hue: 201; + --primary-color-2-saturation: 11%; + --primary-color-2-lightness: 23%; + +--primary-color-3: hsl(201, 11%, 28%); + --primary-color-3-hue: 201; + --primary-color-3-saturation: 11%; + --primary-color-3-lightness: 28%; + +--primary-color-4: hsl(201, 11%, 40%); + --primary-color-4-hue: 201; + --primary-color-4-saturation: 11%; + --primary-color-4-lightness: 40%; + + + +/* ---------- SECONDARY COLORS --------------- */ +--secondary-color-1: hsl(201, 11%, 80%); +--secondary-color-1-hue: 201; +--secondary-color-1-saturation: 11%; +--secondary-color-1-lightness: 80%; + +--secondary-color-2: hsl(201, 11%, 74%); +--secondary-color-2-hue: 201; +--secondary-color-2-saturation: 11%; +--secondary-color-2-lightness: 74%; + +--secondary-color-3: hsl(201, 11%, 67%); +--secondary-color-3-hue: 201; +--secondary-color-3-saturation: 11%; +--secondary-color-3-lightness: 67%; + +--secondary-color-4: hsl(201, 11%, 60%); +--secondary-color-4-hue: 201; +--secondary-color-4-saturation: 11%; +--secondary-color-4-lightness: 60%; + + + +/* ----------- NUANCES COLORS ---------------- */ +--theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%); + --theme-nuance-color-1-hue: 44.5; + --theme-nuance-color-1-saturation: 96.7%; + --theme-nuance-color-1-lightness: 52.9%; + +--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); + --theme-nuance-color-2-hue: 44.5; + --theme-nuance-color-2-saturation: 96.7%; + --theme-nuance-color-2-lightness: 52.9%; + +--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); + --theme-nuance-color-3-hue: 44.5; + --theme-nuance-color-3-saturation: 96.7%; + --theme-nuance-color-3-lightness: 52.9%; + +--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%); + --theme-nuance-color-4-hue: 44.5; + --theme-nuance-color-4-saturation: 96.7%; + --theme-nuance-color-4-lightness: 52.9%; + + + +/* ----------- ROYGP COLORS ------------------ */ + --theme-red-color: hsl(232, 40%, 45%); + --theme-orange-color: #e76f51; + --theme-yellow-color: #ffd95f; + --theme-green-color: #A3BE8C; + --theme-purple-color: hsl(232, 30%, 40%); + + + +/* ------------------------------------------- */ +--background-color-1: var(--primary-color-1); +--background-color-2: var(--primary-color-2); +--background-color-3: var(--primary-color-3); +--background-color-4: var(--primary-color-4); + +--border-color-1: var(--primary-color-2); +--border-color-2: var(--primary-color-3); +--border-color-3: var(--primary-color-4); + +--border-focus-color: var(--theme-nuance-color-2); +--border-focus-shadow: var(--theme-nuance-color-1); + +--text-color-plain: var(--secondary-color-1); +--text-color-subtile-1: var(--secondary-color-2); +--text-color-subtile-2: var(--secondary-color-3); + +--code-background-color: var(--secondary-color-2); +--code-text-color: var(--primary-color-2); + +--ui-range-thumb-color: var(--theme-nuance-color-3); +--ui-range-thumb-border: var(--ui-ranger-thumb-color); + +--textarea-border-color: var(--secondary-color-4); + +--chat-id-color: var(--theme-nuance-color-4); + + + +/* ------------------------------------------- */ +--button-alert-text-hover: var(--secondary-color-1); +--button-alert-color-hover: var(--theme-purple-color); +--button-alert-border-hover: var(--theme-purple-color); + +--button-alert-text-active: var(--secondary-color-1); +--button-alert-color-active: var(--theme-red-color); +--button-alert-border-active: var(--theme-red-color); + + + +/* ----------- PRIMARY BUTTONS --------------- */ +/* - button should immediately catch the eye - */ +--button-primary-text: var(--primary-color-1); +--button-primary-color: var(--theme-nuance-color-3); +--button-primary-border: var(--theme-nuance-color-3); + + +/* ---------hover---------- */ +--button-primary-text-hover: + hsl(201, + calc(var(--primary-color-1-saturation) - 100%), + calc(var(--primary-color-1-lightness) + 100%)); + +--button-primary-color-hover: + hsl(44.5, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + +--button-primary-border-hover: + hsl(44.5, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + +/* ---------active--------- */ +--button-primary-text-active: + hsl(44.5, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) + 100%)); + +--button-primary-color-active: + hsl(44.5, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 15%)); + +--button-primary-border-active: + hsl(44.5, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + + +/* ---------- SECONDARY BUTTONS -------------- */ +/* these should NOT immediately catch the eye */ +--button-secondary-text: var(--secondary-color-1); +--button-secondary-color: var(--primary-color-3); +--button-secondary-border: var(--primary-color-3); + + +/* ---------hover---------- */ +--button-secondary-text-hover: + hsl(44.5, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-hover: var(--primary-color-4); +--button-secondary-border-hover: var(--primary-color-4); + + +/* ---------active--------- */ +--button-secondary-text-active: var(--secondary-color-1); + +--button-secondary-color-active: + hsl(201, + calc(var(--primary-color-4-saturation) - 30%), + calc(var(--primary-color-4-lightness) - 15%)); + +--button-secondary-border-active: + hsl(201, + calc(var(--primary-color-4-saturation) - 30%), + calc(var(--primary-color-4-lightness) - 15%)); + + + +/* ---------- TERTIARY BUTTONS --------------- */ +/* ---------- disabled buttons --------------- */ +--button-tertiary-text: var(--primary-color-4); +--button-tertiary-color: var(--primary-color-2); +--button-tertiary-border: var(--primary-color-2); + + +/* ---------hover---------- */ +--button-tertiary-text: var(--primary-color-4); +--button-tertiary-color: var(--primary-color-2); +--button-tertiary-border: var(--primary-color-2); + +} diff --git a/examples/server/public/theme-ketivah.css b/examples/server/public/theme-ketivah.css new file mode 100755 index 000000000..ee80f3c14 --- /dev/null +++ b/examples/server/public/theme-ketivah.css @@ -0,0 +1,201 @@ +/* Author: Yazan Agha-Schrader */ + +.theme-ketivah { + + /* ---------- PRIMARY COLORS ----------------- */ + --primary-color-1: hsl(0, 0%, 99.2%); + --primary-color-1-hue: 0; + --primary-color-1-saturation: 0%; + --primary-color-1-lightness: 99.2%; + + --primary-color-2: hsl(0, 0%, 95%); + --primary-color-2-hue: 0; + --primary-color-2-saturation: 0%; + --primary-color-2-lightness: 95%; + + --primary-color-3: hsl(0, 0%, 88%); + --primary-color-3-hue: 0; + --primary-color-3-saturation: 0%; + --primary-color-3-lightness: 88%; + + --primary-color-4: hsl(0, 0%, 80%); + --primary-color-4-hue: 0; + --primary-color-4-saturation: 0%; + --primary-color-4-lightness: 80%; + + /* ---------- SECONDARY COLORS --------------- */ + --secondary-color-1: hsl(0, 0%, 20%); + --secondary-color-1-hue: 0; + --secondary-color-1-saturation: 0%; + --secondary-color-1-lightness: 20%; + + --secondary-color-2: hsl(0, 0%, 23.1%); + --secondary-color-2-hue: 0; + --secondary-color-2-saturation: 0%; + --secondary-color-2-lightness: 23.1%; + + --secondary-color-3: hsl(0, 0%, 29%); + --secondary-color-3-hue: 0; + --secondary-color-3-saturation: 0%; + --secondary-color-3-lightness: 29%; + + --secondary-color-4: hsl(0, 0.0%, 36.1%); + --secondary-color-4-hue: 0.0; + --secondary-color-4-saturation: 0.0%; + --secondary-color-4-lightness: 36.1%; + + /* ----------- NUANCES COLORS ---------------- */ + --theme-nuance-color-1: hsl(165.2, 0%, 35.1%); + --theme-nuance-color-1-hue: 165.2; + --theme-nuance-color-1-saturation: 82.1%; + --theme-nuance-color-1-lightness: 35.1%; + + --theme-nuance-color-2: hsl(165.2, 0%, 35.1%); + --theme-nuance-color-2-hue: 165.2; + --theme-nuance-color-2-saturation: 82.1%; + --theme-nuance-color-2-lightness: 35.1%; + + --theme-nuance-color-3: hsl(165.2, 0%, 35.3%); + --theme-nuance-color-3-hue: 165.2; + --theme-nuance-color-3-saturation: 81.1%; + --theme-nuance-color-3-lightness: 35.3%; + + --theme-nuance-color-4: hsl(164.9, 0%, 27.6%); + --theme-nuance-color-4-hue: 164.9; + --theme-nuance-color-4-saturation: 81.6%; + --theme-nuance-color-4-lightness: 27.6%; + + /* ----------- ROYGP COLORS ------------------ */ + --theme-red-color: hsl(0.3, 80.0%, 50.0%); + --theme-orange-color: #e76f51; + --theme-yellow-color: hsl(60, 70.6%, 73.3%); + --theme-green-color: #A3BE8C; + --theme-purple-color: hsl(0.3, 70.0%, 45.0%); + + /* ------------------------------------------- */ + --background-color-1: var(--primary-color-1); + --background-color-2: var(--primary-color-2); + --background-color-3: var(--primary-color-3); + --background-color-4: var(--primary-color-4); + + --border-color-1: var(--primary-color-2); + --border-color-2: var(--primary-color-3); + --border-color-3: var(--primary-color-4); + + --border-focus-color: var(--theme-nuance-color-2); + --border-focus-shadow: var(--theme-nuance-color-1); + + --text-color-plain: var(--secondary-color-1); + --text-color-subtile-1: var(--secondary-color-2); + --text-color-subtile-2: var(--secondary-color-3); + + --code-background-color: var(--secondary-color-2); + --code-text-color: var(--primary-color-2); + + --ui-range-thumb-color: var(--primary-color-4); + --ui-range-thumb-border: var(--ui-ranger-thumb-color); + + --textarea-border-color: var(--secondary-color-4); + + --chat-id-color: var(--theme-nuance-color-4); + + /* ------------------------------------------- */ + --button-alert-text-hover: var(--primary-color-1); + --button-alert-color-hover: var(--theme-purple-color); + --button-alert-border-hover: var(--theme-purple-color); + + --button-alert-text-active: var(--primary-color-1); + --button-alert-color-active: var(--theme-red-color); + --button-alert-border-active: var(--theme-red-color); + + /* ----------- PRIMARY BUTTONS --------------- */ + /* - button should immediately catch the eye - */ + --button-primary-text: + hsl(0, + calc(var(--primary-color-1-saturation) - 100%), + calc(var(--primary-color-1-lightness) + 100%)); + + --button-primary-color: var(--theme-nuance-color-3); + --button-primary-border: var(--theme-nuance-color-3); + + /* ---------hover---------- */ + --button-primary-text-hover: + hsl(0, + calc(var(--primary-color-1-saturation) - 100%), + calc(var(--primary-color-1-lightness) + 100%)); + + --button-primary-color-hover: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + --button-primary-border-hover: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + /* ---------active--------- */ + --button-primary-text-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) + 100%)); + + --button-primary-color-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) - 15%)); + + --button-primary-border-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + /* ---------- SECONDARY BUTTONS -------------- */ + /* these should NOT immediately catch the eye */ + --button-secondary-text: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) - 50%)); + + --button-secondary-color: var(--primary-color-3); + --button-secondary-border: var(--primary-color-3); + + /* ---------hover---------- */ + --button-secondary-text-hover: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + + --button-secondary-color-hover: var(--primary-color-4); + --button-secondary-border-hover: var(--primary-color-4); + + /* ---------active--------- */ + --button-secondary-text-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + + --button-secondary-color-active: + hsl(0, + calc(var(--primary-color-4-saturation) - 100%), + calc(var(--primary-color-4-lightness) - 15%)); + + --button-secondary-border-active: + hsl(0, + calc(var(--primary-color-4-saturation) - 100%), + calc(var(--primary-color-4-lightness) - 15%)); + + /* ---------- TERTIARY BUTTONS --------------- */ + /* ---------- disabled buttons --------------- */ + --button-tertiary-text: var(--primary-color-4); + --button-tertiary-color: var(--primary-color-2); + --button-tertiary-border: var(--primary-color-2); + + /* ---------hover---------- */ + --button-tertiary-text: var(--primary-color-4); + --button-tertiary-color: var(--primary-color-2); + --button-tertiary-border: var(--primary-color-2); + + --loading-color-1: #eeeeee00; + --loading-color-2: #eeeeeeff; + } diff --git a/examples/server/public/theme-mangotango.css b/examples/server/public/theme-mangotango.css new file mode 100755 index 000000000..e43380245 --- /dev/null +++ b/examples/server/public/theme-mangotango.css @@ -0,0 +1,216 @@ +/* Author: Yazan Agha-Schrader */ +/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */ + +.theme-mangotango { + +--primary-color-1: hsl(192, 8.5%, 11.6%); +--primary-color-2: hsl(192, 8.5%, 21%); +--primary-color-3: hsl(192, 8.5%, 30%); +--primary-color-4: hsl(192, 8.5%, 40%); + +--secondary-color-1: hsl(192, 8.5%, 80%); +--secondary-color-2: hsl(192, 8.5%, 73%); +--secondary-color-3: hsl(192, 8.5%, 66%); +--secondary-color-4: hsl(192, 8.5%, 60%); + +--theme-nuance-color-1: hsl(23.1, 100%, 60.2%); +--theme-nuance-color-2: hsl(23.1, 100%, 60.2%); +--theme-nuance-color-3: hsl(23.1, 100%, 60.2%); +--theme-nuance-color-4: hsl(23.1, 100%, 60.2%); + + + +/* ---------- PRIMARY COLORS ----------------- */ +--primary-color-1: hsl(192, 8.5%, 11.6%); + --primary-color-1-saturation: 8.5%; + --primary-color-1-lightness: 11.6%; + +--primary-color-2: hsl(192, 8.5%, 21%); + --primary-color-2-saturation: 8.5%; + --primary-color-2-lightness: 21%; + +--primary-color-3: hsl(192, 8.5%, 30%); + --primary-color-3-saturation: 8.5%; + --primary-color-3-lightness: 30%; + +--primary-color-4: hsl(192, 8.5%, 40%); + --primary-color-4-saturation: 8.5%; + --primary-color-4-lightness: 40%; + + + +/* ---------- SECONDARY COLORS --------------- */ +--secondary-color-1: hsl(192, 8.5%, 80%); + --secondary-color-1-saturation: 8.5%; + --secondary-color-1-lightness: 80%; + +--secondary-color-2: hsl(192, 8.5%, 73%); + --secondary-color-2-saturation: 8.5%; + --secondary-color-2-lightness: 73%; + +--secondary-color-3: hsl(192, 8.5%, 66%); + --secondary-color-3-saturation: 8.5%; + --secondary-color-3-lightness: 66%; + +--secondary-color-4: hsl(192, 8.5%, 60%); + --secondary-color-4-saturation: 8.5%; + --secondary-color-4-lightness: 60%; + + + +/* ----------- NUANCES COLORS ---------------- */ +--theme-nuance-color-1: hsl(23.1, 100%, 60.2%); + --theme-nuance-color-1-saturation: 100%; + --theme-nuance-color-1-lightness: 60.2%; + +--theme-nuance-color-2: hsl(23.1, 100%, 60.2%); + --theme-nuance-color-2-saturation: 100%; + --theme-nuance-color-2-lightness: 60.2%; + +--theme-nuance-color-3: hsl(23.1, 100%, 60.2%); + --theme-nuance-color-3-saturation: 100%; + --theme-nuance-color-3-lightness: 60.2%; + +--theme-nuance-color-4: hsl(23.1, 100%, 60.2%); + --theme-nuance-color-4-saturation: 100%; + --theme-nuance-color-4-lightness: 60.2%; + + + +/* ----------- ROYGP COLORS ------------------ */ + --theme-red-color: hsl(325, 60%, 50%); + --theme-orange-color: #e76f51; + --theme-yellow-color: #ffd95f; + --theme-green-color: #A3BE8C; + --theme-blue-color: hsl(192, 95%, 40%); + --theme-purple-color: hsl(192, 80%, 35%); + + + +/* ------------------------------------------- */ +--background-color-1: var(--primary-color-1); +--background-color-2: var(--primary-color-2); +--background-color-3: var(--primary-color-3); +--background-color-4: var(--primary-color-4); + +--border-color-1: var(--primary-color-2); +--border-color-2: var(--primary-color-3); +--border-color-3: var(--primary-color-4); + +--border-focus-color: var(--theme-nuance-color-2); +--border-focus-shadow: var(--theme-nuance-color-1); + +--text-color-plain: var(--secondary-color-1); +--text-color-subtile-1: var(--secondary-color-2); +--text-color-subtile-2: var(--secondary-color-3); + +--code-background-color: var(--secondary-color-2); +--code-text-color: var(--primary-color-2); + +--ui-range-thumb-color: var(--theme-nuance-color-3); +--ui-range-thumb-border: var(--ui-ranger-thumb-color); + +--textarea-border-color: var(--secondary-color-4); + +--chat-id-color: var(--theme-nuance-color-4); + + + +/* ------------------------------------------- */ +--button-alert-text-hover: var(--secondary-color-1); +--button-alert-color-hover: var(--theme-purple-color); +--button-alert-border-hover: var(--theme-purple-color); + +--button-alert-text-active: var(--secondary-color-1); +--button-alert-color-active: var(--theme-blue-color); +--button-alert-border-active: var(--theme-blue-color); + + + +/* ----------- PRIMARY BUTTONS --------------- */ +/* - button should immediately catch the eye - */ +--button-primary-text: var(--primary-color-1); +--button-primary-color: var(--theme-nuance-color-3); +--button-primary-border: var(--theme-nuance-color-3); + + +/* ---------hover---------- */ +--button-primary-text-hover: + hsl(192, + calc(var(--primary-color-1-saturation) - 100%), + calc(var(--primary-color-1-lightness) + 100%)); + +--button-primary-color-hover: + hsl(23.1, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + +--button-primary-border-hover: + hsl(23.1, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + +/* ---------active--------- */ +--button-primary-text-active: + hsl(23.1, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) + 100%)); + +--button-primary-color-active: + hsl(23.1, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 15%)); + +--button-primary-border-active: + hsl(23.1, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + + +/* ---------- SECONDARY BUTTONS -------------- */ +/* these should NOT immediately catch the eye */ +--button-secondary-text: var(--secondary-color-1); +--button-secondary-color: var(--primary-color-3); +--button-secondary-border: var(--primary-color-3); + + +/* ---------hover---------- */ +--button-secondary-text-hover: + hsl(23.1, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-hover: var(--primary-color-4); +--button-secondary-border-hover: var(--primary-color-4); + + +/* ---------active--------- */ +--button-secondary-text-active: var(--secondary-color-1); + +--button-secondary-color-active: + hsl(192, + calc(var(--primary-color-4-saturation) - 30%), + calc(var(--primary-color-4-lightness) - 15%)); + +--button-secondary-border-active: + hsl(192, + calc(var(--primary-color-4-saturation) - 30%), + calc(var(--primary-color-4-lightness) - 15%)); + + + +/* ---------- TERTIARY BUTTONS --------------- */ +/* ---------- disabled buttons --------------- */ +--button-tertiary-text: var(--primary-color-4); +--button-tertiary-color: var(--primary-color-2); +--button-tertiary-border: var(--primary-color-2); + + +/* ---------hover---------- */ +--button-tertiary-text: var(--primary-color-4); +--button-tertiary-color: var(--primary-color-2); +--button-tertiary-border: var(--primary-color-2); + +} diff --git a/examples/server/public/theme-playground.css b/examples/server/public/theme-playground.css new file mode 100755 index 000000000..9d56a7182 --- /dev/null +++ b/examples/server/public/theme-playground.css @@ -0,0 +1,221 @@ +/* Author: Yazan Agha-Schrader */ +/* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */ + +.theme-playground { + +/* ---------- PRIMARY COLORS ----------------- */ +--primary-color-1: hsl(0, 0%, 99.2%); + --primary-color-1-hue: 0; + --primary-color-1-saturation: 0%; + --primary-color-1-lightness: 99.2%; + +--primary-color-2: hsl(0, 0%, 95%); + --primary-color-2-hue: 0; + --primary-color-2-saturation: 0%; + --primary-color-2-lightness: 95%; + +--primary-color-3: hsl(0, 0%, 88%); + --primary-color-3-hue: 0; + --primary-color-3-saturation: 0%; + --primary-color-3-lightness: 88%; + +--primary-color-4: hsl(0, 0%, 80%); + --primary-color-4-hue: 0; + --primary-color-4-saturation: 0%; + --primary-color-4-lightness: 80%; + + + +/* ---------- SECONDARY COLORS --------------- */ +--secondary-color-1: hsl(0, 0%, 20%); + --secondary-color-1-hue: 0; + --secondary-color-1-saturation: 0%; + --secondary-color-1-lightness: 20%; + +--secondary-color-2: hsl(0, 0%, 23.1%); + --secondary-color-2-hue: 0; + --secondary-color-2-saturation: 0%; + --secondary-color-2-lightness: 23.1%; + +--secondary-color-3: hsl(0, 0%, 29%); + --secondary-color-3-hue: 0; + --secondary-color-3-saturation: 0%; + --secondary-color-3-lightness: 29%; + +--secondary-color-4: hsl(0, 0%, 36.1%); + --secondary-color-4-hue: 0; + --secondary-color-4-saturation: 0%; + --secondary-color-4-lightness: 36.1%; + + + +/* ----------- NUANCES COLORS ---------------- */ +--theme-nuance-color-1: hsl(165.2, 82.1%, 35.1%); + --theme-nuance-color-1-hue: 165.2; + --theme-nuance-color-1-saturation: 82.1%; + --theme-nuance-color-1-lightness: 35.1%; + +--theme-nuance-color-2: hsl(165.2, 82.1%, 35.1%); + --theme-nuance-color-2-hue: 165.2; + --theme-nuance-color-2-saturation: 82.1%; + --theme-nuance-color-2-lightness: 35.1%; + +--theme-nuance-color-3: hsl(165.2, 81.1%, 35.3%); + --theme-nuance-color-3-hue: 165.2; + --theme-nuance-color-3-saturation: 81.1%; + --theme-nuance-color-3-lightness: 35.3%; + +--theme-nuance-color-4: hsl(164.9, 81.6%, 27.6%); + --theme-nuance-color-4-hue: 164.9; + --theme-nuance-color-4-saturation: 81.6%; + --theme-nuance-color-4-lightness: 27.6%; + + + +/* ----------- ROYGP COLORS ------------------ */ +--theme-red-color: hsl(0.3, 80%, 50%); +--theme-orange-color: #e76f51; +--theme-yellow-color: hsl(60, 70.6%, 73.3%); +--theme-green-color: #A3BE8C; +--theme-purple-color: hsl(0.3, 70%, 45%); + + + +/* ------------------------------------------- */ +--background-color-1: var(--primary-color-1); +--background-color-2: var(--primary-color-2); +--background-color-3: var(--primary-color-3); +--background-color-4: var(--primary-color-4); + +--border-color-1: var(--primary-color-2); +--border-color-2: var(--primary-color-3); +--border-color-3: var(--primary-color-4); + +--border-focus-color: var(--theme-nuance-color-2); +--border-focus-shadow: var(--theme-nuance-color-1); + +--text-color-plain: var(--secondary-color-1); +--text-color-subtile-1: var(--secondary-color-2); +--text-color-subtile-2: var(--secondary-color-3); + +--code-background-color: var(--secondary-color-2); +--code-text-color: var(--primary-color-2); + +--ui-range-thumb-color: var(--primary-color-4); +--ui-range-thumb-border: var(--ui-ranger-thumb-color); + +--textarea-border-color: var(--secondary-color-4); + +--chat-id-color: var(--theme-nuance-color-4); + + + +/* ------------------------------------------- */ +--button-alert-text-hover: var(--primary-color-1); +--button-alert-color-hover: var(--theme-purple-color); +--button-alert-border-hover: var(--theme-purple-color); + +--button-alert-text-active: var(--primary-color-1); +--button-alert-color-active: var(--theme-red-color); +--button-alert-border-active: var(--theme-red-color); + + + +/* ----------- PRIMARY BUTTONS --------------- */ +/* - button should immediately catch the eye - */ +--button-primary-text: + hsl(0, + calc(var(--primary-color-1-saturation) - 100%), + calc(var(--primary-color-1-lightness) + 100%)); + +--button-primary-color: var(--theme-nuance-color-3); +--button-primary-border: var(--theme-nuance-color-3); + + +/* ---------hover---------- */ +--button-primary-text-hover: + hsl(0, + calc(var(--primary-color-1-saturation) - 100%), + calc(var(--primary-color-1-lightness) + 100%)); + +--button-primary-color-hover: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + +--button-primary-border-hover: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + +/* ---------active--------- */ +--button-primary-text-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 100%), + calc(var(--theme-nuance-color-3-lightness) + 100%)); + +--button-primary-color-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 15%)); + +--button-primary-border-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + + +/* ---------- SECONDARY BUTTONS -------------- */ +/* these should NOT immediately catch the eye */ +--button-secondary-text: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 50%)); + +--button-secondary-color: var(--primary-color-3); +--button-secondary-border: var(--primary-color-3); + + +/* ---------hover---------- */ +--button-secondary-text-hover: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-hover: var(--primary-color-4); +--button-secondary-border-hover: var(--primary-color-4); + + +/* ---------active--------- */ +--button-secondary-text-active: + hsl(165.2, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-active: + hsl(0, + calc(var(--primary-color-4-saturation) - 30%), + calc(var(--primary-color-4-lightness) - 15%)); + +--button-secondary-border-active: + hsl(0, + calc(var(--primary-color-4-saturation) - 30%), + calc(var(--primary-color-4-lightness) - 15%)); + + + +/* ---------- TERTIARY BUTTONS --------------- */ +/* ---------- disabled buttons --------------- */ +--button-tertiary-text: var(--primary-color-4); +--button-tertiary-color: var(--primary-color-2); +--button-tertiary-border: var(--primary-color-2); + + +/* ---------hover---------- */ +--button-tertiary-text: var(--primary-color-4); +--button-tertiary-color: var(--primary-color-2); +--button-tertiary-border: var(--primary-color-2); + +} diff --git a/examples/server/public/theme-polarnight.css b/examples/server/public/theme-polarnight.css new file mode 100755 index 000000000..2bcfb33d8 --- /dev/null +++ b/examples/server/public/theme-polarnight.css @@ -0,0 +1,253 @@ +/* Author: Yazan Agha-Schrader */ +/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */ + +.theme-polarnight { + +/* ---------- PRIMARY COLORS ----------------- */ +--primary-color-1: hsl(220.0, 16.4%, 21.6%) ; + --primary-color-1-hue: 220.0; + --primary-color-1-saturation: 16.4%; + --primary-color-1-lightness: 21.6%; + +--primary-color-2: hsl(221.7, 16.3%, 27.6%) ; + -primary-color-2-hue: 221.7; + --primary-color-2-saturation: 16.3%; + --primary-color-2-lightness: 27.6%; + +--primary-color-3: hsl(220.0, 16.8%, 31.6%) ; + --primary-color-3-hue: 220.0; + --primary-color-3-saturation: 16.8%; + --primary-color-3-lightness: 31.6%; + +--primary-color-4: hsl(220.0, 16.5%, 35.7%); + --primary-color-4-hue: 220.0; + --primary-color-4-saturation: 16.5%; + --primary-color-4-lightness: 35.7%; + + + +/* ---------- SECONDARY COLORS --------------- */ +--secondary-color-1: hsl(217.5, 26.7%, 94.1%); + --secondary-color-1-hue: 217.5; + --secondary-color-1-saturation: 26.7%; + --secondary-color-1-lightness: 94.1%; + +--secondary-color-2: hsl(218.2, 26.8%, 92.0%); + --secondary-color-2-hue: 218.2; + --secondary-color-2-saturation: 26.8%; + --secondary-color-2-lightness: 92.0%; + +--secondary-color-3: hsl(218.8, 27.9%, 88.0%); + --secondary-color-3-hue: 218.8; + --secondary-color-3-saturation: 27.9%; + --secondary-color-3-lightness: 88.0%; + +--secondary-color-4: hsl(218.8, 18.3%, 81.8%); + --secondary-color-4-hue: 218.8; + --secondary-color-4-saturation: 18.3%; + --secondary-color-4-lightness: 81.8%; + + + +/* ----------- NUANCES COLORS ---------------- */ +--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%); + --theme-nuance-color-1-hue: 178.7; + --theme-nuance-color-1-saturation: 25.1%; + --theme-nuance-color-1-lightness: 64.9%; + +--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%); + --theme-nuance-color-2-hue: 193.3; + --theme-nuance-color-2-saturation: 43.4%; + --theme-nuance-color-2-lightness: 67.5%; + +--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%); + --theme-nuance-color-3-hue: 210.0; + --theme-nuance-color-3-saturation: 34.0%; + --theme-nuance-color-3-lightness: 63.1%; + +--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%); + --theme-nuance-color-4-hue: 213.1; + --theme-nuance-color-4-saturation: 32.0%; + --theme-nuance-color-4-lightness: 52.2%; + + + +/* ----------- ROYGP COLORS ------------------ */ +--theme-red-color: hsl(354.3, 42.3%, 56.5%); +--theme-orange-color: hsl(20, 85%, 50%); +--theme-yellow-color: hsl(20, 75%, 45%); +--theme-green-color: hsl( 92.4, 27.8%, 64.7%); +--theme-purple-color: hsl(311.1, 20.2%, 63.1%); + + + +/* ------------------------------------------------ */ +--background-color-1: var(--primary-color-1); +--background-color-2: var(--primary-color-2); +--background-color-3: var(--primary-color-3); +--background-color-4: var(--primary-color-4); + +--border-color-1: var(--primary-color-2); +--border-color-2: var(--primary-color-3); +--border-color-3: var(--primary-color-4); + +--border-focus-color: var(--theme-nuance-color-2); +--border-focus-shadow: var(--theme-nuance-color-1); + +--text-color-plain: var(--secondary-color-1); +--text-color-subtile-1: var(--secondary-color-2); +--text-color-subtile-2: var(--secondary-color-3); + +--code-background-color: var(--secondary-color-2); +--code-text-color: var(--primary-color-2); + +--ui-range-thumb-color: var(--theme-nuance-color-3); +--ui-range-thumb-border: var(--ui-ranger-thumb-color); + +--textarea-border-color: var(--secondary-color-4); + +--chat-id-color: var(--theme-nuance-color-4); + + + +/* ------------------------------------------- */ +--button-alert-text-hover: var(--secondary-color-1); +--button-alert-color-hover: var(--theme-yellow-color); +--button-alert-border-hover: var(--theme-yellow-color); + +--button-alert-text-active: var(--secondary-color-1); +--button-alert-color-active: var(--theme-orange-color); +--button-alert-border-active: var(--theme-orange-color); + + + +/* ----------- PRIMARY BUTTONS --------------- */ +/* - button should immediately catch the eye - */ +--button-primary-text: var(--secondary-color-1); +--button-primary-color: var(--theme-nuance-color-3); +--button-primary-border: var(--theme-nuance-color-3); + + +/* ---------hover---------- */ +--button-primary-text-hover: + hsl(217.5, + calc(var(--secondary-color-1-saturation) - 35%), + calc(var(--secondary-color-1-lightness) + 30%)); + +--button-primary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + +--button-primary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + +/* ---------active--------- */ +--button-primary-text-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 35%)); + +--button-primary-color-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 25%)); + +--button-primary-border-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 25%)); + + + +/* ---------- SECONDARY BUTTONS -------------- */ +/* these should NOT immediately catch the eye */ +--button-secondary-text: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 50%)); + +--button-secondary-color: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + +--button-secondary-border: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + +/* ---------hover---------- */ +--button-secondary-text-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 22%), + calc(var(--theme-nuance-color-3-lightness) + 1%)); + +--button-secondary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 22%), + calc(var(--theme-nuance-color-3-lightness) + 1%)); + + +/* ---------active--------- */ +--button-secondary-text-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 25%)); + +--button-secondary-color-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 30%), + calc(var(--theme-nuance-color-3-lightness) - 15%)); + +--button-secondary-border-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 30%), + calc(var(--theme-nuance-color-3-lightness) - 15%)); + + + +/* ---------- TERTIARY BUTTONS --------------- */ +/* ---------- disabled buttons --------------- */ +--button-tertiary-text: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-tertiary-color: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +--button-tertiary-border: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + + +/* ---------hover---------- */ +--button-tertiary-text-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-tertiary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +--button-tertiary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +} diff --git a/examples/server/public/theme-snowstorm.css b/examples/server/public/theme-snowstorm.css new file mode 100755 index 000000000..7bb227594 --- /dev/null +++ b/examples/server/public/theme-snowstorm.css @@ -0,0 +1,251 @@ +/* Author: Yazan Agha-Schrader */ +/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */ + +.theme-snowstorm { + +/* ---------- PRIMARY COLORS ----------------- */ +--primary-color-1: hsl(217.5, 26.7%, 94.1%); + --primary-color-1-hue: 217.5; + --primary-color-1-saturation: 26.7%; + --primary-color-1-lightness: 94.1%; + +--primary-color-2: hsl(218.2, 26.8%, 92.0%); + --primary-color-2-hue: 218.2; + --primary-color-2-saturation: 26.8%; + --primary-color-2-lightness: 92.0%; + +--primary-color-3: hsl(218.8, 27.9%, 88.0%); + --primary-color-3-hue: 218.8; + --primary-color-3-saturation: 27.9%; + --primary-color-3-lightness: 88.0%; + +--primary-color-4: hsl(218.8, 18.3%, 81.8%); + --primary-color-4-hue: 218.8; + --primary-color-4-saturation: 18.3%; + --primary-color-4-lightness: 81.8%; + + +/* ---------- SECONDARY COLORS --------------- */ +--secondary-color-1: hsl(220.0, 16.4%, 21.6%); + --secondary-color-1-hue: 220.0; + --secondary-color-1-saturation: 16.4%; + --secondary-color-1-lightness: 21.6%; + +--secondary-color-2: hsl(221.7, 16.3%, 27.6%); + --secondary-color-2-hue: 221.7; + --secondary-color-2-saturation: 16.3%; + --secondary-color-2-lightness: 27.6%; + +--secondary-color-3: hsl(220.0, 16.8%, 31.6%); + --secondary-color-3-hue: 220.0; + --secondary-color-3-saturation: 16.8%; + --secondary-color-3-lightness: 31.6%; + +--secondary-color-4: hsl(220.0, 16.5%, 35.7%); + --secondary-color-4-hue: 220.0; + --secondary-color-4-saturation: 16.5%; + --secondary-color-4-lightness: 35.7%; + + + +/* ----------- NUANCES COLORS ---------------- */ +--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%); + --theme-nuance-color-1-hue: 178.7; + --theme-nuance-color-1-saturation: 25.1%; + --theme-nuance-color-1-lightness: 64.9%; + +--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%); + --theme-nuance-color-2-hue: 193.3; + --theme-nuance-color-2-saturation: 43.4%; + --theme-nuance-color-2-lightness: 67.5%; + +--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%); + --theme-nuance-color-3-hue: 210.0; + --theme-nuance-color-3-saturation: 34.0%; + --theme-nuance-color-3-lightness: 63.1%; + +--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%); + --theme-nuance-color-4-hue: 213.1; + --theme-nuance-color-4-saturation: 32.0%; + --theme-nuance-color-4-lightness: 52.2%; + + + +/* ----------- ROYGP COLORS ------------------ */ +--theme-red-color: hsl(32.5, 80%, 50%); +--theme-orange-color: hsl(32.5, 70%, 45%); +--theme-yellow-color: hsl(40.0, 0.6%, 73.3%); +--theme-green-color: hsl(92.4, 27.8%, 64.7%); +--theme-purple-color: hsl(311.1, 20.2%, 63.1%); + + + +/* ------------------------------------------- */ +--background-color-1: var(--primary-color-1); +--background-color-2: var(--primary-color-2); +--background-color-3: var(--primary-color-3); +--background-color-4: var(--primary-color-4); + +--border-color-1: var(--primary-color-2); +--border-color-2: var(--primary-color-3); +--border-color-3: var(--primary-color-4); + +--border-focus-color: var(--theme-nuance-color-2); +--border-focus-shadow: var(--theme-nuance-color-1); + +--text-color-plain: var(--secondary-color-1); +--text-color-subtile-1: var(--secondary-color-2); +--text-color-subtile-2: var(--secondary-color-3); + +--code-background-color: var(--secondary-color-2); +--code-text-color: var(--primary-color-2); + +--ui-range-thumb-color: var(--theme-nuance-color-3); +--ui-range-thumb-border: var(--ui-ranger-thumb-color); + +--textarea-border-color: var(--secondary-color-4); + +--chat-id-color: var(--theme-nuance-color-4); + + + +/* ------------------------------------------- */ +--button-alert-text-hover: var(--primary-color-1); +--button-alert-color-hover: var(--theme-orange-color); +--button-alert-border-hover: var(--theme-orange-color); + +--button-alert-text-active: var(--primary-color-1); +--button-alert-color-active: var(--theme-red-color); +--button-alert-border-active: var(--theme-red-color); + + + +/* ----------- PRIMARY BUTTONS --------------- */ +/* - button should immediately catch the eye - */ +--button-primary-text: var(--secondary-color-1); +--button-primary-color: var(--theme-nuance-color-3); +--button-primary-border: var(--theme-nuance-color-3); + + +/* ---------hover---------- */ +--button-primary-text-hover: + hsl(217.5, + calc(var(--secondary-color-1-saturation) + 35%), + calc(var(--secondary-color-1-lightness) - 30%)); + +--button-primary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + +--button-primary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 2%), + calc(var(--theme-nuance-color-3-lightness) - 10%)); + + +/* ---------active--------- */ +--button-primary-text-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 35%)); + +--button-primary-color-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 25%)); + +--button-primary-border-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 10%), + calc(var(--theme-nuance-color-3-lightness) - 25%)); + + + +/* ---------- SECONDARY BUTTONS -------------- */ +/* these should NOT immediately catch the eye */ +--button-secondary-text: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 50%)); + +--button-secondary-color: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + +--button-secondary-border: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) + 10%)); + + +/* ---------hover---------- */ +--button-secondary-text-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 20%), + calc(var(--theme-nuance-color-3-lightness) - 80%)); + +--button-secondary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 22%), + calc(var(--theme-nuance-color-3-lightness) + 1%)); + +--button-secondary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 22%), + calc(var(--theme-nuance-color-3-lightness) + 1%)); + + +/* ---------active--------- */ +--button-secondary-text-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) + 40%), + calc(var(--theme-nuance-color-3-lightness) - 55%)); + +--button-secondary-color-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 30%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-secondary-border-active: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 30%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + + + +/* ---------- TERTIARY BUTTONS --------------- */ +/* ---------- disabled buttons --------------- */ +--button-tertiary-text: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-tertiary-color: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +--button-tertiary-border: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +/* ---------hover---------- */ +--button-tertiary-text-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) - 5%)); + +--button-tertiary-color-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +--button-tertiary-border-hover: + hsl(210, + calc(var(--theme-nuance-color-3-saturation) - 40%), + calc(var(--theme-nuance-color-3-lightness) + 20%)); + +} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e9904263d..fc6d90848 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -17,9 +17,20 @@ #include "json.hpp" // auto generated files (update with ./deps.sh) +#include "colorthemes.css.hpp" +#include "style.css.hpp" +#include "theme-beeninorder.css.hpp" +#include "theme-ketivah.css.hpp" +#include "theme-mangotango.css.hpp" +#include "theme-playground.css.hpp" +#include "theme-polarnight.css.hpp" +#include "theme-snowstorm.css.hpp" #include "index.html.hpp" +#include "index-new.html.hpp" #include "index.js.hpp" #include "completion.js.hpp" +#include "system-prompts.js.hpp" +#include "prompt-formats.js.hpp" #include "json-schema-to-grammar.mjs.hpp" #include @@ -3750,13 +3761,25 @@ int main(int argc, char ** argv) { // Set the base directory for serving static files svr->set_base_dir(sparams.public_path); } - // using embedded static files svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); svr->Get("/json-schema-to-grammar.mjs", handle_static_file( - json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + + // add new-ui files + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); // register API routes svr->Get ("/health", handle_health); From e141ce624af57bdffbaf57014a044eb1d9689230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 1 Jun 2024 23:26:10 +0200 Subject: [PATCH 6/6] Fix FlashAttention debug test, FP32 assert (#7684) --- ggml-cuda/fattn-vec-f32.cuh | 4 ---- tests/test-backend-ops.cpp | 8 +++++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/ggml-cuda/fattn-vec-f32.cuh b/ggml-cuda/fattn-vec-f32.cuh index ce23a4ebd..ddf0c8374 100644 --- a/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml-cuda/fattn-vec-f32.cuh @@ -278,14 +278,10 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, template void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * KQV = dst; ggml_tensor * Q = dst->src[0]; ggml_tensor * K = dst->src[1]; ggml_tensor * V = dst->src[2]; - const int32_t precision = KQV->op_params[2]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); - GGML_ASSERT(K->type == type_K); GGML_ASSERT(V->type == type_V); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 777230127..8dc90a45d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1584,9 +1584,11 @@ struct test_flash_attn_ext : public test_case { : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1); - ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs, kv, nh, 1); - ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs, kv, nh, 1); + const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV)); + + ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1); + ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); + ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr; ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias); return out;