Show tokens per second
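For context: the new `showTokensPerSecond` setting in `CONFIG_DEFAULT` displays the generation speed derived from the `timings` object that the server attaches to assistant messages (the shape is visible in `demo-conversation.json`). A minimal sketch of that derivation, using a hypothetical helper name that is not part of this diff:

```js
// Hypothetical helper (not part of this diff): compute tokens/s from a message's timings.
function tokensPerSecond(timings) {
  if (!timings || !timings.predicted_ms) return null;
  // predicted_n tokens were generated in predicted_ms milliseconds
  return timings.predicted_n / (timings.predicted_ms / 1000);
}

// For the demo message: 25 / (573.016 / 1000) ≈ 43.6 tokens/s
```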
diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json
index f9104f65f..bbebccbf2 100644
--- a/examples/server/webui/package-lock.json
+++ b/examples/server/webui/package-lock.json
@@ -8,8 +8,12 @@
"name": "webui",
"version": "0.0.0",
"dependencies": {
+ "@sec-ant/readable-stream": "^0.6.0",
+ "@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20",
"daisyui": "^4.12.14",
+ "highlight.js": "^11.10.0",
+ "katex": "^0.16.15",
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.15",
@@ -18,6 +22,7 @@
"vue": "^3.5.13"
},
"devDependencies": {
+ "sass-embedded": "^1.83.0",
"vite": "^5.4.10"
}
},
@@ -33,6 +38,13 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
+ "node_modules/@bufbuild/protobuf": {
+ "version": "2.2.3",
+ "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz",
+ "integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==",
+ "devOptional": true,
+ "license": "(Apache-2.0 AND BSD-3-Clause)"
+ },
"node_modules/@esbuild/aix-ppc64": {
"version": "0.21.5",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
@@ -606,6 +618,21 @@
"win32"
]
},
+ "node_modules/@sec-ant/readable-stream": {
+ "version": "0.6.0",
+ "resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz",
+ "integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==",
+ "license": "MIT"
+ },
+ "node_modules/@vscode/markdown-it-katex": {
+ "version": "1.1.1",
+ "resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz",
+ "integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==",
+ "license": "MIT",
+ "dependencies": {
+ "katex": "^0.16.4"
+ }
+ },
"node_modules/@vue/compiler-dom": {
"version": "3.5.13",
"resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz",
@@ -1004,6 +1031,13 @@
"browserslist": ">= 4.21.0"
}
},
+ "node_modules/buffer-builder": {
+ "version": "0.2.0",
+ "resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz",
+ "integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==",
+ "devOptional": true,
+ "license": "MIT/X11"
+ },
"node_modules/caniuse-lite": {
"version": "1.0.30001684",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz",
@@ -1166,6 +1200,22 @@
"node": ">=8.0"
}
},
+ "node_modules/colorjs.io": {
+ "version": "0.5.2",
+ "resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz",
+ "integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==",
+ "devOptional": true,
+ "license": "MIT"
+ },
+ "node_modules/commander": {
+ "version": "8.3.0",
+ "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz",
+ "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 12"
+ }
+ },
"node_modules/css-selector-tokenizer": {
"version": "0.8.0",
"resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz",
@@ -1473,6 +1523,31 @@
"node": ">=10.13.0"
}
},
+ "node_modules/has-flag": {
+ "version": "4.0.0",
+ "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+ "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+ "devOptional": true,
+ "license": "MIT",
+ "engines": {
+ "node": ">=8"
+ }
+ },
+ "node_modules/highlight.js": {
+ "version": "11.10.0",
+ "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.10.0.tgz",
+ "integrity": "sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==",
+ "engines": {
+ "node": ">=12.0.0"
+ }
+ },
+ "node_modules/immutable": {
+ "version": "5.0.3",
+ "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz",
+ "integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==",
+ "devOptional": true,
+ "license": "MIT"
+ },
"node_modules/is-glob": {
"version": "4.0.3",
"resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
@@ -1503,6 +1578,22 @@
"jiti": "bin/jiti.js"
}
},
+ "node_modules/katex": {
+ "version": "0.16.15",
+ "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.15.tgz",
+ "integrity": "sha512-yE9YJIEAk2aZ+FL/G8r+UGw0CTUzEA8ZFy6E+8tc3spHUKq3qBnzCkI1CQwGoI9atJhVyFPEypQsTY7mJ1Pi9w==",
+ "funding": [
+ "https://opencollective.com/katex",
+ "https://github.com/sponsors/katex"
+ ],
+ "license": "MIT",
+ "dependencies": {
+ "commander": "^8.3.0"
+ },
+ "bin": {
+ "katex": "cli.js"
+ }
+ },
"node_modules/lilconfig": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz",
@@ -2022,6 +2113,381 @@
"integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==",
"license": "MIT"
},
+ "node_modules/rxjs": {
+ "version": "7.8.1",
+ "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz",
+ "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==",
+ "devOptional": true,
+ "license": "Apache-2.0",
+ "dependencies": {
+ "tslib": "^2.1.0"
+ }
+ },
+ "node_modules/sass-embedded": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.0.tgz",
+ "integrity": "sha512-/8cYZeL39evUqe0o//193na51Q1VWZ61qhxioQvLJwOtWIrX+PgNhCyD8RSuTtmzc4+6+waFZf899bfp/MCUwA==",
+ "devOptional": true,
+ "license": "MIT",
+ "dependencies": {
+ "@bufbuild/protobuf": "^2.0.0",
+ "buffer-builder": "^0.2.0",
+ "colorjs.io": "^0.5.0",
+ "immutable": "^5.0.2",
+ "rxjs": "^7.4.0",
+ "supports-color": "^8.1.1",
+ "sync-child-process": "^1.0.2",
+ "varint": "^6.0.0"
+ },
+ "bin": {
+ "sass": "dist/bin/sass.js"
+ },
+ "engines": {
+ "node": ">=16.0.0"
+ },
+ "optionalDependencies": {
+ "sass-embedded-android-arm": "1.83.0",
+ "sass-embedded-android-arm64": "1.83.0",
+ "sass-embedded-android-ia32": "1.83.0",
+ "sass-embedded-android-riscv64": "1.83.0",
+ "sass-embedded-android-x64": "1.83.0",
+ "sass-embedded-darwin-arm64": "1.83.0",
+ "sass-embedded-darwin-x64": "1.83.0",
+ "sass-embedded-linux-arm": "1.83.0",
+ "sass-embedded-linux-arm64": "1.83.0",
+ "sass-embedded-linux-ia32": "1.83.0",
+ "sass-embedded-linux-musl-arm": "1.83.0",
+ "sass-embedded-linux-musl-arm64": "1.83.0",
+ "sass-embedded-linux-musl-ia32": "1.83.0",
+ "sass-embedded-linux-musl-riscv64": "1.83.0",
+ "sass-embedded-linux-musl-x64": "1.83.0",
+ "sass-embedded-linux-riscv64": "1.83.0",
+ "sass-embedded-linux-x64": "1.83.0",
+ "sass-embedded-win32-arm64": "1.83.0",
+ "sass-embedded-win32-ia32": "1.83.0",
+ "sass-embedded-win32-x64": "1.83.0"
+ }
+ },
+ "node_modules/sass-embedded-android-arm": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.0.tgz",
+ "integrity": "sha512-uwFSXzJlfbd4Px189xE5l+cxN8+TQpXdQgJec7TIrb4HEY7imabtpYufpVdqUVwT1/uiis5V4+qIEC4Vl5XObQ==",
+ "cpu": [
+ "arm"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "android"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-android-arm64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.0.tgz",
+ "integrity": "sha512-GBiCvM4a2rkWBLdYDxI6XYnprfk5U5c81g69RC2X6kqPuzxzx8qTArQ9M6keFK4+iDQ5N9QTwFCr0KbZTn+ZNQ==",
+ "cpu": [
+ "arm64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "android"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-android-ia32": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.0.tgz",
+ "integrity": "sha512-5ATPdGo2SICqAhiJl/Z8KQ23zH4sGgobGgux0TnrNtt83uHZ+r+To/ubVJ7xTkZxed+KJZnIpolGD8dQyQqoTg==",
+ "cpu": [
+ "ia32"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "android"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-android-riscv64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.0.tgz",
+ "integrity": "sha512-aveknUOB8GZewOzVn2Uwk+DKcncTR50Q6vtzslNMGbYnxtgQNHzy8A1qVEviNUruex+pHofppeMK4iMPFAbiEQ==",
+ "cpu": [
+ "riscv64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "android"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-android-x64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.0.tgz",
+ "integrity": "sha512-WqIay/72ncyf9Ph4vS742J3a73wZihWmzFUwpn1OD6lme1Aj4eWzWIve5IVnlTEJgcZcDHu6ECID9IZgehJKoA==",
+ "cpu": [
+ "x64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "android"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-darwin-arm64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.0.tgz",
+ "integrity": "sha512-XQl9QqgxFFIPm/CzHhmppse5o9ocxrbaAdC2/DAnlAqvYWBBtgFqPjGoYlej13h9SzfvNoogx+y9r+Ap+e+hYg==",
+ "cpu": [
+ "arm64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "darwin"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-darwin-x64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.0.tgz",
+ "integrity": "sha512-ERQ7Tvp1kFOW3ux4VDFIxb7tkYXHYc+zJpcrbs0hzcIO5ilIRU2tIOK1OrNwrFO6Qxyf7AUuBwYKLAtIU/Nz7g==",
+ "cpu": [
+ "x64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "darwin"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-arm": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.0.tgz",
+ "integrity": "sha512-baG9RYBJxUFmqwDNC9h9ZFElgJoyO3jgHGjzEZ1wHhIS9anpG+zZQvO8bHx3dBpKEImX+DBeLX+CxsFR9n81gQ==",
+ "cpu": [
+ "arm"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-arm64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.0.tgz",
+ "integrity": "sha512-syEAVTJt4qhaMLxrSwOWa46zdqHJdnqJkLUK+t9aCr8xqBZLPxSUeIGji76uOehQZ1C+KGFj6n9xstHN6wzOJw==",
+ "cpu": [
+ "arm64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-ia32": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.0.tgz",
+ "integrity": "sha512-RRBxQxMpoxu5+XcSSc6QR/o9asEwUzR8AbCS83RaXcdTIHTa/CccQsiAoDDoPlRsMTLqnzs0LKL4CfOsf7zBbA==",
+ "cpu": [
+ "ia32"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-musl-arm": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.0.tgz",
+ "integrity": "sha512-Yc7u2TelCfBab+PRob9/MNJFh3EooMiz4urvhejXkihTiKSHGCv5YqDdtWzvyb9tY2Jb7YtYREVuHwfdVn3dTQ==",
+ "cpu": [
+ "arm"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-musl-arm64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.0.tgz",
+ "integrity": "sha512-Y7juhPHClUO2H5O+u+StRy6SEAcwZ+hTEk5WJdEmo1Bb1gDtfHvJaWB/iFZJ2tW0W1e865AZeUrC4OcOFjyAQA==",
+ "cpu": [
+ "arm64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-musl-ia32": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.0.tgz",
+ "integrity": "sha512-arQeYwGmwXV8byx5G1PtSzZWW1jbkfR5qrIHMEbTFSAvAxpqjgSvCvrHMOFd73FcMxVaYh4BX9LQNbKinkbEdg==",
+ "cpu": [
+ "ia32"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-musl-riscv64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.0.tgz",
+ "integrity": "sha512-E6uzlIWz59rut+Z3XR6mLG915zNzv07ISvj3GUNZENdHM7dF8GQ//ANoIpl5PljMQKp89GnYdvo6kj2gnaBf/g==",
+ "cpu": [
+ "riscv64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-musl-x64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.0.tgz",
+ "integrity": "sha512-eAMK6tyGqvqr21r9g8BnR3fQc1rYFj85RGduSQ3xkITZ6jOAnOhuU94N5fwRS852Hpws0lXhET+7JHXgg3U18w==",
+ "cpu": [
+ "x64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-riscv64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.0.tgz",
+ "integrity": "sha512-Ojpi78pTv02sy2fUYirRGXHLY3fPnV/bvwuC2i5LwPQw2LpCcFyFTtN0c5h4LJDk9P6wr+/ZB/JXU8tHIOlK+Q==",
+ "cpu": [
+ "riscv64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-linux-x64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.0.tgz",
+ "integrity": "sha512-3iLjlXdoPfgZRtX4odhRvka1BQs5mAXqfCtDIQBgh/o0JnGPzJIWWl9bYLpHxK8qb+uyVBxXYgXpI0sCzArBOw==",
+ "cpu": [
+ "x64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "linux"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-win32-arm64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.0.tgz",
+ "integrity": "sha512-iOHw/8/t2dlTW3lOFwG5eUbiwhEyGWawivlKWJ8lkXH7fjMpVx2VO9zCFAm8RvY9xOHJ9sf1L7g5bx3EnNP9BQ==",
+ "cpu": [
+ "arm64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "win32"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-win32-ia32": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.0.tgz",
+ "integrity": "sha512-2PxNXJ8Pad4geVcTXY4rkyTr5AwbF8nfrCTDv0ulbTvPhzX2mMKEGcBZUXWn5BeHZTBc6whNMfS7d5fQXR9dDQ==",
+ "cpu": [
+ "ia32"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "win32"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
+ "node_modules/sass-embedded-win32-x64": {
+ "version": "1.83.0",
+ "resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.0.tgz",
+ "integrity": "sha512-muBXkFngM6eLTNqOV0FQi7Dv9s+YRQ42Yem26mosdan/GmJQc81deto6uDTgrYn+bzFNmiXcOdfm+0MkTWK3OQ==",
+ "cpu": [
+ "x64"
+ ],
+ "license": "MIT",
+ "optional": true,
+ "os": [
+ "win32"
+ ],
+ "engines": {
+ "node": ">=14.0.0"
+ }
+ },
"node_modules/sucrase": {
"version": "3.35.0",
"resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz",
@@ -2641,6 +3107,45 @@
"node": ">=8"
}
},
+ "node_modules/supports-color": {
+ "version": "8.1.1",
+ "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
+ "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==",
+ "devOptional": true,
+ "license": "MIT",
+ "dependencies": {
+ "has-flag": "^4.0.0"
+ },
+ "engines": {
+ "node": ">=10"
+ },
+ "funding": {
+ "url": "https://github.com/chalk/supports-color?sponsor=1"
+ }
+ },
+ "node_modules/sync-child-process": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz",
+ "integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==",
+ "devOptional": true,
+ "license": "MIT",
+ "dependencies": {
+ "sync-message-port": "^1.0.0"
+ },
+ "engines": {
+ "node": ">=16.0.0"
+ }
+ },
+ "node_modules/sync-message-port": {
+ "version": "1.1.3",
+ "resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz",
+ "integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==",
+ "devOptional": true,
+ "license": "MIT",
+ "engines": {
+ "node": ">=16.0.0"
+ }
+ },
"node_modules/tailwindcss": {
"version": "3.4.15",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz",
@@ -2684,12 +3189,26 @@
"integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
"license": "MIT"
},
+ "node_modules/tslib": {
+ "version": "2.8.1",
+ "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+ "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+ "devOptional": true,
+ "license": "0BSD"
+ },
"node_modules/uc.micro": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
"integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==",
"license": "MIT"
},
+ "node_modules/varint": {
+ "version": "6.0.0",
+ "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz",
+ "integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==",
+ "devOptional": true,
+ "license": "MIT"
+ },
"node_modules/vite": {
"version": "5.4.11",
"resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz",
diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json
index d656a841d..2836cce00 100644
--- a/examples/server/webui/package.json
+++ b/examples/server/webui/package.json
@@ -6,14 +6,20 @@
"scripts": {
"dev": "vite",
"build": "vite build",
- "preview": "vite preview"
+ "preview": "vite preview",
+ "analyze": "ANALYZE=1 npx vite-bundle-visualizer"
},
"devDependencies": {
+ "sass-embedded": "^1.83.0",
"vite": "^5.4.10"
},
"dependencies": {
+ "@sec-ant/readable-stream": "^0.6.0",
+ "@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20",
"daisyui": "^4.12.14",
+ "highlight.js": "^11.10.0",
+ "katex": "^0.16.15",
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.15",
diff --git a/examples/server/webui/public/demo-conversation.json b/examples/server/webui/public/demo-conversation.json
new file mode 100644
index 000000000..75ab599dd
--- /dev/null
+++ b/examples/server/webui/public/demo-conversation.json
@@ -0,0 +1,33 @@
+{
+ "demo": true,
+ "id": "conv-1734086746930",
+ "lastModified": 1734087548943,
+ "messages": [
+ {
+ "id": 1734086764521,
+ "role": "user",
+ "content": "this is a demo conversation, used in dev mode"
+ },
+ {
+ "id": 1734087548327,
+ "role": "assistant",
+ "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
+ "timings": {
+ "prompt_n": 1,
+ "prompt_ms": 28.923,
+ "predicted_n": 25,
+ "predicted_ms": 573.016
+ }
+ },
+ {
+ "id": 1734087548328,
+ "role": "user",
+ "content": "this is a demo conversation, used in dev mode"
+ },
+ {
+ "id": 1734087548329,
+ "role": "assistant",
+ "content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```"
+ }
+ ]
+}
diff --git a/examples/server/webui/src/highlight-config.js b/examples/server/webui/src/highlight-config.js
new file mode 100644
index 000000000..96c7028f9
--- /dev/null
+++ b/examples/server/webui/src/highlight-config.js
@@ -0,0 +1,60 @@
+import hljs from 'highlight.js/lib/core';
+
+// only import commonly used languages to reduce bundle size
+
+import python from 'highlight.js/lib/languages/python';
+import javascript from 'highlight.js/lib/languages/javascript';
+import json from 'highlight.js/lib/languages/json';
+import bash from 'highlight.js/lib/languages/bash';
+import yaml from 'highlight.js/lib/languages/yaml';
+import markdown from 'highlight.js/lib/languages/markdown';
+import scss from 'highlight.js/lib/languages/scss';
+import xml from 'highlight.js/lib/languages/xml';
+import ruby from 'highlight.js/lib/languages/ruby';
+import go from 'highlight.js/lib/languages/go';
+import java from 'highlight.js/lib/languages/java';
+import rust from 'highlight.js/lib/languages/rust';
+import scala from 'highlight.js/lib/languages/scala';
+import cpp from 'highlight.js/lib/languages/cpp';
+import csharp from 'highlight.js/lib/languages/csharp';
+import swift from 'highlight.js/lib/languages/swift';
+import dart from 'highlight.js/lib/languages/dart';
+import elixir from 'highlight.js/lib/languages/elixir';
+import kotlin from 'highlight.js/lib/languages/kotlin';
+import lua from 'highlight.js/lib/languages/lua';
+import php from 'highlight.js/lib/languages/php';
+import latex from 'highlight.js/lib/languages/latex';
+
+hljs.registerLanguage('python', python);
+hljs.registerLanguage('javascript', javascript);
+hljs.registerLanguage('json', json);
+hljs.registerLanguage('yaml', yaml);
+hljs.registerLanguage('markdown', markdown);
+hljs.registerLanguage('xml', xml);
+hljs.registerLanguage('ruby', ruby);
+hljs.registerLanguage('go', go);
+hljs.registerLanguage('java', java);
+hljs.registerLanguage('rust', rust);
+hljs.registerLanguage('scala', scala);
+hljs.registerLanguage('csharp', csharp);
+hljs.registerLanguage('swift', swift);
+hljs.registerLanguage('dart', dart);
+hljs.registerLanguage('elixir', elixir);
+hljs.registerLanguage('kotlin', kotlin);
+hljs.registerLanguage('lua', lua);
+hljs.registerLanguage('php', php);
+hljs.registerLanguage('latex', latex);
+
+// reuse some languages to further reduce bundle size
+
+hljs.registerLanguage('shell', bash);
+hljs.registerLanguage('bash', bash);
+hljs.registerLanguage('sh', bash);
+
+hljs.registerLanguage('css', scss);
+hljs.registerLanguage('scss', scss);
+
+hljs.registerLanguage('c', cpp);
+hljs.registerLanguage('cpp', cpp);
+
+export default hljs;
diff --git a/examples/server/webui/src/katex-gpt.js b/examples/server/webui/src/katex-gpt.js
new file mode 100644
index 000000000..7c7c5e22c
--- /dev/null
+++ b/examples/server/webui/src/katex-gpt.js
@@ -0,0 +1,66 @@
+import katex from 'katex';
+
+// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
+// MIT license
+
+const defaultOptions = {
+ delimiters: [
+ { left: '\\[', right: '\\]', display: true },
+ { left: '\\(', right: '\\)', display: false },
+ ],
+};
+
+export function renderLatexHTML(content, display = false) {
+ return katex.renderToString(content, {
+ throwOnError: false,
+ output: 'mathml',
+ displayMode: display,
+ });
+}
+
+function escapedBracketRule(options) {
+ return (state, silent) => {
+ const max = state.posMax;
+ const start = state.pos;
+
+ for (const { left, right, display } of options.delimiters) {
+
+ // Check if it starts with the left delimiter
+ if (!state.src.slice(start).startsWith(left)) continue;
+
+ // Skip the length of the left delimiter
+ let pos = start + left.length;
+
+ // Find the matching right delimiter
+ while (pos < max) {
+ if (state.src.slice(pos).startsWith(right)) {
+ break;
+ }
+ pos++;
+ }
+
+ // No matching right delimiter found, skip to the next match
+ if (pos >= max) continue;
+
+ // If not in silent mode, convert LaTeX formula to MathML
+ if (!silent) {
+ const content = state.src.slice(start + left.length, pos);
+ try {
+ const renderedContent = renderLatexHTML(content, display);
+ const token = state.push('html_inline', '', 0);
+ token.content = renderedContent;
+ } catch (e) {
+ console.error(e);
+ }
+ }
+
+ // Update position, skip the length of the right delimiter
+ state.pos = pos + right.length;
+ return true;
+ }
+ }
+}
+
+export default function (md, options = defaultOptions) {
+ md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
+}
diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js
index f1f35481d..2a3021ce9 100644
--- a/examples/server/webui/src/main.js
+++ b/examples/server/webui/src/main.js
@@ -1,8 +1,20 @@
-import './styles.css';
+import './styles.scss';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';
+// math formula rendering
+import 'katex/dist/katex.min.css';
+import markdownItKatexGpt from './katex-gpt';
+import markdownItKatexNormal from '@vscode/markdown-it-katex';
+
+// code highlighting
+import hljs from './highlight-config';
+import daisyuiThemes from 'daisyui/src/theming/themes';
+
+// ponyfill for missing ReadableStream asyncIterator on Safari
+import { asyncIterator } from "@sec-ant/readable-stream/ponyfill/asyncIterator";
+
const isDev = import.meta.env.MODE === 'development';
// utility functions
@@ -13,15 +25,18 @@ const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str);
// constants
-const BASE_URL = localStorage.getItem('base') // for debugging
- || (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
+const BASE_URL = isDev
+ ? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
+ : (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
+console.log({ BASE_URL });
+
const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
// make sure these default values are in sync with `common.h`
- samplers: 'dkypmxt',
+ samplers: 'edkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
@@ -69,12 +84,39 @@ const CONFIG_INFO = {
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
-const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
+const THEMES = ['light', 'dark']
+ // make sure light & dark are always at the beginning
+ .concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
// markdown support
const VueMarkdown = defineComponent(
(props) => {
- const md = shallowRef(new MarkdownIt({ breaks: true }));
+ const md = shallowRef(new MarkdownIt({
+ breaks: true,
+ highlight: function (str, lang) { // Add highlight.js
+ if (lang && hljs.getLanguage(lang)) {
+ try {
+ return '<pre><code class="hljs">' +
+ hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
+ '</code></pre>';
+ } catch (__) {}
+ }
+ return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
+ }
+ }));
+ // support latex with double dollar sign and square brackets
+ md.value.use(markdownItKatexGpt, {
+ delimiters: [
+ { left: '\\[', right: '\\]', display: true },
+ { left: '\\(', right: '\\)', display: false },
+ { left: '$$', right: '$$', display: false },
+ // do not add the single dollar sign here, otherwise it will be confused with the dollar sign used as a currency symbol
+ ],
+ throwOnError: false,
+ });
+ // support latex with single dollar sign
+ md.value.use(markdownItKatexNormal, { throwOnError: false });
+ // add copy button to code blocks
const origFenchRenderer = md.value.renderer.rules.fence;
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
const content = tokens[idx].content;
@@ -244,7 +286,7 @@ async function* sendSSEPostRequest(url, fetchOptions) {
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
- for await (const line of lines) {
+ for await (const line of asyncIterator(lines)) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
@@ -278,6 +320,7 @@ const mainApp = createApp({
themes: THEMES,
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
+ isDev,
}
},
computed: {},
@@ -289,6 +332,7 @@ const mainApp = createApp({
if (this.isGenerating) chatScrollToBottom(true);
});
resizeObserver.observe(pendingMsgElem);
+ this.setSelectedTheme(this.selectedTheme);
},
watch: {
viewingConvId: function(val, oldVal) {
@@ -305,6 +349,8 @@ const mainApp = createApp({
},
setSelectedTheme(theme) {
this.selectedTheme = theme;
+ document.body.setAttribute('data-theme', theme);
+ document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
StorageUtils.setTheme(theme);
},
newConversation() {
@@ -513,6 +559,17 @@ const mainApp = createApp({
fetchMessages() {
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
},
+
+ // debug functions
+ async debugImportDemoConv() {
+ const res = await fetch('/demo-conversation.json');
+ const demoConv = await res.json();
+ StorageUtils.remove(demoConv.id);
+ for (const msg of demoConv.messages) {
+ StorageUtils.appendMsg(demoConv.id, msg);
+ }
+ this.fetchConversation();
+ }
},
});
mainApp.config.errorHandler = alert;
diff --git a/examples/server/webui/src/styles.css b/examples/server/webui/src/styles.css
deleted file mode 100644
index 67d35b99e..000000000
--- a/examples/server/webui/src/styles.css
+++ /dev/null
@@ -1,26 +0,0 @@
-@tailwind base;
-@tailwind components;
-@tailwind utilities;
-
-.markdown {
- h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
- pre {
- @apply whitespace-pre-wrap rounded-lg p-2;
- border: 1px solid currentColor;
- }
- /* TODO: fix markdown table */
-}
-
-.show-on-hover {
- @apply md:opacity-0 md:group-hover:opacity-100;
-}
-.btn-mini {
- @apply cursor-pointer hover:shadow-md;
-}
-.chat-screen { max-width: 900px; }
-
-.chat-bubble-base-300 {
- --tw-bg-opacity: 1;
- --tw-text-opacity: 1;
- @apply bg-base-300 text-base-content;
-}
diff --git a/examples/server/webui/src/styles.scss b/examples/server/webui/src/styles.scss
new file mode 100644
index 000000000..34fe2aaf0
--- /dev/null
+++ b/examples/server/webui/src/styles.scss
@@ -0,0 +1,48 @@
+@use "sass:meta";
+
+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+
+.markdown {
+ h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
+ pre {
+ @apply whitespace-pre-wrap rounded-lg p-2;
+ border: 1px solid currentColor;
+ }
+ /* TODO: fix markdown table */
+}
+
+.show-on-hover {
+ @apply md:opacity-0 md:group-hover:opacity-100;
+}
+.btn-mini {
+ @apply cursor-pointer hover:shadow-md;
+}
+.chat-screen { max-width: 900px; }
+
+.chat-bubble-base-300 {
+ --tw-bg-opacity: 1;
+ --tw-text-opacity: 1;
+ @apply bg-base-300 text-base-content;
+}
+
+/* Highlight.js */
+[data-color-scheme='light'] {
+ @include meta.load-css('highlight.js/styles/stackoverflow-light');
+}
+[data-color-scheme='dark'] {
+ @include meta.load-css('highlight.js/styles/stackoverflow-dark');
+}
+[data-color-scheme='auto'] {
+ @media (prefers-color-scheme: light) {
+ @include meta.load-css('highlight.js/styles/stackoverflow-light');
+ }
+ @media (prefers-color-scheme: dark) {
+ @include meta.load-css('highlight.js/styles/stackoverflow-dark');
+ }
+}
+.hljs {
+ background: transparent !important;
+ padding: 0.5em !important;
+}
diff --git a/examples/server/webui/vite.config.js b/examples/server/webui/vite.config.js
index 789bf9cbb..6619a630d 100644
--- a/examples/server/webui/vite.config.js
+++ b/examples/server/webui/vite.config.js
@@ -2,6 +2,9 @@
import { viteSingleFile } from 'vite-plugin-singlefile';
import path from 'path';
import fs from 'fs';
+import zlib from 'zlib';
+
+const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
const GUIDE_FOR_FRONTEND = `
`.trim();
-export default {
- plugins: [
- viteSingleFile(),
- (function llamaCppPlugin() {
- let config;
- return {
- name: 'llamacpp:build',
- apply: 'build',
- async configResolved(_config) {
- config = _config;
- },
- writeBundle() {
- const outputIndexHtml = path.join(config.build.outDir, 'index.html');
- const content = fs.readFileSync(outputIndexHtml, 'utf-8');
+const BUILD_PLUGINS = [
+ viteSingleFile(),
+ (function llamaCppPlugin() {
+ let config;
+ return {
+ name: 'llamacpp:build',
+ apply: 'build',
+ async configResolved(_config) {
+ config = _config;
+ },
+ writeBundle() {
+ const outputIndexHtml = path.join(config.build.outDir, 'index.html');
+ const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
+ const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
- const targetOutputFile = path.join(config.build.outDir, '../../public/index.html');
- fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content);
+ // because the gzip header contains machine-specific info, we must remove this data from the header
+ // timestamp
+ compressed[0x4] = 0;
+ compressed[0x5] = 0;
+ compressed[0x6] = 0;
+ compressed[0x7] = 0;
+ // OS
+ compressed[0x9] = 0;
+
+ if (compressed.byteLength > MAX_BUNDLE_SIZE) {
+ throw new Error(
+ `Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
+ `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
+ );
}
+
+ const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
+ fs.writeFileSync(targetOutputFile, compressed);
}
- })(),
- ],
+ }
+ })(),
+];
+
+/** @type {import('vite').UserConfig} */
+export default {
+ plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS,
};
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index c91e93163..3442142ad 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -179,6 +179,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
+option(GGML_OPENCL "ggml: use OpenCL" OFF)
+option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
+option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
+option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
+
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
diff --git a/ggml/include/ggml-opencl.h b/ggml/include/ggml-opencl.h
new file mode 100644
index 000000000..6b6177135
--- /dev/null
+++ b/ggml/include/ggml-opencl.h
@@ -0,0 +1,26 @@
+#ifndef GGML_OPENCL_H
+#define GGML_OPENCL_H
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+//
+GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
+GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // GGML_OPENCL_H
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 386d5a15d..b0c1ac9ce 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -237,7 +237,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
-#define GGML_ROPE_TYPE_NEOX 2
+#define GGML_ROPE_TYPE_NEOX 2
+#define GGML_ROPE_TYPE_MROPE 8
+#define GGML_ROPE_TYPE_VISION 24
#define GGUF_MAGIC "GGUF"
@@ -1443,6 +1445,22 @@ extern "C" {
float beta_fast,
float beta_slow);
+ GGML_API struct ggml_tensor * ggml_rope_multi(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ int n_dims,
+ int sections[4],
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);
+
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 349f4c57f..bf5ee5fc2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -308,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
+ggml_add_backend(OpenCL)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 2b2240be8..8dc8226ac 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -534,7 +534,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
- return;
}
}
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index b2eded903..66927148a 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif
+#ifdef GGML_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
+#ifdef GGML_USE_OPENCL
+ register_backend(ggml_backend_opencl_reg());
+#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
@@ -539,6 +546,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
+ ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
}
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index fa04ab84f..d410c0244 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) {
return false;
}
+
+ const int mode = ((const int32_t *) op->op_params)[2];
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
+
return true;
}
case GGML_OP_UPSCALE: {
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 0e0556703..5d4732337 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -179,7 +179,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
- CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+ CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 92df6fdda..67e67a089 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -9133,6 +9133,64 @@ static void ggml_rope_cache_init(
}
}
+static void ggml_mrope_cache_init(
+ float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+ float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+ float * cache, float sin_sign, float theta_scale) {
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+ float theta_t = theta_base_t;
+ float theta_h = theta_base_h;
+ float theta_w = theta_base_w;
+ float theta_e = theta_base_e; // extra position id for vision encoder
+ int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+ int sec_w = sections[1] + sections[0];
+ int sec_e = sections[2] + sec_w;
+ GGML_ASSERT(sect_dims <= ne0);
+
+ for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+
+ int sector = (i0 / 2) % sect_dims;
+ if (indep_sects) {
+ // compute theta independently for each dim section
+ // (i.e. reset the corresponding theta when `i0` goes from one section to another)
+ if (sector == 0) {
+ theta_t = theta_base_t;
+ }
+ else if (sector == sections[0]) {
+ theta_h = theta_base_h;
+ }
+ else if (sector == sec_w) {
+ theta_w = theta_base_w;
+ }
+ else if (sector == sec_e) {
+ theta_e = theta_base_e;
+ }
+ }
+
+ float theta = theta_t;
+ if (sector >= sections[0] && sector < sec_w) {
+ theta = theta_h;
+ }
+ else if (sector >= sec_w && sector < sec_w + sections[2]) {
+ theta = theta_w;
+ }
+ else if (sector >= sec_w + sections[2]) {
+ theta = theta_e;
+ }
+
+ rope_yarn(
+ theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+ );
+ cache[i0 + 1] *= sin_sign;
+
+ theta_t *= theta_scale;
+ theta_w *= theta_scale;
+ theta_h *= theta_scale;
+ theta_e *= theta_scale;
+ }
+}
+
static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
@@ -9143,6 +9201,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src2 = dst->src[2];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ int sections[4];
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -9156,6 +9215,7 @@ static void ggml_compute_forward_rope_f32(
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+ memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
GGML_TENSOR_UNARY_OP_LOCALS
@@ -9188,6 +9248,16 @@ static void ggml_compute_forward_rope_f32(
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+ if (is_mrope) {
+ GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+ }
+
+ if (is_vision) {
+ GGML_ASSERT(n_dims == ne0/2);
+ }
const float * freq_factors = NULL;
if (src2 != NULL) {
@@ -9203,18 +9273,63 @@ static void ggml_compute_forward_rope_f32(
const int32_t * pos = (const int32_t *) src1->data;
- for (int64_t i3 = 0; i3 < ne3; i3++) {
- for (int64_t i2 = 0; i2 < ne2; i2++) {
- const int64_t p = pos[i2];
+ for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
+ for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
- ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ if (!is_mrope) {
+ const int64_t p = pos[i2];
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ }
+ else {
+ const int64_t p_t = pos[i2];
+ const int64_t p_h = pos[i2 + ne2];
+ const int64_t p_w = pos[i2 + ne2 * 2];
+ const int64_t p_e = pos[i2 + ne2 * 3];
+ ggml_mrope_cache_init(
+ p_t, p_h, p_w, p_e, sections, is_vision,
+ freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ }
- for (int64_t i1 = 0; i1 < ne1; i1++) {
+ for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
if (ir++ < ir0) continue;
if (ir > ir1) break;
- if (!is_neox) {
+ if (is_neox || is_mrope) {
+ if (is_vision){
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;
+
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];
+
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+ const float x0 = src[0];
+ const float x1 = src[n_dims];
+
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
+ }
+ } else {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;
+
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];
+
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+ const float x0 = src[0];
+ const float x1 = src[n_dims/2];
+
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ }
+ }
+ } else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
@@ -9228,8 +9343,10 @@ static void ggml_compute_forward_rope_f32(
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[1] = x0*sin_theta + x1*cos_theta;
}
- } else {
- for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ }
+
+ if (is_vision) {
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
@@ -9239,19 +9356,20 @@ static void ggml_compute_forward_rope_f32(
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
- const float x1 = src[n_dims/2];
+ const float x1 = src[n_dims];
- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
}
- }
+ } else {
+ // fill the remaining channels with data from the src tensor
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
- for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- dst_data[0] = src[0];
- dst_data[1] = src[1];
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
+ }
}
}
}
@@ -9269,6 +9387,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src2 = dst->src[2];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ int sections[4];
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -9281,6 +9400,8 @@ static void ggml_compute_forward_rope_f16(
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+ memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
+
GGML_TENSOR_UNARY_OP_LOCALS
@@ -9313,6 +9434,16 @@ static void ggml_compute_forward_rope_f16(
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+ if (is_mrope) {
+ GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+ }
+
+ if (is_vision) {
+ GGML_ASSERT(n_dims == ne0/2);
+ }
const float * freq_factors = NULL;
if (src2 != NULL) {
@@ -9330,16 +9461,61 @@ static void ggml_compute_forward_rope_f16(
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
- const int64_t p = pos[i2];
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
- ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ if (!is_mrope) {
+ const int64_t p = pos[i2];
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ }
+ else {
+ const int64_t p_t = pos[i2];
+ const int64_t p_h = pos[i2 + ne2];
+ const int64_t p_w = pos[i2 + ne2 * 2];
+ const int64_t p_e = pos[i2 + ne2 * 3];
+ ggml_mrope_cache_init(
+ p_t, p_h, p_w, p_e, sections, is_vision,
+ freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ }
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
- if (!is_neox) {
+ if (is_neox || is_mrope) {
+ if (is_vision) {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;
+
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];
+
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
+
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ }
+ } else {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;
+
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];
+
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ }
+ }
+ } else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
@@ -9353,8 +9529,10 @@ static void ggml_compute_forward_rope_f16(
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
- } else {
- for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ }
+
+ if (is_vision) {
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
@@ -9364,19 +9542,19 @@ static void ggml_compute_forward_rope_f16(
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
- }
+ } else {
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
- for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- dst_data[0] = src[0];
- dst_data[1] = src[1];
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
+ }
}
}
}
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index c390957af..0b6419f83 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -394,8 +394,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
switch (op->op) {
case GGML_OP_CPY:
return
+ op->type != GGML_TYPE_IQ3_XXS &&
+ op->type != GGML_TYPE_IQ3_S &&
op->type != GGML_TYPE_IQ2_XXS &&
op->type != GGML_TYPE_IQ2_XS &&
+ op->type != GGML_TYPE_IQ2_S &&
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 88f586d68..2c84778d2 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -4,6 +4,11 @@ struct rope_corr_dims {
float v[2];
};
+
+struct mrope_sections {
+ int v[4];
+};
+
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
@@ -108,6 +113,105 @@ static __global__ void rope_neox(
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
+template<typename T, bool has_ff>
+static __global__ void rope_multi(
+ const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+ const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+ if (i0 >= ne0) {
+ return;
+ }
+
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i0 >= n_dims) {
+ const int i = row*ne0 + i0;
+
+ dst[i + 0] = x[i + 0];
+ dst[i + 1] = x[i + 1];
+
+ return;
+ }
+
+ const int i = row*ne0 + i0/2;
+ const int i2 = row/p_delta_rows;
+
+ int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+ int sec_w = sections.v[1] + sections.v[0];
+ int sector = (i0 / 2) % sect_dims;
+
+ float theta_base = 0.0;
+ if (sector < sections.v[0]) {
+ theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+ }
+ else if (sector >= sections.v[0] && sector < sec_w) {
+ theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
+ }
+ else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+ theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
+ }
+ else if (sector >= sec_w + sections.v[2]) {
+ theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
+ }
+
+ const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+ float cos_theta;
+ float sin_theta;
+
+ rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + n_dims/2];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff>
+static __global__ void rope_vision(
+ const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+ const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+ if (i0 >= ne0) {
+ return;
+ }
+
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+ const int i = row*ne0 + i0/2;
+ const int i2 = row/p_delta_rows; // i2-th token
+
+ int sect_dims = sections.v[0] + sections.v[1];
+ int sec_w = sections.v[1] + sections.v[0];
+ int sector = (i0 / 2) % sect_dims;
+
+ float theta_base = 0.0;
+ if (sector < sections.v[0]) {
+ const int p = sector;
+ theta_base = pos[i2]*powf(theta_scale, p);
+ }
+ else if (sector >= sections.v[0] && sector < sec_w) {
+ const int p = sector - sections.v[0];
+ theta_base = pos[i2 + ne2]*powf(theta_scale, p);
+ }
+
+ const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+ float cos_theta;
+ float sin_theta;
+
+ rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + n_dims];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
+}
+
template<typename T>
static void rope_norm_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
@@ -156,6 +260,56 @@ static void rope_neox_cuda(
}
}
+template<typename T>
+static void rope_multi_cuda(
+ const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+ GGML_ASSERT(ne0 % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+ const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(nr, n_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+ if (freq_factors == nullptr) {
+ rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, freq_factors, sections
+ );
+ } else {
+ rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, freq_factors, sections
+ );
+ }
+}
+
+template<typename T>
+static void rope_vision_cuda(
+ const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+ GGML_ASSERT(ne0 % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+ const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(nr, n_blocks_x, 1);
+ // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
+ // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+ if (freq_factors == nullptr) {
+ rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, freq_factors, sections
+ );
+ } else {
+ rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, freq_factors, sections
+ );
+ }
+}
+
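To make the grid decomposition comment above concrete, here is a worked example of the launch geometry, assuming CUDA_ROPE_BLOCK_SIZE == 256 (the value used by the other rope launchers in this file) and placeholder tensor sizes:

```cpp
// Worked example of the launch geometry computed in rope_vision_cuda:
//   ne0 (head_dim)    = 128
//   nr  (heads * seq) = 32 * 512 = 16384
//   one thread rotates one (x0, x1) pair, so one block covers 2*256 = 512 elements of ne0
//   n_blocks_x        = (128 + 2*256 - 1) / (2*256) = 1
//   block_nums (grid)  = dim3(16384, 1, 1)
//   block_dims (block) = dim3(1, 256, 1)
```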
static void rope_norm_cuda_f16(
const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
@@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
rope_neox_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
+static void rope_multi_cuda_f16(
+ const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+ rope_multi_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_multi_cuda_f32(
+ const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+ rope_multi_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f16(
+ const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+ rope_vision_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f32(
+ const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+
+ rope_vision_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
@@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type);
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
+ const int64_t ne00 = src0->ne[0]; // head dims
+ const int64_t ne01 = src0->ne[1]; // num heads
+ const int64_t ne02 = src0->ne[2]; // num tokens
const int64_t nr = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
@@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int mode = ((int32_t *) dst->op_params)[2];
//const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+ mrope_sections sections;
// RoPE alteration for extended context
float freq_base;
@@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+ memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+ if (is_mrope) {
+ GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+ }
+
+ if (is_vision) {
+ GGML_ASSERT(n_dims == ne00/2);
+ }
const int32_t * pos = (const int32_t *) src1_d;
@@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
} else {
GGML_ABORT("fatal error");
}
+ } else if (is_mrope && !is_vision) {
+ if (src0->type == GGML_TYPE_F32) {
+ rope_multi_cuda_f32(
+ (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, freq_factors, sections, stream
+ );
+ } else if (src0->type == GGML_TYPE_F16) {
+ rope_multi_cuda_f16(
+ (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, freq_factors, sections, stream
+ );
+ } else {
+ GGML_ABORT("fatal error");
+ }
+ } else if (is_vision) {
+ if (src0->type == GGML_TYPE_F32) {
+ rope_vision_cuda_f32(
+ (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, freq_factors, sections, stream
+ );
+ } else if (src0->type == GGML_TYPE_F16) {
+ rope_vision_cuda_f16(
+ (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, freq_factors, sections, stream
+ );
+ } else {
+ GGML_ABORT("fatal error");
+ }
} else {
if (src0->type == GGML_TYPE_F32) {
rope_norm_cuda_f32(
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index f961134ed..549772c57 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -551,6 +551,22 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+// expose GGUF internals for test code
+
+GGML_API size_t gguf_type_size(enum gguf_type type);
+
+GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+
+struct gguf_buf {
+ void * data;
+ size_t size;
+ size_t offset;
+};
+GGML_API struct gguf_buf gguf_buf_init(size_t size);
+GGML_API void gguf_buf_free(struct gguf_buf buf);
+
+GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
+
#ifdef __cplusplus
}
#endif
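A minimal sketch of how test code could use the GGUF helpers exposed above (assumptions: a valid `gguf_context` obtained through the public gguf API, `ggml-impl.h` on the include path, `dump_gguf_meta` is a hypothetical helper; error handling omitted):

```cpp
#include <cstdio>
#include "ggml-impl.h"

static void dump_gguf_meta(const struct gguf_context * ctx) {
    struct gguf_buf buf = gguf_buf_init(16 * 1024);        // start with a 16 KiB buffer
    gguf_write_to_buf(ctx, &buf, /*only_meta =*/ true);    // serialize metadata only
    fprintf(stderr, "gguf meta: %zu bytes\n", buf.offset); // offset is the write position
    gguf_buf_free(buf);
}
```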
diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
index 28ceecfc4..505792271 100644
--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM:
case GGML_OP_NORM:
- case GGML_OP_ROPE:
return true;
+ case GGML_OP_ROPE:
+ {
+ const int mode = ((const int32_t *) op->op_params)[2];
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
+ return true;
+ }
case GGML_OP_DUP:
case GGML_OP_CPY:
case GGML_OP_CONT:
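The check added to the Kompute backend above (the Metal backend below gets the same guard) rejects the new rope variants this backend has no kernels for. A small sketch of the same guard factored into a predicate (the function name is hypothetical):

```cpp
#include "ggml.h"

static bool rope_mode_supported(const struct ggml_tensor * op) {
    const int mode = ((const int32_t *) op->op_params)[2];
    // At the time of writing, GGML_ROPE_TYPE_VISION also sets the M-RoPE bit,
    // so the first check alone would already reject vision rope; both checks
    // are kept to match the backend code.
    return (mode & GGML_ROPE_TYPE_MROPE)  == 0 &&
           (mode & GGML_ROPE_TYPE_VISION) == 0;
}
```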
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 34fe5778e..28f590f92 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
case GGML_OP_ARGMAX:
case GGML_OP_NORM:
- case GGML_OP_ROPE:
return true;
+ case GGML_OP_ROPE:
+ {
+ const int mode = ((const int32_t *) op->op_params)[2];
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
+ return true;
+ }
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D:
@@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
} break;
case GGML_OP_ROPE:
{
- GGML_ASSERT(ne10 == ne02);
+ // make sure we have one or more position ids (ne10) per token (ne02)
+ GGML_ASSERT(ne10 % ne02 == 0);
+ GGML_ASSERT(ne10 >= ne02);
const int nth = MIN(1024, ne00);
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
new file mode 100644
index 000000000..45328a657
--- /dev/null
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -0,0 +1,147 @@
+find_package(OpenCL REQUIRED)
+find_package(Python3 REQUIRED)
+
+set(TARGET_NAME ggml-opencl)
+
+ggml_add_backend_library(${TARGET_NAME}
+ ggml-opencl.cpp
+ ../../include/ggml-opencl.h)
+target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
+target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})
+
+if (GGML_OPENCL_PROFILING)
+ message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
+ add_compile_definitions(GGML_OPENCL_PROFILING)
+endif ()
+
+add_compile_definitions(GGML_OPENCL_SOA_Q)
+
+if (GGML_OPENCL_USE_ADRENO_KERNELS)
+ message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
+ add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
+endif ()
+
+if (GGML_OPENCL_EMBED_KERNELS)
+ add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
+
+ set(OPENCL_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
+ set(OPENCL_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
+ set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
+
+ set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
+ set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
+ set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
+ set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
+ set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
+ set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
+
+ set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
+ file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+
+ include_directories("${CMAKE_BINARY_DIR}/autogenerated")
+
+ # Python must be accessible from command line
+ add_custom_command(
+ OUTPUT ${OPENCL_CL_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
+ ${OPENCL_CL_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
+ ${OPENCL_MM_CL_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_mm.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
+ ${OPENCL_CVT_CL_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_cvt.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
+ ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
+ ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
+ ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
+ ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_transpose_16.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
+ ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_transpose_32.cl.h"
+ )
+
+ add_custom_command(
+ OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
+ COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+ ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
+ ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
+ DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
+ COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
+ )
+
+ target_sources(${TARGET_NAME} PRIVATE
+ ${OPENCL_CL_SOURCE_EMBED}
+ ${OPENCL_MM_CL_SOURCE_EMBED}
+ ${OPENCL_CVT_CL_SOURCE_EMBED}
+ ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+ ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+ ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+ ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+ ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+ ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
+else ()
+ # copy ggml-opencl.cl to bin directory
+ configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
+
+ configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
+ configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
+endif ()
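When GGML_OPENCL_EMBED_KERNELS is on, each generated *.cl.h header is spliced into a braced `std::string` initializer in ggml-opencl.cpp (see the `#include "ggml-opencl.cl.h"` inside braces further down). A hypothetical, self-contained sketch of that mechanism; the assumption here is that kernels/embed_kernel.py emits the kernel source as a single C++ string literal:

```cpp
#include <iostream>
#include <string>

int main() {
    const std::string kernel_src {
        // in ggml-opencl.cpp this line is `#include "ggml-opencl.cl.h"`
        R"(__kernel void kernel_add(__global const float * a) { /* ... */ })"
    };
    std::cout << kernel_src.size() << " bytes of embedded kernel source\n";
    return 0;
}
```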
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
new file mode 100644
index 000000000..c77d629f0
--- /dev/null
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -0,0 +1,4004 @@
+#define CL_TARGET_OPENCL_VERSION 220
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+
+// suppress warnings in CL headers for GCC and Clang
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#ifdef __clang__
+#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
+#endif
+
+#include "ggml-opencl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml.h"
+
+#include <CL/cl.h>
+
+#include <string.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <atomic>
+#include <fstream>
+#include <limits>
+#include <vector>
+#include <string>
+#include <cmath>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define UNUSED(x) (void)(x)
+
+#define CL_CHECK(err) \
+ do { \
+ cl_int err_ = (err); \
+ if (err_ != CL_SUCCESS) { \
+ GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n", \
+ #err, err_, __FILE__, __LINE__); \
+ GGML_ASSERT(0); \
+ } \
+ } while (0)
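A short usage note on the macro above, since the call sites later in this file rely on the comma operator for APIs that report errors through an out-parameter:

```cpp
// Illustration only: CL_CHECK evaluates its argument once and asserts on any
// non-CL_SUCCESS value. For creation APIs the checked expression is the
// trailing `err`, e.g.:
//
//   CL_CHECK((kernel = clCreateKernel(program, "kernel_add", &err), err));
//
// i.e. the kernel is created first, then `err` is what CL_CHECK inspects.
```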
+
+//------------------------------------------------------------------------------
+// OpenCL
+//------------------------------------------------------------------------------
+
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
+
+enum GPU_FAMILY {
+ ADRENO,
+ INTEL,
+ UNKNOWN,
+};
+
+enum ADRENO_GPU_GEN {
+ ADRENO_UNKNOWN,
+ A7X,
+ A8X,
+ X1E,
+};
+
+static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
+ if (strstr(device_name, "730") ||
+ strstr(device_name, "740") ||
+ strstr(device_name, "750")) {
+ return ADRENO_GPU_GEN::A7X;
+ }
+
+ if (strstr(device_name, "830")) {
+ return ADRENO_GPU_GEN::A8X;
+ }
+
+ if (strstr(device_name, "X1")) {
+ return ADRENO_GPU_GEN::X1E;
+ }
+
+ return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
+}
+
+static int get_adreno_cl_compiler_version(const char *driver_version) {
+ std::string driver_ver_str(driver_version);
+ size_t compiler_ver_pos = driver_ver_str.find("E031");
+ size_t compiler_ver_len = 13;
+ size_t compiler_ver_offset = 5;
+
+ if (compiler_ver_pos == std::string::npos) {
+ compiler_ver_pos = driver_ver_str.find("DX");
+ if (compiler_ver_pos == std::string::npos) {
+ return -1;
+ }
+ compiler_ver_len = 11;
+ compiler_ver_offset = 3;
+ }
+
+ std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
+ std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2);
+ return std::atoi(major_ver_str.c_str());
+}
+
+// backend device context
+struct ggml_backend_opencl_device_context {
+ cl_platform_id platform;
+ std::string platform_name;
+
+ cl_device_id device;
+ std::string device_name;
+};
+
+// backend context
+struct ggml_backend_opencl_context {
+ cl_device_id device;
+ std::string device_name;
+
+ std::string driver_version;
+
+ GPU_FAMILY gpu_family;
+ ADRENO_GPU_GEN adreno_gen;
+
+ cl_int alignment;
+ size_t max_alloc_size;
+ bool fp16_support;
+
+ int adreno_wave_size;
+
+ cl_context context;
+ cl_command_queue queue;
+
+ cl_program program;
+ cl_program program_1;
+ cl_program program_2;
+
+ cl_kernel kernel_add, kernel_add_row;
+ cl_kernel kernel_mul, kernel_mul_row;
+ cl_kernel kernel_scale;
+ cl_kernel kernel_silu, kernel_silu_4;
+ cl_kernel kernel_gelu, kernel_gelu_4;
+ cl_kernel kernel_relu;
+ cl_kernel kernel_clamp;
+ cl_kernel kernel_norm;
+ cl_kernel kernel_rms_norm;
+ cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
+ cl_kernel kernel_soft_max, kernel_soft_max_4;
+ cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
+ cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
+ cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
+ cl_kernel kernel_mul_mat_f32_f32;
+ cl_kernel kernel_mul_mat_f16_f16;
+ cl_kernel kernel_mul_mat_f16_f32_1row;
+ cl_kernel kernel_mul_mat_f16_f32;
+ cl_kernel kernel_mul_mat_f16_f32_l4;
+ cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
+ cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
+ cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
+ cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0,
+ kernel_mul_mat_q4_0_f32_flat_img_v0;
+ cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
+ cl_kernel kernel_mul_mv_q6_K_f32;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ // Transpose kernels
+ cl_program program_transpose_32;
+ cl_program program_transpose_32_16;
+ cl_program program_transpose_16;
+ cl_kernel kernel_transpose_32;
+ cl_kernel kernel_transpose_32_16;
+ cl_kernel kernel_transpose_16;
+
+ cl_mem A_s_d_max; // max scale buffer size for transpose
+ cl_mem A_q_d_max; // max weight buffer size for transpose
+ cl_mem B_d_max; // max activation buffer size for transpose
+
+ // Gemm and Gemv related programs, kernels, etc
+ cl_program program_CL_gemm;
+ cl_program program_CL_gemv_general;
+ cl_program program_CL_gemv_4096_1_11008;
+ cl_program program_CL_gemv_4096_1_4096;
+ cl_program program_CL_gemv_11008_1_4096;
+ cl_program program_CL_gemv_32000_1_4096;
+ cl_kernel CL_mul_mat_Ab_Bi_8x4;
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+};
+
+static ggml_backend_device g_ggml_backend_opencl_device;
+static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
+ /*.platform =*/ nullptr,
+ /*.platform_name =*/ "",
+ /*.device =*/ nullptr,
+ /*.device_name =*/ "",
+};
+
+static int ggml_backend_opencl_n_devices = 0;
+
+// Profiling
+#ifdef GGML_OPENCL_PROFILING
+struct ProfilingInfo {
+ std::string op_name;
+ std::string kernel_name;
+ // Kernel execution time in nanoseconds.
+ cl_ulong duration_ns;
+ // Global and local work sizes.
+ size_t global_size[3];
+ size_t local_size[3];
+ // Op output size.
+ size_t output_size[4];
+};
+
+std::vector<ProfilingInfo> g_profiling_info;
+#endif
+
+inline std::string read_file(const std::string &path) {
+ std::ifstream ifs(path);
+ if (!ifs) {
+ return "";
+ }
+ std::string text;
+ ifs.seekg(0, std::ios::end);
+ text.resize(ifs.tellg());
+ ifs.seekg(0, std::ios::beg);
+ ifs.read(&text[0], text.size());
+ return text;
+}
+
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
+ cl_program p;
+ char *program_log;
+ size_t program_size;
+ size_t log_size;
+ int err;
+
+ program_size = strlen(program_buffer);
+
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+ if(err < 0) {
+ GGML_LOG_ERROR("OpenCL error creating program");
+ exit(1);
+ }
+
+ err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
+ if(err < 0) {
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+ program_log = (char*) malloc(log_size + 1);
+ program_log[log_size] = '\0';
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+ GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log);
+ free(program_log);
+ exit(1);
+ }
+
+ return p;
+}
+
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+ static bool initialized = false;
+ static ggml_backend_opencl_context *backend_ctx = nullptr;
+
+ if (initialized) {
+ return backend_ctx;
+ }
+
+ ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
+ GGML_ASSERT(dev_ctx);
+ GGML_ASSERT(dev_ctx->platform == nullptr);
+ GGML_ASSERT(dev_ctx->device == nullptr);
+ GGML_ASSERT(backend_ctx == nullptr);
+
+ initialized = true;
+ backend_ctx = new ggml_backend_opencl_context();
+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+
+ cl_int err;
+
+#ifdef GGML_OPENCL_PROFILING
+ GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
+#endif
+
+ struct cl_device;
+ struct cl_platform {
+ cl_platform_id id;
+ unsigned number;
+ char name[128];
+ char vendor[128];
+ struct cl_device * devices;
+ unsigned n_devices;
+ struct cl_device * default_device;
+ };
+
+ struct cl_device {
+ struct cl_platform * platform;
+ cl_device_id id;
+ unsigned number;
+ cl_device_type type;
+ char name[128];
+ };
+
+ enum { NPLAT = 16, NDEV = 16 };
+
+ struct cl_platform platforms[NPLAT];
+ unsigned n_platforms = 0;
+ struct cl_device devices[NDEV];
+ unsigned n_devices = 0;
+ struct cl_device * default_device = NULL;
+
+ cl_platform_id platform_ids[NPLAT];
+ if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
+ GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
+ return backend_ctx;
+ }
+
+ for (unsigned i = 0; i < n_platforms; i++) {
+ struct cl_platform * p = &platforms[i];
+ p->number = i;
+ p->id = platform_ids[i];
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+
+ cl_device_id device_ids[NDEV];
+ cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+ if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+ p->n_devices = 0;
+ } else {
+ CL_CHECK(clGetDeviceIDsError);
+ }
+ p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+ p->default_device = NULL;
+
+ for (unsigned j = 0; j < p->n_devices; j++) {
+ struct cl_device * d = &devices[n_devices];
+ d->number = n_devices++;
+ d->id = device_ids[j];
+ d->platform = p;
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+
+ if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+ p->default_device = d;
+ }
+ }
+
+ if (default_device == NULL && p->default_device != NULL) {
+ default_device = p->default_device;
+ }
+ }
+
+ if (n_devices == 0) {
+ GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
+ return backend_ctx;
+ }
+
+ char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+ char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+ int user_platform_number = -1;
+ int user_device_number = -1;
+
+ unsigned n;
+ if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+ user_platform_number = (int)n;
+ }
+ if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+ user_device_number = (int)n;
+ }
+ if (user_platform_number != -1 && user_device_number != -1) {
+ cl_platform* platform = &platforms[user_platform_number];
+ if ((unsigned)user_device_number >= platform->n_devices) {
+ GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
+ exit(1);
+ }
+ default_device = &platform->devices[user_device_number];
+ } else {
+
+ struct cl_device * selected_devices = devices;
+ unsigned n_selected_devices = n_devices;
+
+ if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+ for (unsigned i = 0; i < n_platforms; i++) {
+ struct cl_platform * p = &platforms[i];
+ if (strstr(p->name, user_platform_string) != NULL ||
+ strstr(p->vendor, user_platform_string) != NULL) {
+ user_platform_number = (int)i;
+ break;
+ }
+ }
+ if (user_platform_number == -1) {
+ GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+ exit(1);
+ }
+ }
+ if (user_platform_number != -1) {
+ struct cl_platform * p = &platforms[user_platform_number];
+ selected_devices = p->devices;
+ n_selected_devices = p->n_devices;
+ default_device = p->default_device;
+ if (n_selected_devices == 0) {
+ GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+ exit(1);
+ }
+ }
+
+ if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+ for (unsigned i = 0; i < n_selected_devices; i++) {
+ struct cl_device * d = &selected_devices[i];
+ if (strstr(d->name, user_device_string) != NULL) {
+ user_device_number = d->number;
+ break;
+ }
+ }
+ if (user_device_number == -1) {
+ GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+ exit(1);
+ }
+ }
+ if (user_device_number != -1) {
+ selected_devices = &devices[user_device_number];
+ n_selected_devices = 1;
+ default_device = &selected_devices[0];
+ }
+
+ GGML_ASSERT(n_selected_devices > 0);
+
+ if (default_device == NULL) {
+ default_device = &selected_devices[0];
+ }
+ }
+
+ GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+ GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
+ if (default_device->type != CL_DEVICE_TYPE_GPU) {
+ GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+ }
+
+ dev_ctx->platform = default_device->platform->id;
+ dev_ctx->device = default_device->id;
+ backend_ctx->device = default_device->id;
+
+ if (strstr(default_device->name, "Adreno")) {
+ backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
+ backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+
+ // Default wave size is 128, A8x uses 64.
+ if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
+ backend_ctx->adreno_wave_size = 64;
+ } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
+ backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
+ backend_ctx->adreno_wave_size = 128;
+ } else {
+ backend_ctx->adreno_wave_size = 128;
+ GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
+ "using wave size %d, "
+ "may not work as expected\n",
+ default_device->name, backend_ctx->adreno_wave_size);
+ }
+ } else if (strstr(default_device->name, "Intel")) {
+ backend_ctx->gpu_family = GPU_FAMILY::INTEL;
+ } else {
+ GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+ return backend_ctx;
+ }
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
+ GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
+ "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
+ return backend_ctx;
+ }
+#endif
+
+ // Populate backend device name
+ dev_ctx->platform_name = default_device->platform->name;
+ dev_ctx->device_name = default_device->name;
+ backend_ctx->device_name = default_device->name;
+
+ // A local ref of cl_device_id for convenience
+ cl_device_id device = backend_ctx->device;
+
+ // Check device OpenCL version, OpenCL 2.0 or above is required
+ size_t device_ver_str_size;
+ clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
+ char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
+ clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
+ device_ver_buffer[device_ver_str_size] = '\0';
+ GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
+
+ if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
+ strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+ GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
+ return backend_ctx;
+ }
+
+ // Check driver version
+ size_t driver_version_str_size;
+ clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size);
+ char *driver_version = (char *)alloca(driver_version_str_size + 1);
+ clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
+ driver_version[driver_version_str_size] = '\0';
+ GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
+ backend_ctx->driver_version = driver_version;
+
+ int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
+ bool has_vector_subgroup_broadcast =
+ adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17;
+ GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
+ has_vector_subgroup_broadcast ? "true" : "false");
+
+ size_t ext_str_size;
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+ char *ext_buffer = (char *)alloca(ext_str_size + 1);
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+ ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+ // Check if ext_buffer contains cl_khr_fp16
+ backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+ GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+
+ // fp16 is required
+ if (!backend_ctx->fp16_support) {
+ GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
+ return backend_ctx;
+ }
+
+ // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
+ // optional in OpenCL 3.0 (cl_khr_subgroups is mandatory in OpenCL 2.x)
+ if (strstr(device_ver_buffer, "OpenCL 3") &&
+ strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+ strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
+ GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
+ "(note that subgroups is an optional feature in OpenCL 3.0)\n");
+ return backend_ctx;
+ }
+
+ CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
+ GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
+
+ clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
+ GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
+
+ // Check SVM.
+ cl_device_svm_capabilities svm_caps;
+ CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
+ GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
+ svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
+ GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
+ svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
+ GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
+ svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
+ GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
+ svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+
+ // Print out configurations
+#ifdef GGML_OPENCL_SOA_Q
+ GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
+#endif // GGML_OPENCL_SOA_Q
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+ cl_context_properties properties[] = {
+ (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
+ };
+
+ CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+
+ // A local ref of cl_context for convenience
+ cl_context context = backend_ctx->context;
+
+ //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+ // (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
+ // (queue = clCreateCommandQueue(context, device, 0, &err), err)
+ //)));
+ cl_command_queue_properties command_queue_props = 0;
+#ifdef GGML_OPENCL_PROFILING
+ command_queue_props |= CL_QUEUE_PROFILING_ENABLE;
+#endif
+ CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "ggml-opencl.cl.h"
+ };
+#else
+ const std::string kernel_src = read_file("ggml-opencl.cl");
+#endif
+
+ std::string compile_opts =
+ "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
+ "-cl-finite-math-only -cl-fast-relaxed-math ";
+ backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
+
+ // Non matmul kernels.
+ CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
+ CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
+ CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
+ CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
+ CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
+ CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
+ CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
+ CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
+ CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
+ CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
+ CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
+ CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+ CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
+
+ // Matmul kernels.
+ CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
+
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
+
+ // Load additional mulmat kernels.
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_1 {
+ #include "ggml-opencl_mm.cl.h"
+ };
+#else
+ const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
+#endif
+ backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
+
+ // Load additional data conversion kernels.
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_2 {
+ #include "ggml-opencl_cvt.cl.h"
+ };
+#else
+ const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
+#endif
+ backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
+
+ // Kernels for Adreno
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string transpose_32_src {
+ #include "ggml-opencl_transpose_32.cl.h"
+ };
+#else
+ const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl");
+#endif
+ backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string transpose_32_16_src {
+ #include "ggml-opencl_transpose_32_16.cl.h"
+ };
+#else
+ const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
+#endif
+ backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string transpose_16_src {
+ #include "ggml-opencl_transpose_16.cl.h"
+ };
+#else
+ const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
+#endif
+ backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
+
+ // Gemv general
+ std::string CL_gemv_compile_opts =
+ " -cl-std=CL2.0 "
+ " -cl-mad-enable "
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+ if (has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_CL_gemv_general {
+ #include "ggml-opencl_gemv_noshuffle_general.cl.h"
+ };
+#else
+ const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
+#endif
+
+ backend_ctx->program_CL_gemv_general = build_program_from_source(
+ context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
+
+ // Gemv 2048, 16384
+ CL_gemv_compile_opts =
+ " -cl-std=CL2.0 "
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=2048 "
+ " -DBLOCK_STRIDE_A=16384 "
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+ if (has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_CL_gemv {
+ #include "ggml-opencl_gemv_noshuffle.cl.h"
+ };
+#else
+ const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
+#endif
+
+ backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
+ context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+ // Gemv 2048, 16384
+ CL_gemv_compile_opts =
+ " -cl-std=CL2.0 "
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=2048 "
+ " -DBLOCK_STRIDE_A=16384 "
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+ if (has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
+ context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
+
+ // Gemv 5504, 44032
+ CL_gemv_compile_opts =
+ " -cl-std=CL2.0 "
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=5504 "
+ " -DBLOCK_STRIDE_A=44032 "
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+ if (has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
+ context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+ // Gemv 16000, 128000
+ CL_gemv_compile_opts =
+ " -cl-std=CL2.0 "
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=16000 "
+ " -DBLOCK_STRIDE_A=128000 "
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+ if (has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+ // Gemm
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_CL_gemm {
+ #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
+ };
+#else
+ const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
+#endif
+ backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
+
+ // Allocate intermediate buffers and images
+ size_t max_A_q_d_bytes = 311164928;
+ size_t max_A_s_d_bytes = 38895616;
+ size_t max_B_d_bytes = 45088768;
+
+ CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
+ CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
+ CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+ // For now we support a single device
+ ggml_backend_opencl_n_devices = 1;
+
+ return backend_ctx;
+}
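The selection logic above honors the GGML_OPENCL_PLATFORM and GGML_OPENCL_DEVICE environment variables, each of which may be a numeric index or a name substring. A minimal sketch of pinning the backend before initialization (assumes a POSIX environment for setenv; the values are placeholders):

```cpp
#include <cstdlib>

static void pin_opencl_device() {
    setenv("GGML_OPENCL_PLATFORM", "0",      /*overwrite =*/ 1); // platform index ...
    setenv("GGML_OPENCL_DEVICE",   "Adreno", /*overwrite =*/ 1); // ... or a name substring
}
```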
+
+static void ggml_cl2_free(void) {
+#ifdef GGML_OPENCL_PROFILING
+ FILE * fperf = fopen("cl_profiling.csv", "w");
+ if (!fperf) {
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+ return;
+ }
+
+ float total_kernel_time = 0;
+ fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
+ for (const ProfilingInfo & info : g_profiling_info) {
+ total_kernel_time += info.duration_ns/1.e6f;
+ fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+ info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
+ info.global_size[0], info.global_size[1], info.global_size[2],
+ info.local_size[0], info.local_size[1], info.local_size[2],
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+ }
+ fclose(fperf);
+
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Tensor extra management
+//------------------------------------------------------------------------------
+struct ggml_tensor_extra_cl {
+ // The buffer object that holds the data.
+ cl_mem data_device;
+ // The offset into the buffer object. This is primarily for scratch buffer
+ // and view operation.
+ // NB: this offset no longer includes view offset (view_offs). Whenever this
+ // offset is used, view_offs should be considered.
+ cl_ulong offset;
+ // The actual size of the cl_mem object. This is needed when returning the
+ // block to the pool.
+ size_t actual_size;
+
+ void reset() {
+ data_device = nullptr;
+ offset = 0;
+ actual_size = 0;
+ }
+};
+
+// Additional tensor extra structs for quantized tensors.
+// These tensors are loaded from files and should not be allocated in scratch --
+// they should always be allocated from the pool. Hence, they do not have an
+// `offset`, which would indicate their location in the scratch buffer.
+struct ggml_tensor_extra_cl_q4_0 {
+ // Quantized values.
+ cl_mem q = nullptr;
+ // Quantized values in image1d_buffer_t.
+ cl_mem q_img = nullptr;
+ // Scales.
+ cl_mem d = nullptr;
+ // Scales in image1d_buffer_t.
+ cl_mem d_img = nullptr;
+ // Size of quantized values.
+ size_t size_q = 0;
+ // Size of scales.
+ size_t size_d = 0;
+
+ ~ggml_tensor_extra_cl_q4_0() {
+ reset();
+ }
+
+ void reset() {
+ // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+ // They must be properly released so that the original buffer can be
+ // properly released to avoid memory leak.
+ if (q != nullptr) {
+ CL_CHECK(clReleaseMemObject(q));
+ q = nullptr;
+ }
+ if (d != nullptr) {
+ CL_CHECK(clReleaseMemObject(d));
+ d = nullptr;
+ }
+ // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
+ // enabled. They point to the images in ggml_backend_opencl_buffer_context.
+ // So, there is no need to release them here.
+ // TODO: initialize them for non SMALL_PATH path, or remove them.
+ q_img = nullptr;
+ d_img = nullptr;
+ size_q = 0;
+ size_d = 0;
+ }
+};
+
+//------------------------------------------------------------------------------
+// Backend API
+//------------------------------------------------------------------------------
+
+//
+// backend
+//
+static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
+ return "OpenCL";
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_opencl_free(ggml_backend_t backend) {
+ ggml_cl2_free();
+
+ GGML_UNUSED(backend);
+}
+
+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_UNUSED(backend);
+ GGML_UNUSED(tensor);
+ GGML_UNUSED(data);
+ GGML_UNUSED(offset);
+ GGML_UNUSED(size);
+}
+
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_UNUSED(backend);
+ GGML_UNUSED(tensor);
+ GGML_UNUSED(data);
+ GGML_UNUSED(offset);
+ GGML_UNUSED(size);
+}
+
+static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+ GGML_UNUSED(backend);
+ GGML_UNUSED(src);
+ GGML_UNUSED(dst);
+ return false;
+}
+
+static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
+ GGML_UNUSED(backend);
+}
+
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ continue;
+ }
+
+ bool ok = ggml_cl_compute_forward(backend, node);
+ if (!ok) {
+ GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
+ }
+
+ return GGML_STATUS_SUCCESS;
+}
+
+static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+ GGML_UNUSED(dev);
+
+ switch (op->op) {
+ case GGML_OP_NONE:
+ return true;
+ case GGML_OP_GET_ROWS:
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ return true;
+ case GGML_TYPE_Q4_0:
+#ifdef GGML_OPENCL_SOA_Q
+ // We do not support flattened Q4_0 (and possibly other Q's)
+ return false;
+#else // GGML_OPENCL_SOA_Q
+ return true;
+#endif // GGML_OPENCL_SOA_Q
+ default:
+ return false;
+ }
+ case GGML_OP_CPY:
+ case GGML_OP_DUP:
+ case GGML_OP_CONT:
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ switch (op->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ default:
+ return false;
+ }
+ case GGML_TYPE_F16:
+ switch (op->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ default:
+ return false;
+ }
+ default:
+ return false;
+ }
+ case GGML_OP_ADD:
+ case GGML_OP_SCALE:
+ case GGML_OP_MUL:
+ return true;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(op)) {
+ case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_SILU:
+ case GGML_UNARY_OP_RELU:
+ return ggml_is_contiguous(op->src[0]);
+ default:
+ return false;
+ }
+ case GGML_OP_CLAMP:
+ case GGML_OP_SOFT_MAX:
+ case GGML_OP_NORM:
+ case GGML_OP_RMS_NORM:
+ return true;
+ case GGML_OP_MUL_MAT:
+ if (op->src[0]->type == GGML_TYPE_F16) {
+ return true;
+ } else if (op->src[0]->type == GGML_TYPE_F32) {
+ return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+ } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+ op->src[0]->type == GGML_TYPE_Q6_K) {
+ return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+ }
+ return false;
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ return true;
+ case GGML_OP_DIAG_MASK_INF:
+ return op->ne[3] == 1;
+ case GGML_OP_ROPE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Forward declaration - implementation appears later in the file.
+static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type);
+
+static ggml_guid_t ggml_backend_opencl_guid() {
+ static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe };
+ return &guid;
+}
+
+static ggml_backend_i ggml_backend_opencl_i = {
+ /* .get_name = */ ggml_backend_opencl_name,
+ /* .free = */ ggml_backend_opencl_free,
+ /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
+ /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
+ /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
+ /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */
+ /* .graph_plan_create = */ NULL,
+ /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
+ /* .graph_plan_compute = */ NULL,
+ /* .graph_compute = */ ggml_backend_opencl_graph_compute,
+ /* .event_record = */ NULL,
+ /* .event_wait = */ NULL,
+};
+
+ggml_backend_t ggml_backend_opencl_init(void) {
+ ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
+
+ ggml_backend_t backend = new ggml_backend {
+ /* .guid = */ ggml_backend_opencl_guid(),
+ /* .interface = */ ggml_backend_opencl_i,
+ /* .device = */ dev,
+ /* .context = */ backend_ctx
+ };
+
+ return backend;
+}
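+
+// Typical usage (a sketch, assuming the standard ggml-backend API):
+//   ggml_backend_t backend = ggml_backend_opencl_init();
+//   // ... allocate tensors from ggml_backend_opencl_buffer_type(), build a graph ...
+//   ggml_backend_graph_compute(backend, graph);
+//   ggml_backend_free(backend);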
+
+bool ggml_backend_is_opencl(ggml_backend_t backend) {
+ return backend && backend->iface.get_name == ggml_backend_opencl_name;
+}
+
+//
+// buffer
+//
+struct ggml_backend_opencl_buffer_context {
+ // A buffer context can hold multiple cl_mem objects. This is for flattening
+ // quantized weights and should be used with GGML_OPENCL_SMALL_ALLOC where
+ // each tensor is allocated a separate buffer. When flattening is enabled
+ // with small allocation, each tensor is backed by two cl_mem objects (for
+ // quants and scales) packed into a backend_opencl_buffer.
+ ggml_backend_opencl_buffer_context(cl_mem buf)
+ : name("OpenCL") {
+ buffer.push_back(buf);
+ }
+
+ ~ggml_backend_opencl_buffer_context() {
+ for (cl_mem buf : buffer) {
+ CL_CHECK(clReleaseMemObject(buf));
+ }
+ for (cl_mem im : img) {
+ CL_CHECK(clReleaseMemObject(im));
+ }
+
+ // Delete all extras to trigger their destructors
+ for (ggml_tensor_extra_cl * e : temp_tensor_extras) {
+ delete e;
+ }
+ for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
+ delete e;
+ }
+ for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) {
+ delete e;
+ }
+ for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
+ delete e;
+ }
+ }
+
+ ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
+ ggml_tensor_extra_cl * extra;
+ if (temp_tensor_extras.empty()) {
+ extra = new ggml_tensor_extra_cl();
+ } else {
+ extra = temp_tensor_extras.back();
+ temp_tensor_extras.pop_back();
+ }
+
+ temp_tensor_extras_in_use.push_back(extra);
+
+ extra->reset();
+ return extra;
+ }
+
+ ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
+ ggml_tensor_extra_cl_q4_0 * extra;
+ if (temp_tensor_extras_q4_0.empty()) {
+ extra = new ggml_tensor_extra_cl_q4_0();
+ } else {
+ extra = temp_tensor_extras_q4_0.back();
+ temp_tensor_extras_q4_0.pop_back();
+ }
+
+ temp_tensor_extras_q4_0_in_use.push_back(extra);
+
+ extra->reset();
+ return extra;
+ }
+
+ void reset() {
+ for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
+ temp_tensor_extras.push_back(e);
+ }
+ temp_tensor_extras_in_use.clear();
+
+ for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
+ temp_tensor_extras_q4_0.push_back(e);
+ }
+ temp_tensor_extras_q4_0_in_use.clear();
+ }
+
+ // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
+ // being used are in `temp_tensor_extras_in_use`. At the first run, new
+ // extras get created and put in `in_use`. When the buffer is reset via
+ // the `reset` callback, all extras in `in_use` get moved to available extras
+ // for reuse.
+    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
+    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
+    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
+    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+
+ // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
+ // before any tensor is initialized (at the beginning of alloc_tensor_range).
+    // Hence, there is always a buffer object in this vector. When each tensor is
+ // being initialized, this original buffer object will be released if both
+ // flattening and small allocation are enabled, and additional buffer
+ // objects will be created in init_tensor to represent flattened quantized
+ // weights.
+    std::vector<cl_mem> buffer;
+ // These are image1d_buffer_t objects that wrap around the quants and scales.
+ // For Q4_0 quantization, there should be two of them - one for quants and
+ // one for scales. They should be populated only when flattening and small
+ // allocation are enabled.
+    std::vector<cl_mem> img;
+ std::string name;
+};
+
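+// OpenCL buffers are not host-addressable, so tensor->data cannot point into
+// device memory. Instead, tensor->data stores cl_ptr_base plus an offset into
+// the underlying cl_mem; init_tensor below recovers the offset by subtracting
+// this base.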
+static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+
+static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+ delete ctx;
+}
+
+static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
+ return cl_ptr_base;
+
+ GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+
+ ggml_cl2_init(buffer->buft->device);
+
+ if (tensor->view_src != nullptr) {
+ GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+
+ ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra;
+ GGML_ASSERT(view_extra && "view_extra is nullptr?");
+
+ // Reuse extra of the parent tensor. The offset of this view tensor
+ // becomes `extra->offset + view_offs` and needs to be calculated when
+        // it is used. This change is needed because of the change to
+ // ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
+ // `buffer` passed in here will always be `tensor->buffer`. It is OK
+ // to allocate extras from the same buffer context for ordinary
+ // intermediate tensors. But for views into kv cache tensors, doing so
+ // would mess up the extras used by kv cache.
+        // Before #7640, `buffer` was always for intermediate tensors and thus
+        // always different from that of kv cache tensors.
+ //
+ // NB: now extra->offset no longer accounts for view_offs.
+ // NB: this should not apply to weight tensors (for end-to-end runs, but
+ // may apply for test-backend-ops).
+ // FIXME: if any unexpected results are seen, double check the offset -
+ // there could be other places that need fix.
+ tensor->extra = view_extra;
+ } else {
+ {
+ size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
+
+ ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
+ extra->offset = offset;
+ extra->data_device = ctx->buffer[0];
+ extra->actual_size = ggml_nbytes(tensor);
+
+ tensor->extra = extra;
+ }
+ }
+}
+
+// The optimized gemm and gemv kernels are used for large matrices without batch.
+// tensor is the quantized weights matrix.
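+// For example, a 4096 x 4096 weight matrix qualifies, while a 4096 x 32 matrix
+// (or any batched tensor) does not.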
+inline bool use_adreno_kernels(const ggml_tensor *tensor) {
+ return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 &&
+ tensor->ne[2] == 1 && tensor->ne[3] == 1;
+}
+
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+
+ cl_context context = backend_ctx->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+#ifdef GGML_OPENCL_SOA_Q
+ // We separate the quantized bits and scale from block_q4_0 by using an
+ // additional kernel, where each thread handles a block. We first read the
+ // original weights into a temporary buffer, then create two separate
+ // buffers for quantized bits and scales, which are then populated by the
+ // conversion kernel.
+ if (tensor->type == GGML_TYPE_Q4_0) {
+ // Tensors should have been preallocated, therefore they should
+ // already have ggml_tensor_extra_cl as extra.
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
+
+ // Allocate the new extra and create aliases from the original.
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+ ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0();
+
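+        // For block_q4_0 (32 weights per block), each block stores one fp16 scale
+        // (2 bytes) and 16 bytes of packed 4-bit quants, so size_d + size_q must
+        // equal ggml_nbytes(tensor).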
+ size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+ size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+ GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+ CL_CHECK(clEnqueueWriteBuffer(
+ queue, data_device, CL_TRUE, 0,
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // We always honor the offset argument, although for weights it should
+        // be 0 (we do not assert this).
+ //GGML_ASSERT(offset == 0);
+
+ // We create subbuffers from the original tensor buffer for scales and
+        // quants - i.e., scales and quants are aliases into the buffer object
+ // that backs the original tensor. This is a cleaner way to adapt to the
+ // new memory management.
+ // In the old code, we allocate new buffers for scales and quants
+ // respectively, which could still be done but would result in double
+ // allocation; properly deallocating the preallocated buffer that backs
+ // the tensors is tricky and would leak the backend specific information
+ // into the general backend code.
+        // Does this create misaligned subbuffers (alignment is 1024) in certain
+        // cases?
+ cl_buffer_region region;
+
+ // The original tensor memory is divided into scales and quants, i.e.,
+ // we first store scales, then quants.
+ // Create subbuffer for scales.
+ region.origin = extra_orig->offset + tensor->view_offs + offset;
+ region.size = size_d;
+ extra->d = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+
+ // Create subbuffer for quants.
+ region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+ region.size = size_q;
+ extra->q = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+
+ //cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+
+ // The optimized kernels need weights in natural order, so unshuffle.
+ if (use_adreno_kernels(tensor)) {
+ kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
+ }
+ #else
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
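+        // One work-item per q4_0 block.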
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clReleaseMemObject(data_device));
+
+ tensor->extra = extra;
+
+ // transpose the weights and scales
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    // Only transpose large, non-batched matrices
+ // TODO: use preallocated images instead of sub-buffer then image
+ if (use_adreno_kernels(tensor)) {
+ // <----------------------------------------------------------------------------------> //
+ // start transpose
+ // <----------------------------------------------------------------------------------> //
+ int M = tensor->ne[1]; // ne01
+ int K = tensor->ne[0]; // ne00
+
+ // transpose is out of place, so we need to allocate transposed buffers
+ // <----------------------------------------------------------------------------------> //
+ // use sub_buffer of max buffer size instead
+
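+            // Quants take 4 bits per weight: K * M / 2 bytes, expressed here as
+            // K * M / 8 floats.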
+ size_t q_size_bytes = K * M / 8 * sizeof(float);
+ cl_buffer_region region;
+ region.origin = 0;
+ region.size = q_size_bytes;
+ cl_mem qT_d = clCreateSubBuffer(
+ backend_ctx->A_q_d_max,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+ &err);
+ // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
+ CL_CHECK(err);
+
+ // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
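+            // One fp16 scale (2 bytes) per 32-weight block: M * (K / 32) * 2 bytes.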
+ size_t d_size_bytes = M * (K / 32) * 2;
+ region.origin = 0;
+ region.size = d_size_bytes;
+ cl_mem dT_d = clCreateSubBuffer(
+ backend_ctx->A_s_d_max,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+ &err);
+ // cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
+ CL_CHECK(err);
+
+ // <----------------------------------------------------------------------------------> //
+
+
+ // create images from the buffers
+ // <----------------------------------------------------------------------------------> //
+ cl_mem q_d_image1D;
+ cl_mem d_d_image1D;
+ cl_mem qT_d_image1D;
+ cl_mem dT_d_image1D;
+
+ cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+ cl_image_desc img_desc_1d;
+
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
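+            // Quants: M * K / 2 bytes = M * K / 8 floats; each CL_RGBA/CL_FLOAT
+            // texel packs 4 floats.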
+ img_desc_1d.image_width = M * K / 8 / 4;
+ img_desc_1d.buffer = extra->q;
+ q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 8 / 4;
+ img_desc_1d.buffer = qT_d;
+ qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
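+            // Scales: M * K / 32 fp16 values = M * K / 64 floats, i.e.
+            // M * K / 256 RGBA texels.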
+ img_desc_1d.image_width = M * K / 32 / 4 / 2;
+ img_desc_1d.buffer = extra->d;
+ d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 32 / 4 / 2;
+ img_desc_1d.buffer = dT_d;
+ dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+ // <----------------------------------------------------------------------------------> //
+
+ // set up and call the transpose kernels
+ // <----------------------------------------------------------------------------------> //
+ // weights
+ int height_q = M / 8;
+ int width_q = K / 8 / 4;
+ kernel = backend_ctx->kernel_transpose_16;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
+
+ size_t local_size_q[3] = {4, 16, 1};
+            size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ // scales
+ int height_s = M / 8;
+ int width_s = K / 32 / 8;
+
+ kernel = backend_ctx->kernel_transpose_16;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
+
+ size_t local_size_s[3] = {4, 16, 1};
+            size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ // <----------------------------------------------------------------------------------> //
+
+ // copy transposed buffer contents to original buffers
+ // <----------------------------------------------------------------------------------> //
+ // weights
+ CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ // scales
+ CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ // <----------------------------------------------------------------------------------> //
+
+ // deallocate transpose buffers
+ // <----------------------------------------------------------------------------------> //
+ CL_CHECK(clReleaseMemObject(qT_d));
+ CL_CHECK(clReleaseMemObject(dT_d));
+
+ // deallocate temporary images
+ CL_CHECK(clReleaseMemObject(q_d_image1D));
+ CL_CHECK(clReleaseMemObject(d_d_image1D));
+ CL_CHECK(clReleaseMemObject(qT_d_image1D));
+ CL_CHECK(clReleaseMemObject(dT_d_image1D));
+ // <----------------------------------------------------------------------------------> //
+ // end transpose
+ // <----------------------------------------------------------------------------------> //
+ }
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+ return;
+ }
+#endif // GGML_OPENCL_SOA_Q
+
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+ GGML_ASSERT(extra);
+
+ CL_CHECK(clEnqueueWriteBuffer(
+ queue, extra->data_device, CL_TRUE, extra->offset + offset,
+ size, data, 0, NULL, NULL));
+
+ GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_ASSERT(tensor->extra);
+
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+
+ cl_context context = backend_ctx->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ // Make sure all previously submitted commands are finished.
+ CL_CHECK(clFinish(queue));
+
+#ifdef GGML_OPENCL_SOA_Q
+ // In end-to-end runs, get_tensor is usually used to get back the logits,
+ // where we can simply do clEnqueueReadBuffer since they are f32.
+ // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
+ // which requires reading back quantized weight tensors.
+ // To properly support this, we need to restore block_q4_0 struct arrays
+ // from the flattened buffers.
+ if (tensor->type == GGML_TYPE_Q4_0) {
+ ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
+
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {1, 1, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+ CL_CHECK(clReleaseMemObject(data_device));
+ return;
+ }
+#endif // GGML_OPENCL_SOA_Q
+
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
+ size, data, 0, NULL, NULL));
+
+ GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_dev_t dev = buffer->buft->device;
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+ for (cl_mem buf : ctx->buffer) {
+ CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
+ }
+ CL_CHECK(clFinish(queue));
+}
+
+static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+ ctx->reset();
+}
+
+static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
+ /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_opencl_buffer_get_base,
+ /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
+ /* .memset_tensor = */ NULL,
+ /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_opencl_buffer_clear,
+ /* .reset = */ ggml_backend_opencl_buffer_reset,
+};
+
+//
+// buffer type
+//
+
+static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) {
+ return "OpenCL";
+
+ GGML_UNUSED(buffer_type);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
+
+    // clCreateBuffer returns CL_INVALID_BUFFER_SIZE (-61) for size 0
+ size = std::max(size, (size_t)1);
+
+ cl_int err;
+ cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
+ if (err != CL_SUCCESS) {
+ GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
+ return nullptr;
+ }
+
+ ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem);
+
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+ // FIXME: not thread safe, device may not be initialized yet
+ static cl_uint alignment = -1;
+ if (alignment == (cl_uint)-1) {
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+ alignment = backend_ctx->alignment;
+ }
+ return alignment;
+}
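+
+// One possible fix for the FIXME above (a sketch, not applied here): rely on a
+// C++11 function-local static, which is initialized exactly once:
+//   static const cl_uint alignment = ggml_cl2_init(buffer_type->device)->alignment;
+// This still assumes the first caller's device is representative.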
+
+static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+ static size_t max_size = -1;
+ if (max_size == (size_t)-1) {
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+ max_size = backend_ctx->max_alloc_size;
+ }
+ return max_size;
+}
+
+static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+ return ggml_backend_is_opencl(backend);
+
+ UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
+ /* .get_name = */ ggml_backend_opencl_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
+ /* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
+ /* .get_alloc_size = */ NULL,
+ /* .is_host = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
+ static ggml_backend_buffer_type buffer_type = {
+ /* .iface = */ ggml_backend_opencl_buffer_type_interface,
+ /* .device = */ &g_ggml_backend_opencl_device,
+ /* .context = */ nullptr,
+ };
+
+ return &buffer_type;
+}
+
+//
+// backend device
+//
+
+static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) {
+ return "GPUOpenCL";
+
+ GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) {
+ ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+ return dev_ctx->device_name.c_str();
+}
+
+static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+ *free = 1;
+ *total = 1;
+
+ GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+ GGML_UNUSED(dev);
+}
+
+static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+ props->name = ggml_backend_opencl_device_get_name(dev);
+ props->description = ggml_backend_opencl_device_get_description(dev);
+ props->type = ggml_backend_opencl_device_get_type(dev);
+ ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ props->caps = ggml_backend_dev_caps {
+ /* .async = */ false,
+ /* .host_buffer = */ false,
+ /* .buffer_from_host_ptr = */ false,
+ /* .events = */ false,
+ };
+}
+
+static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
+
+ ggml_backend_t backend = new ggml_backend {
+ /* .guid = */ ggml_backend_opencl_guid(),
+ /* .interface = */ ggml_backend_opencl_i,
+ /* .device = */ dev,
+ /* .context = */ backend_ctx,
+ };
+
+ return backend;
+
+ GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
+ return ggml_backend_opencl_buffer_type();
+
+ GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+ GGML_UNUSED(dev);
+ GGML_UNUSED(ptr);
+ GGML_UNUSED(size);
+ GGML_UNUSED(max_tensor_size);
+ return nullptr;
+}
+
+static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+ return ggml_opencl_supports_op(dev, op);
+}
+
+static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
+
+ GGML_UNUSED(dev);
+}
+
+static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+ /* .get_name = */ ggml_backend_opencl_device_get_name,
+ /* .get_description = */ ggml_backend_opencl_device_get_description,
+ /* .get_memory = */ ggml_backend_opencl_device_get_memory,
+ /* .get_type = */ ggml_backend_opencl_device_get_type,
+ /* .get_props = */ ggml_backend_opencl_device_get_props,
+ /* .init_backend = */ ggml_backend_opencl_device_init,
+ /* .get_buffer_type = */ ggml_backend_opencl_device_get_buffer_type,
+ /* .get_host_buffer_type = */ NULL,
+ /* .buffer_from_host_ptr = */ ggml_backend_opencl_device_buffer_from_ptr,
+ /* .supports_op = */ ggml_backend_opencl_device_supports_op,
+ /* .supports_buft = */ ggml_backend_opencl_device_supports_buft,
+ /* .offload_op = */ NULL,
+ /* .event_new = */ NULL,
+ /* .event_free = */ NULL,
+ /* .event_synchronize = */ NULL,
+};
+
+// Backend registry
+
+static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
+ return "OpenCL";
+
+ GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
+ return ggml_backend_opencl_n_devices;
+
+ GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
+ GGML_ASSERT(index == 0);
+
+ return &g_ggml_backend_opencl_device;
+
+ GGML_UNUSED(reg);
+ GGML_UNUSED(index);
+}
+
+static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
+ /* .get_name = */ ggml_backend_opencl_reg_get_name,
+ /* .device_count = */ ggml_backend_opencl_reg_device_count,
+ /* .device_get = */ ggml_backend_opencl_reg_device_get,
+ /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_opencl_reg(void) {
+ // TODO: make this thread-safe somehow?
+ static ggml_backend_reg reg;
+ static bool initialized = false;
+
+ if (!initialized) {
+ reg = ggml_backend_reg {
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_opencl_reg_i,
+ /* .context = */ NULL,
+ };
+
+ g_ggml_backend_opencl_device = ggml_backend_device {
+ /* .iface = */ ggml_backend_opencl_device_i,
+            /* .reg     = */ &reg,
+ /* .context = */ &g_ggml_ctx_dev_main,
+ };
+
+ ggml_cl2_init(&g_ggml_backend_opencl_device);
+
+ initialized = true;
+ }
+
+    return &reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg)
+
+//------------------------------------------------------------------------------
+// Debugging utils
+//------------------------------------------------------------------------------
+#if 0
+#define QK4_0 32
+typedef struct {
+ ggml_fp16_t d; // delta
+ uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
+ "wrong q4_0 block size/padding");
+
+#include <math.h>
+#ifdef __cplusplus
+#include "half.hpp"
+#endif
+
+static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
+ void * buf = malloc(ggml_nbytes(tensor));
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+#ifdef GGML_OPENCL_SOA_Q
+ void * buf_q;
+ void * buf_d;
+#endif
+
+#ifdef GGML_USE_OPENCL
+ // Make sure everything is done.
+ CL_CHECK(clFinish(queue));
+
+#ifdef GGML_OPENCL_SOA_Q
+ if (tensor->type == GGML_TYPE_Q4_0) {
+ ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra;
+ GGML_ASSERT(extra);
+
+ size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2;
+ size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t);
+ GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor));
+ buf_q = malloc(size_q);
+ buf_d = malloc(size_d);
+
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
+ CL_CHECK(clFinish(queue));
+ } else {
+ // Read out the tensor from GPU memory.
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+ GGML_ASSERT(extra);
+
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
+ extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
+ CL_CHECK(clFinish(queue));
+ }
+#else
+ // Read out the tensor from GPU memory.
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+ GGML_ASSERT(extra);
+
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
+ extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
+ CL_CHECK(clFinish(queue));
+#endif // GGML_OPENCL_SOA_Q
+#endif // GGML_USE_OPENCL
+
+ // Open file and dump.
+ char fname[512];
+ sprintf(fname, "./tensor-dumps/%s.txt", tensor->name);
+ FILE * f = fopen(fname, "w");
+ if (!f) {
+ printf("Failed to open %s\n", fname);
+ return;
+ }
+
+ if (tensor->type == GGML_TYPE_F32) {
+ float * data = (float *) buf;
+ for (int i = 0; i < ggml_nelements(tensor); ++i) {
+ if (isnan(data[i])) {
+ printf("NaN found: %s\n", tensor->name);
+ break;
+ }
+ fprintf(f, "%f\n", data[i]);
+ }
+ } else if (tensor->type == GGML_TYPE_I32) {
+ int * data = (int *) buf;
+ for (int i = 0; i < ggml_nelements(tensor); ++i) {
+ if (isnan(data[i])) {
+ printf("NaN found: %s\n", tensor->name);
+ break;
+ }
+ fprintf(f, "%d\n", data[i]);
+ }
+ } else if (tensor->type == GGML_TYPE_F16) {
+#ifdef __cplusplus
+ half_float::half * data = (half_float::half *) buf;
+ for (int i = 0; i < ggml_nelements(tensor); ++i) {
+ if (std::isnan(data[i])) {
+ printf("NaN found: %s\n", tensor->name);
+ break;
+ }
+ fprintf(f, "%f\n", float(data[i]));
+ }
+#endif
+ } else if (tensor->type == GGML_TYPE_Q4_0) {
+#ifdef GGML_OPENCL_SOA_Q
+ ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d;
+ unsigned char * data_q = (unsigned char *)buf_q;
+
+ for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
+ fprintf(f, "%04x, ", data_d[i]);
+ for (int k = 0; k < QK4_0/2; ++k) {
+ fprintf(f, "%02x, ", data_q[k]);
+ }
+ fprintf(f, "\n");
+ data_q += QK4_0/2;
+ }
+ free(buf_d);
+ free(buf_q);
+#else
+ block_q4_0 * data = (block_q4_0 *) buf;
+ for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
+ fprintf(f, "%04x, ", data[i].d);
+ for (int k = 0; k < QK4_0/2; ++k) {
+ fprintf(f, "%02x, ", data[i].qs[k]);
+ }
+ fprintf(f, "\n");
+ }
+#endif // GGML_OPENCL_SOA_Q
+ }
+ free(buf);
+ fflush(f);
+ fclose(f);
+}
+#else
+#define dump_tensor(tensor)
+#endif
+
+//------------------------------------------------------------------------------
+// Profiling utility
+//------------------------------------------------------------------------------
+#ifdef GGML_OPENCL_PROFILING
+void populateProfilingInfo(
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel,
+ size_t global_size[3], size_t local_size[3],
+ const ggml_tensor * tensor) {
+ cl_ulong start;
+ cl_ulong end;
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clGetEventProfilingInfo(
+ evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
+ CL_CHECK(clGetEventProfilingInfo(
+ evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
+
+ char kernel_name[512];
+ CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
+ sizeof(kernel_name), kernel_name, NULL));
+
+ info.duration_ns = end - start;
+ info.op_name = tensor->name;
+ info.kernel_name = kernel_name;
+ info.local_size[0] = local_size[0];
+ info.local_size[1] = local_size[1];
+ info.local_size[2] = local_size[2];
+ info.global_size[0] = global_size[0];
+ info.global_size[1] = global_size[1];
+ info.global_size[2] = global_size[2];
+ info.output_size[0] = tensor->ne[0];
+ info.output_size[1] = tensor->ne[1];
+ info.output_size[2] = tensor->ne[2];
+ info.output_size[3] = tensor->ne[3];
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Ops
+//------------------------------------------------------------------------------
+
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ const int64_t ne10 = src1->ne[0];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+
+ // TODO: find the optimal values for these
+ return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+ src1->type == GGML_TYPE_F32 &&
+ dst->type == GGML_TYPE_F32 &&
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
+}
+
+static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ UNUSED(backend);
+ UNUSED(src0);
+ UNUSED(src1);
+ UNUSED(dst);
+}
+
+static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0;
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+ const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+ const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel;
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_get_rows_f32;
+ break;
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_get_rows_f16;
+ break;
+ case GGML_TYPE_Q4_0:
+ kernel = backend_ctx->kernel_get_rows_q4_0;
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
+
+ size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
+ size_t local_work_size[] = {1, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0;
+ const int ne12 = src1 ? src1->ne[2] : 0;
+ const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+ const int ne0 = dst ? dst->ne[0] : 0;
+ const int ne1 = dst ? dst->ne[1] : 0;
+ const int ne2 = dst ? dst->ne[2] : 0;
+ const int ne3 = dst ? dst->ne[3] : 0;
+
+ const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+ const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+ const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+ const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ bool bcast_row = false;
+ cl_kernel kernel;
+
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
+ // src1 is a row
+ GGML_ASSERT(ne11 == 1);
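+        // The row-broadcast kernel works on float4 chunks, hence the ne00 % 4 and
+        // ne10 % 4 requirements in the condition above.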
+
+ bcast_row = true;
+ int ne = ne00 / 4;
+ kernel = backend_ctx->kernel_add_row;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+ } else {
+ kernel = backend_ctx->kernel_add;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
+ }
+
+ if (bcast_row) {
+ int n = ggml_nelements(dst)/4;
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ } else {
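+        // One work-group of nth threads per row of src0; ne02 and ne03 cover the
+        // remaining dimensions.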
+ unsigned int nth = MIN(64, ne0);
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ }
+}
+
+static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0;
+ const int ne12 = src1 ? src1->ne[2] : 0;
+ const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+ const int ne0 = dst ? dst->ne[0] : 0;
+ const int ne1 = dst ? dst->ne[1] : 0;
+ const int ne2 = dst ? dst->ne[2] : 0;
+ const int ne3 = dst ? dst->ne[3] : 0;
+
+ const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+ const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+ const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+ const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ bool bcast_row = false;
+ cl_kernel kernel;
+
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
+ // src1 is a row
+ GGML_ASSERT(ne11 == 1);
+
+ bcast_row = true;
+ int ne = ne00 / 4;
+ kernel = backend_ctx->kernel_mul_row;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+ } else {
+ kernel = backend_ctx->kernel_mul;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
+ }
+
+ if (bcast_row) {
+ int n = ggml_nelements(dst)/4;
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ } else {
+ unsigned int nth = MIN(64, ne0);
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ }
+}
+
+static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel;
+
+ int n = ggml_nelements(dst);
+
+ if (n % 4 == 0) {
+ kernel = backend_ctx->kernel_gelu_4;
+ n /= 4;
+ } else {
+ kernel = backend_ctx->kernel_gelu;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel;
+
+ int n = ggml_nelements(dst);
+
+ if (n % 4 == 0) {
+ kernel = backend_ctx->kernel_silu_4;
+ n /= 4;
+ } else {
+ kernel = backend_ctx->kernel_silu;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel = backend_ctx->kernel_relu;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+ const int64_t n = ggml_nelements(dst);
+
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ float min;
+ float max;
+ memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
+ memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
+
+ cl_kernel kernel = backend_ctx->kernel_clamp;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max));
+
+ const int64_t n = ggml_nelements(dst);
+
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+ const int nth = MIN(64, ne00);
+
+ cl_kernel kernel = backend_ctx->kernel_norm;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
+
+ const int64_t nrows = ggml_nrows(src0);
+
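+    // One workgroup of nth work items handles one row, so the global size is nrows*nth.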
+ size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_backend_opencl_device_context * dev_ctx =
+ (ggml_backend_opencl_device_context *)backend->device->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+
+ GGML_ASSERT(ne00 % 4 == 0);
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+ const int nth = MIN(64, ne00);
+
+ const int64_t nrows = ggml_nrows(src0);
+
+ size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ cl_kernel kernel = backend_ctx->kernel_rms_norm;
+
+    // Note: this kernel declares local memory in its kernel args, and the size
+    // depends on the subgroup size, so the subgroup size must be queried first.
+    // Note: this requires OpenCL 2.1 and above.
+ size_t sgs;
+ CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
+ CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+ sizeof(local_work_size), local_work_size,
+ sizeof(size_t), &sgs, NULL));
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+ // This is local memory - the size depends on subgroup size.
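+    // Each subgroup contributes one partial sum, so nth/sgs floats of scratch are needed.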
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+ const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+#ifdef GGML_OPENCL_SOA_Q
+ ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+#endif
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0;
+ const int ne12 = src1 ? src1->ne[2] : 0;
+ const int ne13 = src1 ? src1->ne[3] : 0;
+
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+
+ const int ne0 = dst ? dst->ne[0] : 0;
+ const int ne1 = dst ? dst->ne[1] : 0;
+
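+    // Broadcast ratios along dims 2 and 3: how many src1 batches map onto each src0 batch.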
+ int r2 = ne12/ne02;
+ int r3 = ne13/ne03;
+
+ GGML_ASSERT(ne00 == ne10);
+
+ int nth0 = 32;
+ int nth1 = 1;
+ int nrows = 1;
+ // The number of values produced by each subgroup
+ int ndst = 4;
+
+ cl_kernel kernel;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ cl_context context = backend_ctx->context;
+
+ if (ne01 && ne1 && use_adreno_kernels(src0)) {
+
+ // init CL objects
+ // <--------------------------------------------> //
+ cl_int status;
+ cl_image_format img_fmt_1d;
+ cl_image_desc img_desc_1d;
+ cl_buffer_region region;
+ cl_mem A_image1d;
+ cl_mem B_image1d;
+ cl_mem B_sub_buffer;
+ cl_mem C_d;
+ // for B transpose
+ cl_mem B_d;
+ cl_mem B_d_input_image;
+ // <--------------------------------------------> //
+
+ // define matrix dimensions
+ // <--------------------------------------------> //
+ int M = ne01;
+ int N = ne1;
+ int K = ne00;
+ int padding;
+ // <--------------------------------------------> //
+
+ // q4_0 x fp32
+ if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
+ // TODO: remove duplicate definitions of image description + format -- move to top
+
+ // create an image for A
+ // <--------------------------------------------> //
+ if (N == 1) {
+ img_fmt_1d = { CL_R, CL_UNSIGNED_INT32};
+ } else {
+ img_fmt_1d = { CL_R, CL_FLOAT};
+ }
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 2 / 4; // Divide by 4 for char -> float
+ img_desc_1d.buffer = extra0_q4_0->q;
+ A_image1d = clCreateImage(
+ context,
+ CL_MEM_READ_ONLY,
+ &img_fmt_1d,
+ &img_desc_1d,
+ NULL,
+ &status);
+ CL_CHECK(status);
+ // <--------------------------------------------> //
+
+
+ // create a sub_buffer for B
+ // <--------------------------------------------> //
+ region.origin = (extra1->offset);
+ region.size = K * N * sizeof(float);
+ B_sub_buffer = clCreateSubBuffer(
+ extra1->data_device,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+ &status);
+ CL_CHECK(status);
+ // <--------------------------------------------> //
+
+ // transpose activation for Skyler's gemm
+ if (N != 1) {
+ //how many extra elements beyond multiple of 8
+ int extra_elements = N % 8;
+
+ //how much padding to add
+ padding = 0;
+ if (extra_elements > 0){
+ padding = 8 - extra_elements;
+ }
+
+ // Specify the starting offset (in bytes)
+ region.origin = 0;
+ // Specify the size of the sub-buffer (divide by 2 for FP16)
+ region.size = K * (N + padding) * sizeof(float)/2;
+ B_d = clCreateSubBuffer(
+ backend_ctx->B_d_max,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+                    &region,
+ &status);
+ CL_CHECK(status);
+
+ cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT };
+ cl_image_desc image_desc_B_d_input = {
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                    static_cast<size_t>(K * N / 4),
+ 0, 0, 0, 0, 0, 0, 0, { B_sub_buffer }
+ };
+ B_d_input_image = clCreateImage(
+ context,
+ 0,
+ &image_format_B_d_input,
+ &image_desc_B_d_input,
+ NULL,
+ &status);
+ CL_CHECK(status);
+
+ cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
+ cl_image_desc image_desc_B_d_output = {
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                    static_cast<size_t>(K * (N + padding)/4),
+ 0, 0, 0, 0, 0, 0, 0, { B_d }
+ };
+ B_image1d = clCreateImage(
+ context,
+ 0,
+ &image_format_B_d_output,
+ &image_desc_B_d_output,
+ NULL,
+ &status);
+ CL_CHECK(status);
+
+ int height_B = N/4;
+ int width_B = K/4;
+ int padded_height_B = (N + padding)/4;
+
+ kernel = backend_ctx->kernel_transpose_32_16;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
+
+ size_t local_size_t[2] = { 1, 16 };
+ //WGS tuning
+ if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
+ local_size_t[0]=4;
+ local_size_t[1]=8;
+ } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
+ local_size_t[0]=2;
+ local_size_t[1]=8;
+ } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
+ local_size_t[0]=1;
+ local_size_t[1]=8;
+ } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
+ local_size_t[0]=2;
+ local_size_t[1]=8;
+ }
+
+ size_t global_size_t[2] = {
+                    static_cast<size_t>(width_B),
+                    static_cast<size_t>(padded_height_B)
+ };
+
+ #ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
+ #else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
+ #endif
+ } else {
+ // no need to transpose B in other cases
+ // create an image for B from sub_buffer
+ // <--------------------------------------------> //
+ img_fmt_1d = {CL_RGBA, CL_FLOAT};
+
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_width = K * N / 4;
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.buffer = B_sub_buffer;
+ B_image1d = clCreateImage(
+ context,
+ CL_MEM_READ_ONLY,
+ &img_fmt_1d,
+ &img_desc_1d,
+ NULL,
+ &status);
+ CL_CHECK(status);
+ // <--------------------------------------------> //
+ }
+
+ // choose gemm or gemv kernel
+ // <--------------------------------------------> //
+ if (N == 1) {
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
+ if (M == 4096 && K == 4096) {
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
+ } else if (M == 4096 && K == 11008) {
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
+ } else if (M == 11008 && K == 4096) {
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
+ } else if (M == 32000 && K == 4096) {
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+ }
+ } else {
+ kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4;
+ }
+ // <--------------------------------------------> //
+
+ // set kernel args
+ // <--------------------------------------------> //
+ cl_uint k_arg = 0;
+
+ if (N == 1) {
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
+ } else {
+ region.origin = extrad->offset; // Specify the starting offset (in bytes)
+ region.size = M * N * sizeof(float); // Specify the size of the sub-buffer
+                C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ CL_CHECK(status);
+
+ int padded_N = ne1 + padding;
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); //A_q_d
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); //A_s_d
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); //B_d
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); //C_d
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); //M
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &padded_N)); //N with padding
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); //K
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne1)); //N without padding
+ }
+ // <--------------------------------------------> //
+
+ // choose workgroup size
+ // <--------------------------------------------> //
+ size_t global_work_size[3] = {
+            64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)};
+ size_t local_work_size[3] = {64, 2, 4};
+
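+        // The Ab_Bi_8x4 kernel produces an 8x4 output tile per work item:
+        // 8 columns of ne1 and 4 rows of ne01, hence the /8 and /4 below.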
+ global_work_size[0] = (size_t)(ceil((float)ne1/8));
+ global_work_size[1] = (size_t)(ne01/4);
+ global_work_size[2] = (size_t)(1);
+
+ local_work_size[0] = (size_t)(1); //4x32 for FP32
+ local_work_size[1] = (size_t)(128);
+ local_work_size[2] = (size_t)(1);
+
+ //WGS tuning
+ if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
+ local_work_size[0] = 1;
+ local_work_size[1] = 128;
+ } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
+ local_work_size[0] = 2;
+ local_work_size[1] = 64;
+ } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
+ local_work_size[0] = 2;
+ local_work_size[1] = 64;
+ } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
+ local_work_size[0] = 2;
+ local_work_size[1] = 64;
+ }
+
+ if (N == 1) {
+ local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+ local_work_size[1] = 4; // reduce factor
+ local_work_size[2] = 1;
+
+ global_work_size[0] = M / 2;
+ global_work_size[1] = 4; // reduce factor
+ global_work_size[2] = 1;
+ }
+ // <--------------------------------------------> //
+
+ // enqueue kernel with profiling
+ // <--------------------------------------------> //
+        #ifdef GGML_OPENCL_PROFILING
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ // enqueue kernel without profiling
+ #else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ #endif
+ // <--------------------------------------------> //
+
+ // deallocate sub buffers and images
+ // <--------------------------------------------> //
+ CL_CHECK(clReleaseMemObject(A_image1d));
+ CL_CHECK(clReleaseMemObject(B_sub_buffer));
+ CL_CHECK(clReleaseMemObject(B_image1d));
+
+ if (N != 1) {
+ CL_CHECK(clReleaseMemObject(B_d));
+ CL_CHECK(clReleaseMemObject(B_d_input_image));
+ CL_CHECK(clReleaseMemObject(C_d));
+ }
+ // <--------------------------------------------> //
+
+ return;
+ }
+ } // if (ne01 && ne1)
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+ if (!ggml_is_transposed(src0) &&
+ !ggml_is_transposed(src1) &&
+ src1t == GGML_TYPE_F32 &&
+ ne00%32 == 0 &&
+ ne11 > 2) {
+#ifdef GGML_OPENCL_SOA_Q
+ // Set up kernel.
+ switch(src0t) {
+ case GGML_TYPE_Q4_0:
+ // This should have been satisfied.
+ GGML_ASSERT(ne11 == ne1);
+ GGML_ASSERT(ne01 == ne0);
+
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 1;
+
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+ break;
+ default:
+ break;
+ }
+
+ // Launch kernel.
+ if (src0t == GGML_TYPE_Q4_0) {
+ size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+ if (backend_ctx->gpu_family == INTEL) {
+ // Set global size for Intel. It uses 16x output values.
+ global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
+ global_work_size[1] = (size_t)ne11*nth1;
+ global_work_size[2] = (size_t)ne12*ne13;
+ }
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ return;
+ }
+#else // GGML_OPENCL_SOA_Q
+ // TODO: add block_q4_0 variant.
+#endif // GGML_OPENCL_SOA_Q
+ }
+
+ // use custom matrix x vector kernel
+ switch (src0t) {
+ case GGML_TYPE_F32:
+ //GGML_ASSERT(ne02 == ne12);
+ GGML_ASSERT(src1t == GGML_TYPE_F32);
+ kernel = backend_ctx->kernel_mul_mat_f32_f32;
+ nrows = 4;
+
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 32;
+ nth1 = 1;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
+ break;
+ case GGML_TYPE_F16:
+ //GGML_ASSERT(ne02 == ne12);
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 32;
+ nth1 = 1;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ if (src1t == GGML_TYPE_F32) {
+ if (ne11 * ne12 < 4) {
+ kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
+ } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+ kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
+ nrows = ne11;
+ } else {
+ kernel = backend_ctx->kernel_mul_mat_f16_f32;
+ nrows = 4;
+ }
+ } else {
+ kernel = backend_ctx->kernel_mul_mat_f16_f16;
+ nrows = 4;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
+ break;
+ case GGML_TYPE_Q4_0:
+ // This should have been satisfied.
+ GGML_ASSERT(ne11 == ne1);
+ GGML_ASSERT(ne01 == ne0);
+
+#ifdef GGML_OPENCL_SOA_Q
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 1;
+
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+ ndst = 8;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+                ndst = 8;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+#else // GGML_OPENCL_SOA_Q
+ if (backend_ctx->gpu_family == INTEL) {
+ // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
+ // group produces N_DST (4 for Q4_0 kernel) values in the result.
+ // The number of workgroups on dim 0 (the leading dimension) is
+ // the nearest multiple of 4 that covers ne0 (equals ne01).
+ nth0 = 16;
+ nth1 = 1;
+
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
+ ndst = 4;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v;
+ ndst = 4;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+#endif // GGML_OPENCL_SOA_Q
+ break;
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
+ kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
+
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 2;
+ nth1 = 16;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 2;
+ nth1 = 64;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+
+ if (src0t == GGML_TYPE_Q4_0 ||
+ src0t == GGML_TYPE_Q4_1 ||
+ src0t == GGML_TYPE_Q8_0 ||
+ src0t == GGML_TYPE_Q2_K) {
+ // Each SIMD group produces N_DST values in the result. Assuming each
+ // workgroup has N_SIMDGROUP SIMD groups, then each workgroup will
+ // produce N_DST*N_SIMDGROUP values in the result. Hence, the grid size
+ // (number of workgroups) will be a nearest multiple of
+        // N_DST*N_SIMDGROUP to cover the size of the dimension. Below, ndst is
+        // N_DST*N_SIMDGROUP (see the Q4_0 matmul kernel).
+ size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ } else if (src0t == GGML_TYPE_Q4_K) {
+ GGML_ASSERT(false && "not implemented");
+ } else if (src0t == GGML_TYPE_Q3_K) {
+ GGML_ASSERT(false && "not implemented");
+ } else if (src0t == GGML_TYPE_Q5_K) {
+ GGML_ASSERT(false && "not implemented");
+ } else if (src0t == GGML_TYPE_Q6_K) {
+ size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ } else {
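+        // Each workgroup handles nrows rows of src1, so round ne11 up to a multiple of nrows.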
+ int64_t ny = (ne11 + nrows - 1)/nrows;
+
+ size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ }
+}
+
+static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+ GGML_UNUSED(src1);
+
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ float scale;
+ memcpy(&scale, dst->op_params, sizeof(scale));
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel = backend_ctx->kernel_scale;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
+
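+    // The scale kernel operates on 4 values per work item, hence nelements/4.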
+ int n = ggml_nelements(dst)/4;
+
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+
+ // GGML_OP_CPY happens between src0 and src1.
+ // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
+ UNUSED(dst);
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0;
+ const int ne12 = src1 ? src1->ne[2] : 0;
+ const int ne13 = src1 ? src1->ne[3] : 0;
+
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+
+ const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+ const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+
+ cl_kernel kernel;
+
+ switch (src0t) {
+ case GGML_TYPE_F32:
+ switch (src1t) {
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_cpy_f32_f16;
+ break;
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_cpy_f32_f32;
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+ break;
+ case GGML_TYPE_F16:
+ switch (src1t) {
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_cpy_f16_f16;
+ break;
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_cpy_f16_f32;
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+
+ const int nth = MIN(64, ne00);
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cl_cpy(backend, src0, dst, nullptr);
+ UNUSED(src1);
+}
+
+static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ int n_past = ((int32_t *)(dst->op_params))[0];
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel;
+
+ if (ne00%8 == 0) {
+ kernel = backend_ctx->kernel_diag_mask_inf_8;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
+
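+        // The _8 variant masks 8 elements per work item, so the grid covers ne00*ne01*ne02/8 items.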
+ size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ } else {
+ kernel = backend_ctx->kernel_diag_mask_inf;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
+
+ size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
+ size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+ }
+}
+
+static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ // Softmax can now fuse KQ mask and KQ scale, which used to be two additional
+ // ops before softmax. It now also fuses alibi if `max_bias > 0`. For llama,
+ // alibi is not used; however, for some other models, it is used.
+ // KQ_mask
+ if (src1) {
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ }
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ float scale, max_bias;
+ memcpy(&scale, dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, dst->op_params + 1, sizeof(float));
+
+ const int nrows_x = ggml_nrows(src0);
+ const int nrows_y = src0->ne[1];
+
+ const int n_head = nrows_x/nrows_y;
+ const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
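+    // ALiBi slope bases: heads below n_head_log2 use powers of m0, the remaining heads use powers of m1.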
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    // Local size must be the wave size. Each workgroup is a wave, working on a row,
+    // where a row corresponds to the leading dimension.
+ int nth = MIN(32, ne00);
+
+ if (backend_ctx->gpu_family == INTEL) {
+ // This is the same as the initial value.
+ nth = MIN(32, ne00);
+ }
+ else if (backend_ctx->gpu_family == ADRENO) {
+ nth = 64;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ cl_kernel kernel;
+
+ if (ne00%4 == 0) {
+ kernel = backend_ctx->kernel_soft_max_4;
+ } else {
+ kernel = backend_ctx->kernel_soft_max;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2));
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ ggml_tensor * src2 = dst->src[2];
+ ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
+
+ cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
+ const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12);
+ const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+ const int ne0 = dst ? dst->ne[0] : 0;
+ const int ne1 = dst ? dst->ne[1] : 0;
+ const int ne2 = dst ? dst->ne[2] : 0;
+ const int ne3 = dst ? dst->ne[3] : 0;
+
+    const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+    const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+    const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+    const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+
+ GGML_ASSERT(ne10 == ne02);
+
+ int nth = MIN(64, ne00);
+
+ const int n_past = ((int *) dst->op_params)[0];
+ const int n_dims = ((int *) dst->op_params)[1];
+ const int mode = ((int *) dst->op_params)[2];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+ float freq_base;
+ float freq_scale;
+ float ext_factor;
+ float attn_factor;
+ float beta_fast;
+ float beta_slow;
+
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+
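+    // Bit 1 of mode selects the NeoX-style RoPE variant.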
+ const bool is_neox = mode & 2;
+
+ cl_kernel kernel;
+
+ if (!is_neox) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_rope_norm_f32;
+ break;
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_rope_norm_f16;
+ break;
+ default:
+ GGML_ASSERT(false);
+ };
+ } else {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_rope_neox_f32;
+ break;
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_rope_neox_f16;
+ break;
+ default:
+ GGML_ASSERT(false);
+ };
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past));
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims));
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig));
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &freq_base));
+ CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float), &freq_scale));
+ CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &ext_factor));
+ CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
+ CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
+ CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Op offloading
+//------------------------------------------------------------------------------
+
+typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
+ ggml_cl_func_t func = nullptr;
+
+ ggml_tensor * src0 = tensor->src[0];
+ ggml_tensor * src1 = tensor->src[1];
+
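+    // Offload only if at least one of the involved tensors already has an OpenCL extra,
+    // i.e. its data lives in an OpenCL buffer.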
+ const bool any_on_device = tensor->extra
+ || (src0 != nullptr && src0->extra)
+ || (src1 != nullptr && src1->extra);
+
+ switch (tensor->op) {
+ case GGML_OP_GET_ROWS:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_get_rows;
+ break;
+ case GGML_OP_CPY:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_cpy;
+ break;
+ case GGML_OP_DUP:
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_dup;
+ break;
+ case GGML_OP_ADD:
+ if (!any_on_device) {
+ return false;
+ }
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));
+ func = ggml_cl_add;
+ break;
+ case GGML_OP_MUL:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_mul;
+ break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_silu;
+ break;
+ case GGML_UNARY_OP_RELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_relu;
+ break;
+ default:
+ return false;
+ } break;
+ case GGML_OP_CLAMP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_clamp;
+ break;
+ case GGML_OP_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_norm;
+ break;
+ case GGML_OP_RMS_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_rms_norm;
+ break;
+ case GGML_OP_MUL_MAT:
+ if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+ return false;
+ }
+ func = ggml_cl_mul_mat;
+ break;
+ case GGML_OP_SCALE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_scale;
+ break;
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_nop;
+ break;
+ case GGML_OP_DIAG_MASK_INF:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_diag_mask_inf;
+ break;
+ case GGML_OP_SOFT_MAX:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_soft_max;
+ break;
+ case GGML_OP_ROPE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_rope;
+ break;
+ default:
+ return false;
+ }
+
+ func(backend, tensor->src[0], tensor->src[1], tensor);
+ return true;
+}
diff --git a/ggml/src/ggml-opencl/kernels/embed_kernel.py b/ggml/src/ggml-opencl/kernels/embed_kernel.py
new file mode 100644
index 000000000..b5d1d7242
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/embed_kernel.py
@@ -0,0 +1,26 @@
+#
+
+import sys
+import logging
+logger = logging.getLogger("opencl-embed-kernel")
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ if len(sys.argv) != 3:
+        logger.info("Usage: python embed_kernel.py <input> <output>")
+ sys.exit(1)
+
+ ifile = open(sys.argv[1], "r")
+ ofile = open(sys.argv[2], "w")
+
+ for i in ifile:
+ ofile.write('R"({})"\n'.format(i))
+
+ ifile.close()
+ ofile.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
new file mode 100644
index 000000000..d1cdf709b
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
@@ -0,0 +1,2683 @@
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use subgroup size of 32 on Intel.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use subgroups size of 64 on Adreno.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+#define QR4_1 2
+#define QK5_0 32
+#define QR5_0 2
+#define QK5_1 32
+#define QR5_1 2
+#define QK8_0 32
+#define QR8_0 1
+#define QK_K 256
+#define K_QUANTS_PER_ITERATION 2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+ half d;
+ uint8_t qs[QK4_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q4_1
+//------------------------------------------------------------------------------
+struct block_q4_1
+{
+ half d;
+ half m;
+ uint8_t qs[QK4_1 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_0
+//------------------------------------------------------------------------------
+struct block_q5_0
+{
+ half d;
+ uint32_t qh;
+ uint8_t qs[QK5_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_1
+//------------------------------------------------------------------------------
+struct block_q5_1
+{
+ half d;
+ half m;
+ uint32_t qh;
+ uint8_t qs[QK5_1 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q8_0
+//------------------------------------------------------------------------------
+struct block_q8_0
+{
+ half d;
+ int8_t qs[QK8_0];
+};
+
+//------------------------------------------------------------------------------
+// block_q2_K
+//------------------------------------------------------------------------------
+struct block_q2_K
+{
+ uint8_t scales[16];
+ uint8_t qs[64];
+ half d;
+ half dmin;
+};
+
+//------------------------------------------------------------------------------
+// block_q3_K
+//------------------------------------------------------------------------------
+struct block_q3_K
+{
+ uint8_t hmask[32];
+ uint8_t qs[64];
+ uint8_t scales[12];
+ half d;
+};
+
+//------------------------------------------------------------------------------
+// block_q4_K
+//------------------------------------------------------------------------------
+struct block_q4_K
+{
+ half d;
+ half dmin;
+ uint8_t scales[12];
+ uint8_t qs[128];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_K
+//------------------------------------------------------------------------------
+struct block_q5_K
+{
+ half d;
+ half dmin;
+ uint8_t scales[12];
+ uint8_t qh[32];
+ uint8_t qs[128];
+};
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+struct block_q6_K
+{
+ uint8_t ql[128];
+ uint8_t qh[64];
+ int8_t scales[16];
+ half d;
+};
+
+//------------------------------------------------------------------------------
+// dequantize_q4_0_f32, dequantize_q4_0_f16
+//------------------------------------------------------------------------------
+void dequantize_q4_0_f32(global struct block_q4_0 * xb, short il, float16 * reg) {
+ global ushort * qs = ((global ushort *)xb + 1);
+ float d1 = il ? (xb->d / 16.h) : xb->d;
+ float d2 = d1 / 256.f;
+ float md = -8.h * xb->d;
+ ushort mask0 = il ? 0x00F0 : 0x000F;
+ ushort mask1 = mask0 << 8;
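+    // Each ushort holds two quant bytes (4 nibbles). mask0 selects a nibble of
+    // the low byte and mask1 the same nibble of the high byte; d1 and d2 fold
+    // the nibble/byte bit shifts into the scale, and md applies the -8 offset.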
+
+ reg->s0 = d1 * (qs[0] & mask0) + md;
+ reg->s1 = d2 * (qs[0] & mask1) + md;
+
+ reg->s2 = d1 * (qs[1] & mask0) + md;
+ reg->s3 = d2 * (qs[1] & mask1) + md;
+
+ reg->s4 = d1 * (qs[2] & mask0) + md;
+ reg->s5 = d2 * (qs[2] & mask1) + md;
+
+ reg->s6 = d1 * (qs[3] & mask0) + md;
+ reg->s7 = d2 * (qs[3] & mask1) + md;
+
+ reg->s8 = d1 * (qs[4] & mask0) + md;
+ reg->s9 = d2 * (qs[4] & mask1) + md;
+
+ reg->sa = d1 * (qs[5] & mask0) + md;
+ reg->sb = d2 * (qs[5] & mask1) + md;
+
+ reg->sc = d1 * (qs[6] & mask0) + md;
+ reg->sd = d2 * (qs[6] & mask1) + md;
+
+ reg->se = d1 * (qs[7] & mask0) + md;
+ reg->sf = d2 * (qs[7] & mask1) + md;
+}
+
+void dequantize_q4_0_f16(global struct block_q4_0 * xb, short il, half16 * reg) {
+ global ushort * qs = ((global ushort *)xb + 1);
+ half d1 = il ? (xb->d / 16.h) : xb->d;
+ half d2 = d1 / 256.h;
+ half md = -8.h * xb->d;
+ ushort mask0 = il ? 0x00F0 : 0x000F;
+ ushort mask1 = mask0 << 8;
+
+ reg->s0 = d1 * (qs[0] & mask0) + md;
+ reg->s1 = d2 * (qs[0] & mask1) + md;
+
+ reg->s2 = d1 * (qs[1] & mask0) + md;
+ reg->s3 = d2 * (qs[1] & mask1) + md;
+
+ reg->s4 = d1 * (qs[2] & mask0) + md;
+ reg->s5 = d2 * (qs[2] & mask1) + md;
+
+ reg->s6 = d1 * (qs[3] & mask0) + md;
+ reg->s7 = d2 * (qs[3] & mask1) + md;
+
+ reg->s8 = d1 * (qs[4] & mask0) + md;
+ reg->s9 = d2 * (qs[4] & mask1) + md;
+
+ reg->sa = d1 * (qs[5] & mask0) + md;
+ reg->sb = d2 * (qs[5] & mask1) + md;
+
+ reg->sc = d1 * (qs[6] & mask0) + md;
+ reg->sd = d2 * (qs[6] & mask1) + md;
+
+ reg->se = d1 * (qs[7] & mask0) + md;
+ reg->sf = d2 * (qs[7] & mask1) + md;
+}
+
+//------------------------------------------------------------------------------
+// add
+//------------------------------------------------------------------------------
+
+// general-purpose kernel for addition of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+// cons: not very efficient
+kernel void kernel_add(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global char * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ int ne13,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3
+) {
+ src0 = src0 + offset0;
+ src1 = src1 + offset1;
+ dst = dst + offsetd;
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ int i13 = i03 % ne13;
+ int i12 = i02 % ne12;
+ int i11 = i01 % ne11;
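+    // Broadcast src1 by wrapping its indices with % wherever its extent is
+    // smaller than that of src0.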
+
+ global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+ global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+ global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
+
+ for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+ const int i10 = i0 % ne10;
+ *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) + *((global float *)(src1_ptr + i10*nb10));
+ }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+ global float4 * src0,
+ ulong offset0,
+ global float4 * src1,
+ ulong offset1,
+ global float4 * dst,
+ ulong offsetd,
+ int ne
+) {
+ src0 = (global float4*)((global char*)src0 + offset0);
+ src1 = (global float4*)((global char*)src1 + offset1);
+ dst = (global float4*)((global char*)dst + offsetd);
+
+ // This performs better than using %.
+ uint gid = get_global_id(0);
+ uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+ dst[gid] = src0[gid] + src1[idx1];
+}
+
+//------------------------------------------------------------------------------
+// mul
+//------------------------------------------------------------------------------
+kernel void kernel_mul(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global char * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ int ne13,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3
+) {
+ src0 = src0 + offset0;
+ src1 = src1 + offset1;
+ dst = dst + offsetd;
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ int i13 = i03 % ne13;
+ int i12 = i02 % ne12;
+ int i11 = i01 % ne11;
+
+ global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+ global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+ global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
+
+ for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+ const int i10 = i0 % ne10;
+ *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) * *((global float *)(src1_ptr + i10*nb10));
+ }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_mul_row(
+ global float4 * src0,
+ ulong offset0,
+ global float4 * src1,
+ ulong offset1,
+ global float4 * dst,
+ ulong offsetd,
+ int ne
+) {
+ src0 = (global float4*)((global char*)src0 + offset0);
+ src1 = (global float4*)((global char*)src1 + offset1);
+ dst = (global float4*)((global char*)dst + offsetd);
+
+ // This performs better than using %.
+ uint gid = get_global_id(0);
+ uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+ dst[gid] = src0[gid] * src1[idx1];
+}
+
+//------------------------------------------------------------------------------
+// scale
+//------------------------------------------------------------------------------
+kernel void kernel_scale(
+ global float4 * src0,
+ ulong offset0,
+ global float4 * dst,
+ ulong offsetd,
+ float scale
+) {
+ src0 = (global float4*)((global char*)src0 + offset0);
+ dst = (global float4*)((global char*)dst + offsetd);
+ dst[get_global_id(0)] = src0[get_global_id(0)] * scale;
+}
+
+//------------------------------------------------------------------------------
+// gelu
+//------------------------------------------------------------------------------
+#define GELU_COEF_A 0.044715f
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
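+// tanh approximation of GELU: 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))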
+
+kernel void kernel_gelu(
+ global float * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ float x = src0[get_global_id(0)];
+
+ dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_4(
+ global float4 * src0,
+ ulong offset0,
+ global float4 * dst,
+ ulong offsetd
+) {
+ src0 = (global float4*)((global char*)src0 + offset0);
+ dst = (global float4*)((global char*)dst + offsetd);
+
+ float4 x = src0[get_global_id(0)];
+
+ dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+//------------------------------------------------------------------------------
+// silu
+//------------------------------------------------------------------------------
+kernel void kernel_silu(
+ global float * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ float x = src0[get_global_id(0)];
+ dst[get_global_id(0)] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_silu_4(
+ global float4 * src0,
+ ulong offset0,
+ global float4 * dst,
+ ulong offsetd
+) {
+ src0 = (global float4*)((global char*)src0 + offset0);
+ dst = (global float4*)((global char*)dst + offsetd);
+
+ float4 x = src0[get_global_id(0)];
+ dst[get_global_id(0)] = x / (1.0f + exp(-x));
+}
+
+//------------------------------------------------------------------------------
+// relu
+//------------------------------------------------------------------------------
+kernel void kernel_relu(
+ global float * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ dst[get_global_id(0)] = fmax(0.0f, src0[get_global_id(0)]);
+}
+
+//------------------------------------------------------------------------------
+// clamp
+//------------------------------------------------------------------------------
+kernel void kernel_clamp(
+ global float * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ float min,
+ float max
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ dst[get_global_id(0)] = src0[get_global_id(0)] < min ?
+ min :
+ (src0[get_global_id(0)] > max ? max : src0[get_global_id(0)]);
+}
+
+//------------------------------------------------------------------------------
+// norm
+//------------------------------------------------------------------------------
+kernel void kernel_norm(
+ global void * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ ulong nb01,
+ float eps,
+ local float * sum
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ dst = (global void*)((global char*)dst + offsetd);
+
+ global float * x = (global float *) ((global char *) src0 + get_group_id(0)*nb01);
+
+ // MEAN
+ // parallel sum
+ sum[get_local_id(0)] = 0.0f;
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ sum[get_local_id(0)] += x[i00];
+ }
+ // reduce
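+    // Tree reduction over local memory; assumes the work-group size is a power of two.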
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+ if (get_local_id(0) < i) {
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ float mean = sum[0] / ne00;
+
+ // recenter and VARIANCE
+ barrier(CLK_LOCAL_MEM_FENCE);
+ global float * y = dst + get_group_id(0)*ne00;
+ sum[get_local_id(0)] = 0.0f;
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ y[i00] = x[i00] - mean;
+ sum[get_local_id(0)] += y[i00] * y[i00];
+ }
+
+ // reduce
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+ if (get_local_id(0) < i) {
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ float variance = sum[0] / ne00;
+
+ float scale = 1.0f/sqrt(variance + eps);
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ y[i00] = y[i00] * scale;
+ }
+}
+
+//------------------------------------------------------------------------------
+// rms_norm
+//------------------------------------------------------------------------------
+// This kernel depends on subgroup size.
+kernel void kernel_rms_norm(
+ global void * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ ulong nb01,
+ float eps,
+    local float * sum // Note: the size depends on the number of subgroups.
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ global float4 * x = (global float4 *) ((global char *) src0 + get_group_id(0)*nb01);
+ global float * x_scalar = (global float *) x;
+ float4 sumf = 0;
+ float all_sum = 0;
+
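+    // Two-stage reduction: each subgroup reduces its partial sum of squares,
+    // lane 0 stores it in local memory, and the per-subgroup sums are then
+    // combined by the loop below.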
+ // parallel sum
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+ sumf += x[i00] * x[i00];
+ }
+ all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
+ all_sum = sub_group_reduce_add(all_sum);
+ if (get_sub_group_local_id() == 0) {
+ sum[get_sub_group_id()] = all_sum;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ // broadcast
+ for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
+ if (get_local_id(0) < i) {
+ sum[get_local_id(0)] += sum[get_local_id(0) + i];
+ }
+ }
+ if (get_local_id(0) == 0) {
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i] * x_scalar[i]; // square the tail elements, matching the vectorized sum above
+ }
+ sum[0] /= ne00;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ const float mean = sum[0];
+ const float scale = 1.0f/sqrt(mean + eps);
+
+ global float4 * y = (global float4 *) (dst + get_group_id(0)*ne00);
+ global float * y_scalar = (global float *) y;
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+ y[i00] = x[i00] * scale;
+ }
+ if (get_local_id(0) == 0) {
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+ y_scalar[i00] = x_scalar[i00] * scale;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// diag_mask_inf kernels
+//------------------------------------------------------------------------------
+kernel void kernel_diag_mask_inf(
+ global float * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int n_past
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i02 = get_global_id(2);
+ int i01 = get_global_id(1);
+ int i00 = get_global_id(0);
+
+ if (i00 > n_past + i01) {
+ dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
+ } else {
+ dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
+ }
+}
+
+kernel void kernel_diag_mask_inf_8(
+ global float4 * src0,
+ ulong offset0,
+ global float4 * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int n_past
+) {
+ src0 = (global float4*)((global char*)src0 + offset0);
+ dst = (global float4*)((global char*)dst + offsetd);
+
+ int i = 2*get_global_id(0);
+
+ dst[i+0] = src0[i+0];
+ dst[i+1] = src0[i+1];
+ int i4 = 4*i;
+ int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
+ int i01 = i4/(ne00); i4 -= i01*ne00;
+ int i00 = i4;
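+    // The two float4s cover 8 consecutive elements of one row; walk k from 3
+    // down to 0, overwriting with -INF every element whose column index exceeds
+    // n_past + i01, and stop at the first element that is in range.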
+ for (int k = 3; k >= 0; --k) {
+ if (i00 + 4 + k <= n_past + i01) {
+ break;
+ }
+ (&dst[i+1])[k] = -INFINITY;
+ if (i00 + k > n_past + i01) {
+ (&dst[i])[k] = -INFINITY;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// softmax
+//------------------------------------------------------------------------------
+kernel void kernel_soft_max(
+ global float * src0,
+ ulong offset0,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ float scale,
+ float max_bias,
+ float m0,
+ float m1,
+ int n_head_log2
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+ global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0;
+ global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
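+    // Note: only subgroup reductions are used below, so the kernel assumes the
+    // work-group consists of a single subgroup.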
+
+ float slope = 1.0f;
+
+ // ALiBi
+ if (max_bias > 0.0f) {
+ int h = i02;
+
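+        // ALiBi slope: m0^(h+1) for the first n_head_log2 heads and
+        // m1^(2*(h - n_head_log2) + 1) for the remaining heads.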
+ float base = h < n_head_log2 ? m0 : m1;
+ int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+ slope = pow(base, exp);
+ }
+
+ // parallel max
+ float lmax = -INFINITY;
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+ }
+ float max = sub_group_reduce_max(lmax);
+
+ // parallel sum
+ float lsum = 0.0f;
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
+ lsum += exp_psrc0;
+ // Remember the result of exp here. exp is expensive, so we really do not
+ // wish to compute it twice.
+ pdst[i00] = exp_psrc0;
+ }
+
+ const float sum = sub_group_reduce_add(lsum);
+
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ pdst[i00] /= sum;
+ }
+}
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_soft_max_4(
+ global float * src0,
+ ulong offset0,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ float scale,
+ float max_bias,
+ float m0,
+ float m1,
+ int n_head_log2
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+ global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0;
+ global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+ float slope = 1.0f;
+
+ // ALiBi
+ if (max_bias > 0.0f) {
+ int h = i02;
+
+ float base = h < n_head_log2 ? m0 : m1;
+ int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+ slope = pow(base, exp);
+ }
+
+ // parallel max
+ float4 lmax4 = -INFINITY;
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+ }
+ float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
+
+ const float max = sub_group_reduce_max(lmax);
+
+ // parallel sum
+ float4 lsum4 = 0.0f;
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
+ lsum4 += exp_psrc4;
+ pdst4[i00] = exp_psrc4;
+ }
+ float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
+
+ const float sum = sub_group_reduce_add(lsum);
+
+ for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+ pdst4[i00] /= sum;
+ }
+}
+
+//------------------------------------------------------------------------------
+// kernel_rope
+//------------------------------------------------------------------------------
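+// YaRN ramp: 1 for i0/2 <= low, 0 for i0/2 >= high, linear in between; used to
+// blend interpolated and extrapolated rotation angles.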
+float rope_yarn_ramp(float low, float high, int i0) {
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
+ return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+float2 rope_yarn(
+ float theta_extrap, float freq_scale, float2 corr_dims, int i0, float ext_factor, float mscale
+) {
+ // Get n-d rotational scaling corrected for extrapolation
+ float theta_interp = freq_scale * theta_extrap;
+ float theta = theta_interp;
+ if (ext_factor != 0.0f) {
+ float ramp_mix = rope_yarn_ramp(corr_dims.s0, corr_dims.s1, i0) * ext_factor;
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+ // Get n-d magnitude scaling corrected for interpolation
+ mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+ }
+ return (float2)(cos(theta) * mscale, sin(theta) * mscale);
+}
+
+// Solving `max_pos_emb = n_rot * 2pi * base^(2*x/n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
+ return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+float2 rope_yarn_corr_dims(
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow
+) {
+ // start and end correction dims
+ return (float2)(
+ max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))),
+ min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)))
+ );
+}
+
+kernel void kernel_rope_norm_f32(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * src2,
+ ulong offset2,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3,
+ int n_past,
+ int n_dims,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ src2 = (global float*)((global char*)src2 + offset2);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i3 = get_group_id(2);
+ int i2 = get_group_id(1);
+ int i1 = get_group_id(0);
+
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+ global int * pos = src1;
+
+ float theta_base = (float) pos[i2];
+ float inv_ndims = -1.f/n_dims;
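+    // Rotation angle for dimension pair i0: theta = pos * freq_base^(-i0/n_dims).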
+
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+ if (i0 < n_dims) {
+ int ic = i0/2;
+
+ float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+ float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ float x0 = src[0];
+ float x1 = src[1];
+
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+ dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+ } else {
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
+ }
+ }
+}
+
+kernel void kernel_rope_norm_f16(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * src2,
+ ulong offset2,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3,
+ int n_past,
+ int n_dims,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ src2 = (global float*)((global char*)src2 + offset2);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i3 = get_group_id(2);
+ int i2 = get_group_id(1);
+ int i1 = get_group_id(0);
+
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+ global int * pos = src1;
+
+ float theta_base = (float) pos[i2];
+ float inv_ndims = -1.f/n_dims;
+
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+ if (i0 < n_dims) {
+ int ic = i0/2;
+
+ float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+ float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ float x0 = src[0];
+ float x1 = src[1];
+
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+ dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+ } else {
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
+ }
+ }
+}
+
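+// Same as kernel_rope_norm_*, but rotates dimension pairs (ic, ic + n_dims/2)
+// instead of adjacent pairs (NeoX-style RoPE).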
+kernel void kernel_rope_neox_f32(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * src2,
+ ulong offset2,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3,
+ int n_past,
+ int n_dims,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ src2 = (global float*)((global char*)src2 + offset2);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i3 = get_group_id(2);
+ int i2 = get_group_id(1);
+ int i1 = get_group_id(0);
+
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+ global int * pos = src1;
+
+ float theta_base = (float) pos[i2];
+ float inv_ndims = -1.f/n_dims;
+
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+ if (i0 < n_dims) {
+ int ic = i0/2;
+
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+ global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+ const float x0 = src[0];
+ const float x1 = src[n_dims/2];
+
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+ dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+ } else {
+ global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
+ }
+ }
+}
+
+kernel void kernel_rope_neox_f16(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * src2,
+ ulong offset2,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3,
+ int n_past,
+ int n_dims,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ src2 = (global float*)((global char*)src2 + offset2);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i3 = get_group_id(2);
+ int i2 = get_group_id(1);
+ int i1 = get_group_id(0);
+
+ float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+ global int * pos = src1;
+
+ float theta_base = (float) pos[i2];
+ float inv_ndims = -1.f/n_dims;
+
+ for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+ if (i0 < n_dims) {
+ int ic = i0/2;
+
+ const float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+ const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+ float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+ global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+ const float x0 = src[0];
+ const float x1 = src[n_dims/2];
+
+ dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+ dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+ } else {
+ global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// cpy
+//------------------------------------------------------------------------------
+
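+// Each work-group copies one row of src0; the row's linear element index is
+// re-decomposed into dst coordinates so that src and dst can have different
+// shapes and strides.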
+kernel void kernel_cpy_f16_f16(
+ global half * src0,
+ ulong offset0,
+ global half * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3
+) {
+ src0 = (global half*)((global char*)src0 + offset0);
+ dst = (global half*)((global char*)dst + offsetd);
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+ int i3 = n / (ne2*ne1*ne0);
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+ global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ global const half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+ dst_data[i00] = src[0];
+ }
+}
+
+kernel void kernel_cpy_f16_f32(
+ global half * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3
+) {
+
+ src0 = (global half*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+ int i3 = n / (ne2*ne1*ne0);
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+ global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ global half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+ dst_data[i00] = src[0];
+ }
+}
+
+kernel void kernel_cpy_f32_f16(
+ global float * src0,
+ ulong offset0,
+ global half * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global half*)((global char*)dst + offsetd);
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+ int i3 = n / (ne2*ne1*ne0);
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+ global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+ dst_data[i00] = src[0];
+ }
+}
+
+kernel void kernel_cpy_f32_f32(
+ global float * src0,
+ ulong offset0,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne03,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3,
+ ulong nb0,
+ ulong nb1,
+ ulong nb2,
+ ulong nb3
+) {
+ src0 = (global float*)((global char*)src0 + offset0);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i03 = get_group_id(2);
+ int i02 = get_group_id(1);
+ int i01 = get_group_id(0);
+
+ int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+ int i3 = n / (ne2*ne1*ne0);
+ int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+ int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+ int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+ global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+ global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+ dst_data[i00] = src[0];
+ }
+}
+
+//------------------------------------------------------------------------------
+// get_rows
+//------------------------------------------------------------------------------
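+// src1 holds row indices; each work-group gathers the selected row r of src0
+// into the corresponding row of dst, dequantizing where needed.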
+kernel void kernel_get_rows_f32(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ ulong nb01,
+ ulong nb02,
+ int ne10,
+ ulong nb10,
+ ulong nb11,
+ ulong nb1,
+ ulong nb2
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i10 = get_group_id(0);
+ int i11 = get_group_id(1);
+
+ int r = ((global int *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
+
+ int i02 = i11;
+
+ for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
+ ((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
+ ((global float *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
+ }
+}
+
+kernel void kernel_get_rows_f16(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ ulong nb01,
+ ulong nb02,
+ int ne10,
+ ulong nb10,
+ ulong nb11,
+ ulong nb1,
+ ulong nb2
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int i10 = get_group_id(0);
+ int i11 = get_group_id(1);
+
+ int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
+
+ int i02 = i11;
+
+ for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
+ ((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
+ ((global half *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
+ }
+}
+
+kernel void kernel_get_rows_q4_0(
+ global void * src0,
+ ulong offset0,
+ global int * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ ulong nb01,
+ ulong nb02,
+ int ne10,
+ ulong nb10,
+ ulong nb11,
+ ulong nb1,
+ ulong nb2
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global int*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ const int NL = 2;
+
+ int i10 = get_group_id(0);
+ int i11 = get_group_id(1);
+
+ int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
+
+ int i02 = i11;
+
+ for (int ind = get_local_id(0); ind < ne00/16; ind += get_local_size(0)) {
+ float16 temp;
+ dequantize_q4_0_f32(
+ ((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02)) + ind/NL, ind%NL, &temp);
+ *(((global float16 *) ((global char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
+ }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f32_f32
+//------------------------------------------------------------------------------
+#define N_F32_F32 4
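+// Each work-group handles one row of src0 (r0) and up to N_F32_F32 rows of
+// src1; each dot product is accumulated per lane and reduced with
+// sub_group_reduce_add.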
+
+kernel void kernel_mul_mat_f32_f32(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global char*)((global char*)src0 + offset0);
+ src1 = (global char*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int r0 = get_group_id(0);
+ int rb = get_group_id(1)*N_F32_F32;
+ int im = get_group_id(2);
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+ global float * x = (global float *) (src0 + offset_src0);
+
+ if (ne00 < 128) {
+ for (int row = 0; row < N_F32_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global float * y = (global float *) (src1 + offset_src1);
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+ sumf += (float) x[i] * (float) y[i];
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ } else {
+ global float4 * x4 = (global float4 *)x;
+ for (int row = 0; row < N_F32_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global float * y = (global float *) (src1 + offset_src1);
+ global float4 * y4 = (global float4 *) y;
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+ sumf += (float) x4[i].s0 * y4[i].s0;
+ sumf += (float) x4[i].s1 * y4[i].s1;
+ sumf += (float) x4[i].s2 * y4[i].s2;
+ sumf += (float) x4[i].s3 * y4[i].s3;
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
+ all_sum += (float) x[i] * y[i];
+ }
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f16
+//------------------------------------------------------------------------------
+#define N_F16_F16 4
+
+kernel void kernel_mul_mat_f16_f16(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3)
+{
+ src0 = (global char*)((global char*)src0 + offset0);
+ src1 = (global char*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int r0 = get_group_id(0);
+ int rb = get_group_id(1)*N_F16_F16;
+ int im = get_group_id(2);
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+ global half * x = (global half *) (src0 + offset_src0);
+
+ if (ne00 < 128) {
+ for (int row = 0; row < N_F16_F16; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global half * y = (global half *) (src1 + offset_src1);
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+ sumf += (half) x[i] * (half) y[i];
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ } else {
+ global half4 * x4 = (global half4 *)x;
+ for (int row = 0; row < N_F16_F16; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global half * y = (global half *) (src1 + offset_src1);
+ global half4 * y4 = (global half4 *) y;
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+ sumf += (half) x4[i].s0 * y4[i].s0;
+ sumf += (half) x4[i].s1 * y4[i].s1;
+ sumf += (half) x4[i].s2 * y4[i].s2;
+ sumf += (half) x4[i].s3 * y4[i].s3;
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
+ all_sum += (half) x[i] * y[i];
+ }
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32_1row
+//------------------------------------------------------------------------------
+kernel void kernel_mul_mat_f16_f32_1row(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global char*)((global char*)src0 + offset0);
+ src1 = (global char*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global half * x = (global half *) (src0 + offset_src0);
+ global float * y = (global float *) (src1 + offset_src1);
+
+ float sumf = 0;
+ if (ne00 < 128) {
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+ sumf += (float) x[i] * (float) y[i];
+ }
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ } else {
+ global half4 * x4 = (global half4 *) x;
+ global float4 * y4 = (global float4 *) y;
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+ sumf += (float) x4[i].s0 * y4[i].s0;
+ sumf += (float) x4[i].s1 * y4[i].s1;
+ sumf += (float) x4[i].s2 * y4[i].s2;
+ sumf += (float) x4[i].s3 * y4[i].s3;
+ }
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
+ all_sum += (float) x[i] * y[i];
+ }
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32
+//------------------------------------------------------------------------------
+#define N_F16_F32 4
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_f16_f32(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global char*)((global char*)src0 + offset0);
+ src1 = (global char*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int r0 = get_group_id(0);
+ int rb = get_group_id(1)*N_F16_F32;
+ int im = get_group_id(2);
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+ global half * x = (global half *) (src0 + offset_src0);
+
+ if (ne00 < 128) {
+ for (int row = 0; row < N_F16_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global float * y = (global float *) (src1 + offset_src1);
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+ sumf += convert_float(x[i]) * y[i];
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ } else {
+ global half4 * x4 = (global half4 *)x;
+ for (int row = 0; row < N_F16_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global float * y = (global float *) (src1 + offset_src1);
+ global float4 * y4 = (global float4 *) y;
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+ sumf += convert_float(x4[i].s0) * y4[i].s0;
+ sumf += convert_float(x4[i].s1) * y4[i].s1;
+ sumf += convert_float(x4[i].s2) * y4[i].s2;
+ sumf += convert_float(x4[i].s3) * y4[i].s3;
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ for (int i = 4*(ne00/4); i < ne00; ++i) {
+ all_sum += (float) x[i] * y[i];
+ }
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32_l4
+//------------------------------------------------------------------------------
+// Assumes row size (ne00) is a multiple of 4
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_f16_f32_l4(
+ global char * src0,
+ ulong offset0,
+ global char * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ ulong nb00,
+ ulong nb01,
+ ulong nb02,
+ ulong nb03,
+ int ne10,
+ int ne11,
+ int ne12,
+ ulong nb10,
+ ulong nb11,
+ ulong nb12,
+ ulong nb13,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global char*)((global char*)src0 + offset0);
+ src1 = (global char*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ int nrows = ne11;
+ int r0 = get_group_id(0);
+ int im = get_group_id(2);
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+ global half4 * x4 = (global half4 *) (src0 + offset_src0);
+
+ for (int r1 = 0; r1 < nrows; ++r1) {
+ ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
+
+ global float4 * y4 = (global float4 *) (src1 + offset_src1);
+
+ float sumf = 0;
+ for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+ sumf += convert_float(x4[i].s0) * y4[i].s0;
+ sumf += convert_float(x4[i].s1) * y4[i].s1;
+ sumf += convert_float(x4[i].s2) * y4[i].s2;
+ sumf += convert_float(x4[i].s3) * y4[i].s3;
+ }
+
+ float all_sum = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32
+//------------------------------------------------------------------------------
+// Calculates the inner product between half a q4_0 block and 16 floats (yl); sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_4_0_dot_y(
+ global struct block_q4_0 * qb_curr,
+ float sumy,
+ private float * yl,
+ int il
+) {
+ float d = qb_curr->d;
+ float2 acc = 0.f;
+ global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
+ for (int i = 0; i < 8; i+=2) {
+ acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
+ + yl[i + 1] * (qs[i / 2] & 0x0F00);
+ acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ + yl[i + 9] * (qs[i / 2] & 0xF000);
+ }
+ return d * (sumy * -8.f + acc.s0 + acc.s1);
+}
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32(
+ global void * src0,
+ global float * src1,
+ global float * dst,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+
+ const ulong nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
+ // id of a SIMD group in the grid.
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+ global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float yl[16]; // src1 vector cache
+ float sumf[N_DST]={0.f};
+
+ int ix = get_sub_group_local_id()/2;
+ int il = 8*(get_sub_group_local_id()%2);
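+    // Each lane covers half a block: ix is the block index this lane starts
+    // from and il (0 or 8) selects which half of the block's quants it handles.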
+
+ global float * yb = y + ix * QK4_0 + il;
+
+ // each thread in a SIMD group deals with half a block.
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+ float sumy = 0;
+ for (int i = 0; i < 8; i += 2) {
+ sumy += yb[i] + yb[i+1];
+ yl[i+0] = yb[i+ 0];
+ yl[i+1] = yb[i+ 1]/256.f;
+ sumy += yb[i+16] + yb[i+17];
+ yl[i+8] = yb[i+16]/16.f;
+ yl[i+9] = yb[i+17]/4096.f;
+ }
+
+ for (int row = 0; row < N_DST; row++) {
+ sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
+ }
+
+ // One thread in a SIMD group (i.e., subgroup) handles a half block,
+    // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
+ // y points to the activation matrix (of type float). Therefore for
+ // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
+ // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
+ // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
+ yb += QK4_0 * (N_SIMDWIDTH/2);
+ }
+
+    // Calling sub_group_reduce_add inside the row loop does not work on Adreno
+    // - it produces incorrect results for row = 1, 2, 3 and only row = 0 gives
+    // the correct result - so the reductions are written out explicitly below.
+    // If N_DST is changed, the array must be initialized accordingly.
+    // This also seems to perform better on Intel.
+ float tot[N_DST] = {
+ sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
+ sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
+ for (int row = 0; row < N_DST; ++row) {
+ if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
+ }
+ }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32(
+ global void * src0,
+ ulong offset0,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//
+// This variant unrolls the loops and uses vector types instead of pointers.
+// It improves performance on Adreno but not so much on Intel.
+//
+inline float block_q_4_0_dot_y_v(
+ global struct block_q4_0 * qb_curr,
+ float sumy,
+ float16 yl,
+ int il
+) {
+ float d = qb_curr->d;
+ float acc = 0.f;
+ global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
+
+ acc += yl.s0 * (qs[0] & 0x000F);
+ acc += yl.s1 * (qs[0] & 0x0F00);
+ acc += yl.s8 * (qs[0] & 0x00F0);
+ acc += yl.s9 * (qs[0] & 0xF000);
+
+ acc += yl.s2 * (qs[1] & 0x000F);
+ acc += yl.s3 * (qs[1] & 0x0F00);
+ acc += yl.sa * (qs[1] & 0x00F0);
+ acc += yl.sb * (qs[1] & 0xF000);
+
+ acc += yl.s4 * (qs[2] & 0x000F);
+ acc += yl.s5 * (qs[2] & 0x0F00);
+ acc += yl.sc * (qs[2] & 0x00F0);
+ acc += yl.sd * (qs[2] & 0xF000);
+
+ acc += yl.s6 * (qs[3] & 0x000F);
+ acc += yl.s7 * (qs[3] & 0x0F00);
+ acc += yl.se * (qs[3] & 0x00F0);
+ acc += yl.sf * (qs[3] & 0xF000);
+
+ return d * (sumy * -8.f + acc);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_v(
+ global void * src0,
+ global float * src1,
+ global float * dst,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ const ulong nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
+ // id of a SIMD group in the grid.
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+ global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float16 yl; // src1 vector cache
+ float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
+
+ int ix = get_sub_group_local_id()/2;
+ int il = 8*(get_sub_group_local_id()%2);
+
+ global float * yb = y + ix * QK4_0 + il;
+
+ // each thread in a SIMD group deals with half a block.
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+ float sumy = 0;
+
+ sumy += yb[0];
+ sumy += yb[1];
+ sumy += yb[2];
+ sumy += yb[3];
+ sumy += yb[4];
+ sumy += yb[5];
+ sumy += yb[6];
+ sumy += yb[7];
+
+ sumy += yb[16];
+ sumy += yb[17];
+ sumy += yb[18];
+ sumy += yb[19];
+ sumy += yb[20];
+ sumy += yb[21];
+ sumy += yb[22];
+ sumy += yb[23];
+
+
+ yl.s0 = yb[0];
+ yl.s1 = yb[1]/256.f;
+
+ yl.s2 = yb[2];
+ yl.s3 = yb[3]/256.f;
+
+ yl.s4 = yb[4];
+ yl.s5 = yb[5]/256.f;
+
+ yl.s6 = yb[6];
+ yl.s7 = yb[7]/256.f;
+
+ yl.s8 = yb[16]/16.f;
+ yl.s9 = yb[17]/4096.f;
+
+ yl.sa = yb[18]/16.f;
+ yl.sb = yb[19]/4096.f;
+
+ yl.sc = yb[20]/16.f;
+ yl.sd = yb[21]/4096.f;
+
+ yl.se = yb[22]/16.f;
+ yl.sf = yb[23]/4096.f;
+
+ sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
+ sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
+ sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
+ sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);
+
+ // One thread in a SIMD group (i.e., subgroup) handles a half block,
+    // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
+ // y points to the activation matrix (of type float). Therefore for
+ // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
+ // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
+ // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
+ yb += QK4_0 * (N_SIMDWIDTH/2);
+ }
+
+    // Calling sub_group_reduce_add inside a loop over rows does not work on
+    // Adreno - it produces incorrect results for row = 1, 2, 3 and only row = 0
+    // gives the correct result - so the reductions are written out explicitly.
+    // If N_DST is changed, the vector below must be initialized accordingly.
+    // This also seems to perform better on Intel.
+ float4 tot = (float4)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+ }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_v(
+ global void * src0,
+ ulong offset0,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//------------------------------------------------------------------------------
+// kernel_convert_block_q4_0
+// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+//------------------------------------------------------------------------------
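+// Each work-item converts one block: its QK4_0/2 quant bytes go to dst_q and
+// its scale to dst_d.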
+kernel void kernel_convert_block_q4_0(
+ global struct block_q4_0 * src0,
+ global uchar * dst_q,
+ global half * dst_d
+) {
+ global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
+ global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
+ global half * d = (global half *) dst_d + get_global_id(0);
+
+ *d = b->d;
+
+ for (int i = 0; i < QK4_0/2; ++i) {
+ q[i] = b->qs[i];
+ }
+}
+
+kernel void kernel_restore_block_q4_0(
+ global uchar * src_q,
+ global half * src_d,
+ global struct block_q4_0 * dst
+) {
+ global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
+ global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
+ global half * d = (global half *) src_d + get_global_id(0);
+
+ b->d = *d;
+ for (int i = 0; i < QK4_0/2; ++i) {
+ b->qs[i] = q[i];
+ }
+}
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32_flat
+//
+// This variation uses flat arrays (struct of arrays, SOA) representation for
+// quant tensors.
+//------------------------------------------------------------------------------
+
+// This function requires the original shuffled weights.
+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
+// packed together in a byte, so are (q[1], q[17]) and so on.
+inline float block_q_4_0_dot_y_flat(
+ global uchar * x,
+ global half * dh,
+ float sumy,
+ float16 yl,
+ int il
+) {
+ float d = *dh;
+ global ushort * qs = ((global ushort *)x + il/2);
+ float acc = 0.f;
+
+ acc += yl.s0 * (qs[0] & 0x000F);
+ acc += yl.s1 * (qs[0] & 0x0F00);
+ acc += yl.s8 * (qs[0] & 0x00F0);
+ acc += yl.s9 * (qs[0] & 0xF000);
+
+ acc += yl.s2 * (qs[1] & 0x000F);
+ acc += yl.s3 * (qs[1] & 0x0F00);
+ acc += yl.sa * (qs[1] & 0x00F0);
+ acc += yl.sb * (qs[1] & 0xF000);
+
+ acc += yl.s4 * (qs[2] & 0x000F);
+ acc += yl.s5 * (qs[2] & 0x0F00);
+ acc += yl.sc * (qs[2] & 0x00F0);
+ acc += yl.sd * (qs[2] & 0xF000);
+
+ acc += yl.s6 * (qs[3] & 0x000F);
+ acc += yl.s7 * (qs[3] & 0x0F00);
+ acc += yl.se * (qs[3] & 0x00F0);
+ acc += yl.sf * (qs[3] & 0xF000);
+
+ return d * (sumy * -8.f + acc);
+}
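+// Worked example of the scaling trick above: qs[0] packs four 4-bit quants at
+// bit offsets 0, 4, 8 and 12, i.e. the shuffled pairs (il+0, il+16) and
+// (il+1, il+17). (qs[0] & 0x0F00) equals the quant at il+1 times 256, and
+// yl.s1 was loaded as y/256, so the product recovers q*y without an explicit
+// shift; the 0x00F0 and 0xF000 terms likewise pair with the /16 and /4096
+// pre-scaled y values. The trailing "sumy * -8.f" folds the common -8 offset
+// of q4_0 into the dot product: d * sum((q - 8) * y) = d * (acc - 8 * sumy).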
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ global float * dst,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ const ulong nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+ // Currently with llama2 7B, im is always 0.
+ // TODO: how to handle im/gqa*(nb*ne0)?
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ // The number of scales is the same as the number of blocks.
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
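+ // For instance, assuming ne00 = 4096 (so nb = 128 blocks per row),
+ // first_row = 8 and i12 = i13 = 0: offset0_d = 8*128 = 1024 scales and
+ // offset0_q = 1024 * QK4_0/2 = 16384 quant bytes.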
+
+ global uchar * x = (global uchar *) src0_q + offset0_q;
+ global half * d = (global half *) src0_d + offset0_d;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float16 yl;
+ float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
+
+ int ix = get_sub_group_local_id()/2;
+ int il = 8*(get_sub_group_local_id()%2);
+
+ global float * yb = y + ix*QK4_0 + il;
+
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+ float sumy = 0.f;
+
+ sumy += yb[0];
+ sumy += yb[1];
+ sumy += yb[2];
+ sumy += yb[3];
+ sumy += yb[4];
+ sumy += yb[5];
+ sumy += yb[6];
+ sumy += yb[7];
+
+ sumy += yb[16];
+ sumy += yb[17];
+ sumy += yb[18];
+ sumy += yb[19];
+ sumy += yb[20];
+ sumy += yb[21];
+ sumy += yb[22];
+ sumy += yb[23];
+
+ yl.s0 = yb[0];
+ yl.s1 = yb[1]/256.f;
+
+ yl.s2 = yb[2];
+ yl.s3 = yb[3]/256.f;
+
+ yl.s4 = yb[4];
+ yl.s5 = yb[5]/256.f;
+
+ yl.s6 = yb[6];
+ yl.s7 = yb[7]/256.f;
+
+ yl.s8 = yb[16]/16.f;
+ yl.s9 = yb[17]/4096.f;
+
+ yl.sa = yb[18]/16.f;
+ yl.sb = yb[19]/4096.f;
+
+ yl.sc = yb[20]/16.f;
+ yl.sd = yb[21]/4096.f;
+
+ yl.se = yb[22]/16.f;
+ yl.sf = yb[23]/4096.f;
+
+ sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+ sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+ sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+ sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+ yb += QK4_0 * (N_SIMDWIDTH/2);
+ }
+
+ float4 tot = (float4)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+ }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ mul_vec_q_n_f32_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//
+// This variant outputs 8 values.
+//
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
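+// With these values each SIMD group - and, since N_SIMDGROUP = 1, each
+// work-group - produces N_DST = 8 consecutive output rows, so the host side
+// presumably launches about ceil(ne01/8) work-groups along dimension 0; e.g.
+// for ne01 = 4096 rows that is 512 work-groups of 16 (Intel) or 64 (Adreno)
+// work-items each.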
+
+inline void mul_vec_q_n_f32_8x_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ global float * dst,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ const ulong nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+ // Currently with llama2 7B, im is always 0.
+ // TODO: how to handle im/gqa*(nb*ne0)?
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ // The number of scales is the same as the number of blocks.
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+ global uchar * x = (global uchar *) src0_q + offset0_q;
+ global half * d = (global half *) src0_d + offset0_d;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float16 yl;
+ float8 sumf = 0.f;
+
+ int ix = get_sub_group_local_id()/2;
+ int il = 8*(get_sub_group_local_id()%2);
+
+ global float * yb = y + ix*QK4_0 + il;
+
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+ float sumy = 0.f;
+
+ sumy += yb[0];
+ sumy += yb[1];
+ sumy += yb[2];
+ sumy += yb[3];
+ sumy += yb[4];
+ sumy += yb[5];
+ sumy += yb[6];
+ sumy += yb[7];
+
+ sumy += yb[16];
+ sumy += yb[17];
+ sumy += yb[18];
+ sumy += yb[19];
+ sumy += yb[20];
+ sumy += yb[21];
+ sumy += yb[22];
+ sumy += yb[23];
+
+ yl.s0 = yb[0];
+ yl.s1 = yb[1]/256.f;
+
+ yl.s2 = yb[2];
+ yl.s3 = yb[3]/256.f;
+
+ yl.s4 = yb[4];
+ yl.s5 = yb[5]/256.f;
+
+ yl.s6 = yb[6];
+ yl.s7 = yb[7]/256.f;
+
+ yl.s8 = yb[16]/16.f;
+ yl.s9 = yb[17]/4096.f;
+
+ yl.sa = yb[18]/16.f;
+ yl.sb = yb[19]/4096.f;
+
+ yl.sc = yb[20]/16.f;
+ yl.sd = yb[21]/4096.f;
+
+ yl.se = yb[22]/16.f;
+ yl.sf = yb[23]/4096.f;
+
+ sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+ sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+ sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+ sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+ sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+ sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+ sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+ sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+ yb += QK4_0 * (N_SIMDWIDTH/2);
+ }
+
+ float8 tot = (float8)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+ sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+ sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+
+ if (first_row + 4 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+ }
+ if (first_row + 5 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+ }
+ if (first_row + 6 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+ }
+ if (first_row + 7 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+ }
+ }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_8x_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ mul_vec_q_n_f32_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
new file mode 100644
index 000000000..e2024332f
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
@@ -0,0 +1,106 @@
+//------------------------------------------------------------------------------
+// This file contains additional kernels for data conversion.
+// These kernels are used when loading the model, so their performance is less
+// important.
+//------------------------------------------------------------------------------
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supported by the OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use subgroup size of 32 on Intel.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use subgroup size of 64 on Adreno.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+#define QR4_1 2
+#define QK5_0 32
+#define QR5_0 2
+#define QK5_1 32
+#define QR5_1 2
+#define QK8_0 32
+#define QR8_0 1
+#define QK_K 256
+#define K_QUANTS_PER_ITERATION 2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+ half d;
+ uint8_t qs[QK4_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32_flat_noshuffle
+//
+// This variation uses a flat-array (struct of arrays, SOA) representation for
+// quant tensors. It also uses a non-shuffled bit order for the weights.
+//
+// The shuffled version is kept in the original file because moving it here
+// seems to result in worse performance for Adreno.
+//------------------------------------------------------------------------------
+
+kernel void kernel_convert_block_q4_0_noshuffle(
+ global struct block_q4_0 * src0,
+ global uchar * dst_q,
+ global half * dst_d
+) {
+ global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
+ global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
+ global half * d = (global half *) dst_d + get_global_id(0);
+
+ *d = b->d;
+ for (int i = 0; i < QK4_0/4; ++i) {
+ uchar x0 = b->qs[2*i + 0];
+ uchar x1 = b->qs[2*i + 1];
+
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
+ q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
+
+#ifdef ADRENO_GPU
+ // Workaround for Adreno - the following printf statement is required for
+ // the kernel to work properly; otherwise it produces incorrect results.
+ // The convert_uchar calls above also seem necessary.
+ // The id is compared against a huge value so that nothing is ever printed.
+ // get_sub_group_local_id() also works.
+ if (get_global_id(0) == 65536*4096) {
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
+ }
+#endif
+ }
+}
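+// Worked example of the deshuffle above, for i = 0: x0 = qs[0] packs weights
+// (w0 in its low nibble, w16 in its high nibble) and x1 = qs[1] packs
+// (w1, w17). The output byte q[0] therefore becomes w0 | (w1 << 4) and
+// q[0 + QK4_0/4] = q[8] becomes w16 | (w17 << 4); over all i the 32 nibbles
+// end up in plain sequential order instead of the (j, j+16) interleaving.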
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
new file mode 100644
index 000000000..5e195411d
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
@@ -0,0 +1,265 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+// assume
+#define QK4_0 32
+#define N_SIMDGROUP 4
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
+ float shared_y; \
+ shared_y = sub_group_broadcast(y.s0, 0); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 0); \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 0); \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 0); \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 0); \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 0); \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 0); \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 0); \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s0, 1); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 1); \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 1); \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 1); \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 1); \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 1); \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 1); \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 1); \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
+ shared_y = sub_group_broadcast(y.s0, 2); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 2); \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 2); \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 2); \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 2); \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 2); \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 2); \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 2); \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s0, 3); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 3); \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 3); \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 3); \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 3); \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 3); \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 3); \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 3); \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
+ float8 shared_y; \
+ shared_y = sub_group_broadcast(y, 0); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+ shared_y = sub_group_broadcast(y, 1); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
+ shared_y = sub_group_broadcast(y, 2); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+ shared_y = sub_group_broadcast(y, 3); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+__kernel void kernel_gemv_noshuffle(
+ __read_only image1d_buffer_t src0_q, // quantized A
+ global half2 * src0_d, // A scales
+ __read_only image1d_buffer_t src1, // B
+ ulong offset1, // offset to B (0)
+ global float * dst, // C
+ ulong offsetd, // offset to C (0)
+ uint K, // K
+ int ne01, // M
+ int ne02, // 1
+ int ne10, // K
+ int ne12, // 1
+ int ne0, // M
+ int ne1, // N
+ int r2, // 1
+ int r3)
+{
+ uint groupId = get_local_id(1);
+ uint gid = get_global_id(0);
+ ushort slid = get_sub_group_local_id();
+
+ __private uint4 regA;
+ __private half2 regS;
+ __private float8 regB;
+
+ __private float2 totalSum = (float2)(0.0f);
+
+ // loop along K in block granularity, skip 4 blocks every iter
+ for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
+ regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
+ // the first 4 fibers in each wave load 8 B values into their private scope
+ if (slid < 4) {
+ regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+ regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+ }
+
+ // load half weights for two blocks in consecutive rows
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+ dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
+#else
+ dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+ dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
+#else
+ dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+ }
+
+ // reduction in local memory, assumes #wave=4
+ __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
+ if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+ if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+ if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
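+ // In other words (assuming SIMDGROUP_WIDTH, provided by the host build
+ // options, matches the wave width): waves 1-3 each stash their float2 partial
+ // sums in one slice of reduceLM, and after the barrier wave 0 adds all three
+ // slices to its own partial sums, so only wave 0 holds the complete per-row
+ // totals and writes the output below.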
+
+ // 2 outputs per fiber in wave 0
+ if (groupId == 0) {
+ dst = (global float*)((global char*)dst + offsetd);
+ vstore2(totalSum, 0, &(dst[gid * 2]));
+ }
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
new file mode 100644
index 000000000..5bdd4d067
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
@@ -0,0 +1,271 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+// assume
+#define QK4_0 32
+#define N_SIMDGROUP 4
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
+ float shared_y; \
+ shared_y = sub_group_broadcast(y.s0, 0); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 0); \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 0); \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 0); \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 0); \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 0); \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 0); \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 0); \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s0, 1); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 1); \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 1); \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 1); \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 1); \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 1); \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 1); \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 1); \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
+ shared_y = sub_group_broadcast(y.s0, 2); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 2); \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 2); \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 2); \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 2); \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 2); \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 2); \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 2); \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s0, 3); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s1, 3); \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s2, 3); \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s3, 3); \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s4, 3); \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s5, 3); \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s6, 3); \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+ shared_y = sub_group_broadcast(y.s7, 3); \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
+ float8 shared_y; \
+ shared_y = sub_group_broadcast(y, 0); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+ shared_y = sub_group_broadcast(y, 1); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
+ shared_y = sub_group_broadcast(y, 2); \
+ total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+ shared_y = sub_group_broadcast(y, 3); \
+ total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+ total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+ total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+ total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+ total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+ total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+ total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+ total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+ total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+ total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+ total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+ total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+ total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+ total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+ total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+ total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+__kernel void kernel_gemv_noshuffle(
+ __read_only image1d_buffer_t src0_q, // quantized A
+ global half2 * src0_d, // A scales
+ __read_only image1d_buffer_t src1, // B
+ ulong offset1, // offset to B (0)
+ global float * dst, // C
+ ulong offsetd, // offset to C (0)
+ int ne00, // K
+ int ne01, // M
+ int ne02, // 1
+ int ne10, // K
+ int ne12, // 1
+ int ne0, // M
+ int ne1, // N
+ int r2, // 1
+ int r3)
+{
+ uint groupId = get_local_id(1);
+ uint gid = get_global_id(0);
+ ushort slid = get_sub_group_local_id();
+
+ uint K = ne00;
+ uint M = ne01;
+
+ uint LINE_STRIDE_A = M / 2;
+ uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
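+ // For example, taking M = ne01 = 4096 as an illustrative size: LINE_STRIDE_A
+ // = 2048, i.e. one half2 covers the scales of two adjacent rows, and
+ // BLOCK_STRIDE_A = 4*4096 = 16384 is the stride applied per iteration of the
+ // k loop below.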
+
+ __private uint4 regA;
+ __private half2 regS;
+ __private float8 regB;
+
+ __private float2 totalSum = (float2)(0.0f);
+
+ // loop along K in block granularity, skip 4 blocks every iter
+ for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
+ regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
+ // the first 4 fibers in each wave load 8 B values into their private scope
+ if (slid < 4) {
+ regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+ regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+ }
+
+ // load half weights for two blocks in consecutive rows
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+ dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
+#else
+ dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+
+ regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+ regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+ regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+ regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+ dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
+#else
+ dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+ }
+
+ // reduction in local memory, assumes #wave=4
+ __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
+ if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+ if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+ if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+ if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+ // 2 outputs per fiber in wave 0
+ if (groupId == 0) {
+ dst = (global float*)((global char*)dst + offsetd);
+ vstore2(totalSum, 0, &(dst[gid * 2]));
+ }
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
new file mode 100644
index 000000000..e19e9a2f4
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
@@ -0,0 +1,1225 @@
+//------------------------------------------------------------------------------
+// This file contains additional mul_mat kernels
+// (and potentially other kernels).
+//------------------------------------------------------------------------------
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supported by the OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use subgroup size of 32 on Intel.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use subgroup size of 64 on Adreno.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+#define QR4_1 2
+#define QK5_0 32
+#define QR5_0 2
+#define QK5_1 32
+#define QR5_1 2
+#define QK8_0 32
+#define QR8_0 1
+#define QK_K 256
+#define K_QUANTS_PER_ITERATION 2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+ half d;
+ uint8_t qs[QK4_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits
+ half d; // super-block scale
+} block_q6_K;
+
+//------------------------------------------------------------------------------
+// These are the variants for mat-mat multiplication, based on the mat-vec
+// multiplication kernel with flattened block_q4_0.
+//------------------------------------------------------------------------------
+
+// Common dot prod.
+inline float mm_block_q_4_0_dot_y_flat(
+ global uchar * x,
+ global half * dh,
+ float sumy,
+ float16 yl,
+ int il
+) {
+ float d = *dh;
+ global ushort * qs = ((global ushort *)x + il/2);
+ float acc = 0.f;
+
+ acc += yl.s0 * (qs[0] & 0x000F);
+ acc += yl.s1 * (qs[0] & 0x0F00);
+ acc += yl.s8 * (qs[0] & 0x00F0);
+ acc += yl.s9 * (qs[0] & 0xF000);
+
+ acc += yl.s2 * (qs[1] & 0x000F);
+ acc += yl.s3 * (qs[1] & 0x0F00);
+ acc += yl.sa * (qs[1] & 0x00F0);
+ acc += yl.sb * (qs[1] & 0xF000);
+
+ acc += yl.s4 * (qs[2] & 0x000F);
+ acc += yl.s5 * (qs[2] & 0x0F00);
+ acc += yl.sc * (qs[2] & 0x00F0);
+ acc += yl.sd * (qs[2] & 0xF000);
+
+ acc += yl.s6 * (qs[3] & 0x000F);
+ acc += yl.s7 * (qs[3] & 0x0F00);
+ acc += yl.se * (qs[3] & 0x00F0);
+ acc += yl.sf * (qs[3] & 0xF000);
+
+ return d * (sumy * -8.f + acc);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+//
+// This variant performs 1d blocking with 8x output.
+// Each simdgroup outputs 8 values on the `n0` dim (rows of the output matrix).
+//
+inline void mul_mat_q_n_f32_1d_8x_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ global float * dst,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ const int nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+ // Currently with llama2 7B, im is always 0.
+ // TODO: how to handle im/gqa*(nb*ne0)?
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ // The number of scales is the same as the number of blocks.
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+ global uchar * x = (global uchar *) src0_q + offset0_q;
+ global half * d = (global half *) src0_d + offset0_d;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float16 yl;
+ float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
+
+ int ix = get_sub_group_local_id()/2;
+ int il = 8*(get_sub_group_local_id()%2);
+
+ global float * yb = y + ix*QK4_0 + il;
+
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+ float sumy = 0.f;
+
+ sumy += yb[0];
+ sumy += yb[1];
+ sumy += yb[2];
+ sumy += yb[3];
+ sumy += yb[4];
+ sumy += yb[5];
+ sumy += yb[6];
+ sumy += yb[7];
+
+ sumy += yb[16];
+ sumy += yb[17];
+ sumy += yb[18];
+ sumy += yb[19];
+ sumy += yb[20];
+ sumy += yb[21];
+ sumy += yb[22];
+ sumy += yb[23];
+
+ yl.s0 = yb[0];
+ yl.s1 = yb[1]/256.f;
+
+ yl.s2 = yb[2];
+ yl.s3 = yb[3]/256.f;
+
+ yl.s4 = yb[4];
+ yl.s5 = yb[5]/256.f;
+
+ yl.s6 = yb[6];
+ yl.s7 = yb[7]/256.f;
+
+ yl.s8 = yb[16]/16.f;
+ yl.s9 = yb[17]/4096.f;
+
+ yl.sa = yb[18]/16.f;
+ yl.sb = yb[19]/4096.f;
+
+ yl.sc = yb[20]/16.f;
+ yl.sd = yb[21]/4096.f;
+
+ yl.se = yb[22]/16.f;
+ yl.sf = yb[23]/4096.f;
+
+ sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+ sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+ sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+ sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+ sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+ sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+ sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+ sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+ yb += QK4_0 * (N_SIMDWIDTH/2);
+ }
+
+ float8 tot = (float8)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+ sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+ sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+
+ if (first_row + 4 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+ }
+ if (first_row + 5 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+ }
+ if (first_row + 6 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+ }
+ if (first_row + 7 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+ }
+ }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 16 // each SIMD group works on 16 rows (in weights matrix)
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 16
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+//
+// This variant performs 1d blocking with 16x output.
+// Each simdgroup outputs 16 values on the `n0` dim (rows of the output matrix).
+//
+inline void mul_mat_q_n_f32_1d_16x_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ global float * dst,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ const int nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+ // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+ // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+ // Currently with llama2 7B, im is always 0.
+ // TODO: how to handle im/gqa*(nb*ne0)?
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ // The number of scales is the same as the number of blocks.
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+ global uchar * x = (global uchar *) src0_q + offset0_q;
+ global half * d = (global half *) src0_d + offset0_d;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float16 yl;
+ float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
+
+ int ix = get_sub_group_local_id()/2;
+ int il = 8*(get_sub_group_local_id()%2);
+
+ global float * yb = y + ix*QK4_0 + il;
+
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+ float sumy = 0.f;
+
+ sumy += yb[0];
+ sumy += yb[1];
+ sumy += yb[2];
+ sumy += yb[3];
+ sumy += yb[4];
+ sumy += yb[5];
+ sumy += yb[6];
+ sumy += yb[7];
+
+ sumy += yb[16];
+ sumy += yb[17];
+ sumy += yb[18];
+ sumy += yb[19];
+ sumy += yb[20];
+ sumy += yb[21];
+ sumy += yb[22];
+ sumy += yb[23];
+
+ yl.s0 = yb[0];
+ yl.s1 = yb[1]/256.f;
+
+ yl.s2 = yb[2];
+ yl.s3 = yb[3]/256.f;
+
+ yl.s4 = yb[4];
+ yl.s5 = yb[5]/256.f;
+
+ yl.s6 = yb[6];
+ yl.s7 = yb[7]/256.f;
+
+ yl.s8 = yb[16]/16.f;
+ yl.s9 = yb[17]/4096.f;
+
+ yl.sa = yb[18]/16.f;
+ yl.sb = yb[19]/4096.f;
+
+ yl.sc = yb[20]/16.f;
+ yl.sd = yb[21]/4096.f;
+
+ yl.se = yb[22]/16.f;
+ yl.sf = yb[23]/4096.f;
+
+ sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+ sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+ sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+ sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+ sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+ sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+ sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+ sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+ sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 8*nb*QK4_0/2, d + ib + 8*nb, sumy, yl, il);
+ sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 9*nb*QK4_0/2, d + ib + 9*nb, sumy, yl, il);
+ sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
+ sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
+
+ sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
+ sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
+ sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
+ sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
+
+ yb += QK4_0 * (N_SIMDWIDTH/2);
+ }
+
+ float16 tot = (float16)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+ sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+ sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
+
+ sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
+ sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
+ sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
+ sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+
+ if (first_row + 4 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+ }
+ if (first_row + 5 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+ }
+ if (first_row + 6 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+ }
+ if (first_row + 7 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+ }
+
+ if (first_row + 8 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
+ }
+ if (first_row + 9 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
+ }
+ if (first_row + 10 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
+ }
+ if (first_row + 11 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
+ }
+
+ if (first_row + 12 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
+ }
+ if (first_row + 13 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
+ }
+ if (first_row + 14 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
+ }
+ if (first_row + 15 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
+ }
+ }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//------------------------------------------------------------------------------
+// kernel_mul_mat_q4_0_f32_flat_v0
+//------------------------------------------------------------------------------
+inline float block_q_4_0_dot_y_flat_v2(
+ half x,
+ half d,
+ float sumy,
+ float4 yl
+) {
+ uchar2 q = as_uchar2(x);
+ float acc = 0.0f;
+
+ acc += (q.s0 & 0x0F) * yl.s0;
+ acc += (q.s1 & 0x0F) * yl.s1;
+
+ acc += (q.s0 & 0xF0) * yl.s2;
+ acc += (q.s1 & 0xF0) * yl.s3;
+
+ return d * (sumy * -8.f + acc);
+}
+
+inline float block_q_4_0_dot_y_flat_v4(
+ float x,
+ half d,
+ float sumy,
+ float8 yl
+) {
+ uchar4 q = as_uchar4(x);
+ float acc = 0.0f;
+
+ acc += (q.s0 & 0x0F) * yl.s0;
+ acc += (q.s1 & 0x0F) * yl.s1;
+ acc += (q.s2 & 0x0F) * yl.s2;
+ acc += (q.s3 & 0x0F) * yl.s3;
+
+ acc += (q.s0 & 0xF0) * yl.s4;
+ acc += (q.s1 & 0xF0) * yl.s5;
+ acc += (q.s2 & 0xF0) * yl.s6;
+ acc += (q.s3 & 0xF0) * yl.s7;
+
+ return d * (sumy * -8.f + acc);
+}
+
+inline float block_q_4_0_dot_y_flat_v8(
+ float2 x,
+ half d,
+ float sumy,
+ float16 yl
+) {
+ uchar8 q = as_uchar8(x);
+ float acc = 0.0f;
+
+ acc += (q.s0 & 0x0F) * yl.s0;
+ acc += (q.s1 & 0x0F) * yl.s1;
+ acc += (q.s2 & 0x0F) * yl.s2;
+ acc += (q.s3 & 0x0F) * yl.s3;
+ acc += (q.s4 & 0x0F) * yl.s4;
+ acc += (q.s5 & 0x0F) * yl.s5;
+ acc += (q.s6 & 0x0F) * yl.s6;
+ acc += (q.s7 & 0x0F) * yl.s7;
+
+ acc += (q.s0 & 0xF0) * yl.s8;
+ acc += (q.s1 & 0xF0) * yl.s9;
+ acc += (q.s2 & 0xF0) * yl.sa;
+ acc += (q.s3 & 0xF0) * yl.sb;
+ acc += (q.s4 & 0xF0) * yl.sc;
+ acc += (q.s5 & 0xF0) * yl.sd;
+ acc += (q.s6 & 0xF0) * yl.se;
+ acc += (q.s7 & 0xF0) * yl.sf;
+
+ return d * (sumy * -8.f + acc);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define THREADS_PER_BLK 4 // Number of threads per block, i.e. each thread processes 1/THREADS_PER_BLK of a block
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 16
+#elif defined (ADRENO_GPU)
+#define THREADS_PER_BLK 4
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
+# define ACT_TY float16
+# define Q_BLK_LD_TY float2
+# define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v8
+#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
+# define ACT_TY float8
+# define Q_BLK_LD_TY float
+# define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v4
+#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
+# define ACT_TY float4
+# define Q_BLK_LD_TY half
+# define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v2
+#endif
+
+#define BTYES_PER_THREAD_IN_BLK (QK4_0/2/THREADS_PER_BLK)
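+// For example, with QK4_0 = 32 and THREADS_PER_BLK = 4 each thread covers
+// 32/2/4 = 4 bytes of quantized data per block, i.e. one Q_BLK_LD_TY = float
+// load holding 8 packed 4-bit weights.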
+
+#if N_DST == 2
+# define SUM_TY float2
+#elif N_DST == 4
+# define SUM_TY float4
+#elif N_DST == 8
+# define SUM_TY float8
+#elif N_DST == 16
+# define SUM_TY float16
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat_v0(
+ global uchar * src0_q,
+ global half * src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ const int nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ // The number of scales is the same as the number of blocks.
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+ global uchar * x = (global uchar *) src0_q + offset0_q;
+ global half * d = (global half *) src0_d + offset0_d;
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ int ix = get_sub_group_local_id()/THREADS_PER_BLK;
+ int il = get_sub_group_local_id()%THREADS_PER_BLK;
+
+ global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
+
+ // Registers for caching activation
+ ACT_TY yl = 0.f;
+
+ // Registers for caching quants
+ Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
+#endif
+#if N_DST == 8 || N_DST == 16
+ Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
+#endif
+
+ // Partial sum
+ SUM_TY sumf = 0.f;
+
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
+ float sumy = 0.f;
+
+ q_blk_0 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 0*nb*QK4_0/2);
+ q_blk_1 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 1*nb*QK4_0/2);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ q_blk_2 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 2*nb*QK4_0/2);
+ q_blk_3 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 3*nb*QK4_0/2);
+#endif
+#if N_DST == 8 || N_DST == 16
+ q_blk_4 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 4*nb*QK4_0/2));
+ q_blk_5 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 5*nb*QK4_0/2));
+ q_blk_6 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 6*nb*QK4_0/2));
+ q_blk_7 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 7*nb*QK4_0/2));
+#endif
+
+ // Load activation
+#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
+ yl.s01234567 = *(global float8 *)(yb);
+ yl.s89abcdef = *(global float8 *)(yb + 16);
+
+ sumy += yl.s0;
+ sumy += yl.s1;
+ sumy += yl.s2;
+ sumy += yl.s3;
+ sumy += yl.s4;
+ sumy += yl.s5;
+ sumy += yl.s6;
+ sumy += yl.s7;
+ sumy += yl.s8; yl.s8 /= 16.f;
+ sumy += yl.s9; yl.s9 /= 16.f;
+ sumy += yl.sa; yl.sa /= 16.f;
+ sumy += yl.sb; yl.sb /= 16.f;
+ sumy += yl.sc; yl.sc /= 16.f;
+ sumy += yl.sd; yl.sd /= 16.f;
+ sumy += yl.se; yl.se /= 16.f;
+ sumy += yl.sf; yl.sf /= 16.f;
+#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
+ yl.s0123 = *(global float4 *)(yb);
+ yl.s4567 = *(global float4 *)(yb + 16);
+
+ sumy += yl.s0;
+ sumy += yl.s1;
+ sumy += yl.s2;
+ sumy += yl.s3;
+ sumy += yl.s4; yl.s4 /= 16.f;
+ sumy += yl.s5; yl.s5 /= 16.f;
+ sumy += yl.s6; yl.s6 /= 16.f;
+ sumy += yl.s7; yl.s7 /= 16.f;
+#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
+ yl.s01 = *(global float2 *)(yb);
+ yl.s23 = *(global float2 *)(yb + 16);
+
+ sumy += yl.s0;
+ sumy += yl.s1;
+ sumy += yl.s2; yl.s2 /= 16.f;
+ sumy += yl.s3; yl.s3 /= 16.f;
+#endif
+
+ sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, *(d + ib + 0*nb), sumy, yl);
+ sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, *(d + ib + 1*nb), sumy, yl);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, *(d + ib + 2*nb), sumy, yl);
+ sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, *(d + ib + 3*nb), sumy, yl);
+#endif
+#if N_DST == 8 || N_DST == 16
+ sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, *(d + ib + 4*nb), sumy, yl);
+ sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, *(d + ib + 5*nb), sumy, yl);
+ sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, *(d + ib + 6*nb), sumy, yl);
+ sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, *(d + ib + 7*nb), sumy, yl);
+#endif
+
+ yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
+ }
+
+ SUM_TY tot = (SUM_TY)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+#endif
+#if N_DST == 8 || N_DST == 16
+ , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
+ , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+#endif
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+#endif
+#if N_DST == 8 || N_DST == 16
+ if (first_row + 4 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+ }
+ if (first_row + 5 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+ }
+ if (first_row + 6 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+ }
+ if (first_row + 7 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+ }
+#endif
+ }
+}
+
+//------------------------------------------------------------------------------
+// Using image1d_buffer_t
+
+#if defined(cl_qcom_subgroup_shuffle)
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+float qcom_sub_group_reduce_add(float sum) {
+ sum += qcom_sub_group_shuffle_down(sum, 32, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+ sum += qcom_sub_group_shuffle_down(sum, 16, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+ sum += qcom_sub_group_shuffle_down(sum, 8, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+ sum += qcom_sub_group_shuffle_down(sum, 4, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+ sum += qcom_sub_group_shuffle_down(sum, 2, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+ sum += qcom_sub_group_shuffle_down(sum, 1, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+ return sum;
+}
+#define sub_group_reduce_add qcom_sub_group_reduce_add
+#else
+#define sub_group_reduce_add sub_group_reduce_add
+#endif
+
+#undef THREADS_PER_BLK
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define THREADS_PER_BLK 4 // Number of threads per block, i.e. each thread processes 1/THREADS_PER_BLK of a block
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 16
+#elif defined (ADRENO_GPU)
+#define THREADS_PER_BLK 4
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
+# define ACT_TY float16
+# define Q_BLK_LD_TY float2
+# define EXTRACT_BLK_DATA(tmp, part) *((float2*)&tmp + part)
+# define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v8
+#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
+# define ACT_TY float8
+# define Q_BLK_LD_TY float
+# define EXTRACT_BLK_DATA(tmp, part) *((float*)&tmp + part)
+# define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v4
+#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
+# define ACT_TY float4
+# define Q_BLK_LD_TY half
+# define EXTRACT_BLK_DATA(tmp, part) *((half*)&tmp + part)
+# define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v2
+#endif
+
+#define BTYES_PER_THREAD_IN_BLK (QK4_0/2/THREADS_PER_BLK)
+
+#if N_DST == 2
+# define SUM_TY float2
+#elif N_DST == 4
+# define SUM_TY float4
+#elif N_DST == 8
+# define SUM_TY float8
+#elif N_DST == 16
+# define SUM_TY float16
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat_img_v0(
+ read_only image1d_buffer_t src0_q,
+ read_only image1d_buffer_t src0_d,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ const int nb = ne00/QK4_0;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ // The number of scales is the same as the number of blocks.
+ ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+ // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+ ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ int ix = get_sub_group_local_id()/THREADS_PER_BLK;
+ int il = get_sub_group_local_id()%THREADS_PER_BLK;
+
+ global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
+
+ // Registers for caching activation
+ ACT_TY yl = 0.f;
+
+ // Registers for caching quants
+ Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
+#endif
+#if N_DST == 8 || N_DST == 16
+ Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
+#endif
+
+ // Partial sum
+ SUM_TY sumf = 0.f;
+
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
+ float sumy = 0.f;
+
+ float4 tmp;
+ tmp = read_imagef(src0_q, offset0_q + ib + 0*nb);
+ q_blk_0 = EXTRACT_BLK_DATA(tmp, il);
+ tmp = read_imagef(src0_q, offset0_q + ib + 1*nb);
+ q_blk_1 = EXTRACT_BLK_DATA(tmp, il);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ tmp = read_imagef(src0_q, offset0_q + ib + 2*nb);
+ q_blk_2 = EXTRACT_BLK_DATA(tmp, il);
+ tmp = read_imagef(src0_q, offset0_q + ib + 3*nb);
+ q_blk_3 = EXTRACT_BLK_DATA(tmp, il);
+#endif
+#if N_DST == 8 || N_DST == 16
+ tmp = read_imagef(src0_q, offset0_q + ib + 4*nb);
+ q_blk_4 = EXTRACT_BLK_DATA(tmp, il);
+ tmp = read_imagef(src0_q, offset0_q + ib + 5*nb);
+ q_blk_5 = EXTRACT_BLK_DATA(tmp, il);
+ tmp = read_imagef(src0_q, offset0_q + ib + 6*nb);
+ q_blk_6 = EXTRACT_BLK_DATA(tmp, il);
+ tmp = read_imagef(src0_q, offset0_q + ib + 7*nb);
+ q_blk_7 = EXTRACT_BLK_DATA(tmp, il);
+#endif
+
+ // Load activation
+#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
+ yl.s01234567 = *(global float8 *)(yb);
+ yl.s89abcdef = *(global float8 *)(yb + 16);
+
+ sumy += yl.s0;
+ sumy += yl.s1;
+ sumy += yl.s2;
+ sumy += yl.s3;
+ sumy += yl.s4;
+ sumy += yl.s5;
+ sumy += yl.s6;
+ sumy += yl.s7;
+ sumy += yl.s8; yl.s8 /= 16.f;
+ sumy += yl.s9; yl.s9 /= 16.f;
+ sumy += yl.sa; yl.sa /= 16.f;
+ sumy += yl.sb; yl.sb /= 16.f;
+ sumy += yl.sc; yl.sc /= 16.f;
+ sumy += yl.sd; yl.sd /= 16.f;
+ sumy += yl.se; yl.se /= 16.f;
+ sumy += yl.sf; yl.sf /= 16.f;
+#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
+ yl.s0123 = *(global float4 *)(yb);
+ yl.s4567 = *(global float4 *)(yb + 16);
+
+ sumy += yl.s0;
+ sumy += yl.s1;
+ sumy += yl.s2;
+ sumy += yl.s3;
+ sumy += yl.s4; yl.s4 /= 16.f;
+ sumy += yl.s5; yl.s5 /= 16.f;
+ sumy += yl.s6; yl.s6 /= 16.f;
+ sumy += yl.s7; yl.s7 /= 16.f;
+#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
+ yl.s01 = *(global float2 *)(yb);
+ yl.s23 = *(global float2 *)(yb + 16);
+
+ sumy += yl.s0;
+ sumy += yl.s1;
+ sumy += yl.s2; yl.s2 /= 16.f;
+ sumy += yl.s3; yl.s3 /= 16.f;
+#endif
+
+ sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, read_imageh(src0_d, offset0_d + ib + 0*nb).s0, sumy, yl);
+ sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, read_imageh(src0_d, offset0_d + ib + 1*nb).s0, sumy, yl);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, read_imageh(src0_d, offset0_d + ib + 2*nb).s0, sumy, yl);
+ sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, read_imageh(src0_d, offset0_d + ib + 3*nb).s0, sumy, yl);
+#endif
+#if N_DST == 8 || N_DST == 16
+ sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, read_imageh(src0_d, offset0_d + ib + 4*nb).s0, sumy, yl);
+ sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, read_imageh(src0_d, offset0_d + ib + 5*nb).s0, sumy, yl);
+ sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, read_imageh(src0_d, offset0_d + ib + 6*nb).s0, sumy, yl);
+ sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, read_imageh(src0_d, offset0_d + ib + 7*nb).s0, sumy, yl);
+#endif
+
+ yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
+ }
+
+ SUM_TY tot = (SUM_TY)(
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+#endif
+#if N_DST == 8 || N_DST == 16
+ , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
+ , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+#endif
+ );
+
+ if (get_sub_group_local_id() == 0) {
+ if (first_row + 0 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+ }
+ if (first_row + 1 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+ }
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+ if (first_row + 2 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+ }
+ if (first_row + 3 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+ }
+#endif
+#if N_DST == 8 || N_DST == 16
+ if (first_row + 4 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+ }
+ if (first_row + 5 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+ }
+ if (first_row + 6 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+ }
+ if (first_row + 7 < ne01) {
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+ }
+#endif
+ }
+}
+
+//------------------------------------------------------------------------------
+// kernel_mul_mv_q6_K_f32
+//------------------------------------------------------------------------------
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 1 // number of rows each SIMD group works on
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // SIMD group size
+#elif defined (ADRENO_GPU)
+#define N_DST 1
+#define N_SIMDGROUP 2
+#define N_SIMDWIDTH 64
+#endif
+
+#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q6_K_f32(
+ global void * src0,
+ ulong offset0,
+ global float * src1,
+ ulong offset1,
+ global float * dst,
+ ulong offsetd,
+ int ne00,
+ int ne01,
+ int ne02,
+ int ne10,
+ int ne12,
+ int ne0,
+ int ne1,
+ int r2,
+ int r3
+) {
+ src0 = (global void*)((global char*)src0 + offset0);
+ src1 = (global float*)((global char*)src1 + offset1);
+ dst = (global float*)((global char*)dst + offsetd);
+
+ uchar kmask1 = 0x03;
+ uchar kmask2 = 0x0C;
+ uchar kmask3 = 0x30;
+ uchar kmask4 = 0xC0;
+
+ int nb = ne00/QK_K;
+
+ int r0 = get_group_id(0);
+ int r1 = get_group_id(1);
+ int im = get_group_id(2);
+
+ int row = N_SIMDGROUP * r0 + get_sub_group_id();
+
+ int i12 = im%ne12;
+ int i13 = im/ne12;
+
+ ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+ global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
+ global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+ float sumf = 0;
+
+ // For Q6_K quantization, 16 values form a subblock and 16 subblocks form a
+ // block. Values in a subblock share a scale that is quantized with 8 bits;
+ // the entire block shares a single floating point scale.
+ // For work distribution, each thread processes a subblock (16 weights), hence
+ // 16 threads process a (super) block -- a subgroup thus handles N_SIMDWIDTH/16
+ // (super) blocks -- this is the block stride.
+ // The 16 threads that process a (super) block are split into 2 portions of
+ // 8 threads each; each portion works on 8 subblocks.
+ // For a subgroup of 16 threads, the entire subgroup works on a single (super)
+ // block before moving to the next (super) block. Thread0 - thread7 work on the
+ // first 8 subblocks; thread8 - thread15 work on the last 8 subblocks.
+ // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
+ // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
+ // works on a total of 16 weight values.
+ int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
+ int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+ int ip = tid/8; // first or second half of (super) block (0 or 1)
+ int il = tid%8; // each half has 8 parts, one per scale
+ int n = 4; // 4 scales at a time (and 4 sums)
+ int l0 = n*il; // offset into half-block, 0..28
+ int is = 8*ip + l0/16; // 0, 1, 8, 9
+
+ int y_offset = 128*ip + l0;
+ int q_offset_l = 64*ip + l0;
+ int q_offset_h = 32*ip + l0;
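+ // Worked example for the Intel path (N_SIMDWIDTH = 16, BLOCK_STRIDE = 1):
+ // lane 5 gets tid = 5, ix = 0, ip = 0, il = 5, l0 = 20, is = 1, so per block it
+ // reads y[20..23], y[52..55], y[84..87], y[116..119] and the matching ql/qh bytes.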
+
+ for (int i = ix; i < nb; i += BLOCK_STRIDE) {
+
+ global uint8_t * q1 = x[i].ql + q_offset_l;
+ global uint8_t * q2 = q1 + QK_K/8;
+ global uint8_t * qh = x[i].qh + q_offset_h;
+ global int8_t * sc = x[i].scales + is;
+
+ global float * y = yy + i * QK_K + y_offset;
+
+ float dall = x[i].d;
+
+ float4 sums = {0.f, 0.f, 0.f, 0.f};
+
+ sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
+
+ sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
+
+ sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
+
+ sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
+ sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
+ sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
+ sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
+
+ sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
+ }
+
+ float tot = sub_group_reduce_add(sumf);
+ if (get_sub_group_local_id() == 0) {
+ dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+ }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
new file mode 100644
index 000000000..57768c803
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
@@ -0,0 +1,130 @@
+// src0_q, src0_d, src1 are transposed as a preprocessing step
+// 4-bit weights are transposed in groups of 4 (unsigned short int)
+// consider weights originally "next to each other", now "on top of each other"
+// each fiber computes an 8x4 tile of output elements
+// using unshuffled weights
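+// Concretely (as can be read off the store pattern at the end of the kernel),
+// the work-item at (gy, gx) accumulates columns gy*8 .. gy*8+7 and rows
+// gx*4 .. gx*4+3 of C, with C stored column-major with leading dimension m.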
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+kernel void kernel_mul_mat_Ab_Bi_8x4(
+ global const ushort * src0_q, // quantized A
+ global const half * src0_d, // A scales
+ __read_only image1d_buffer_t src1, // B (1d image)
+ global float * dst, // C
+ int m, // M
+ int n, // N with padding
+ int k, // K
+ int n_no_padding // N without padding
+) {
+
+ int m_4 = m >> 2;
+ int n_4 = n >> 2;
+
+ int gy = get_global_id(0);
+ int gx = get_global_id(1);
+ int gx_2 = gx << 2;
+
+ half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
+ half8 B; // registers for activations
+ half4 dequantized_weights; // registers for dequantized weights
+ __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
+ __global const half* scale_ptr = src0_d + gx_2; // pointer for scales
+
+ for(int i=0; i<k; i+=4){ // walk the K dimension four rows at a time
+
+ // j=0
+ B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
+ B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
+
+ // 16 packed 4-bit weights are loaded into `bits4` (ushort4, from weight_ptr)
+ // and the 4 corresponding scales into `scale` (half4, from scale_ptr)
+ // ...
+
+ dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
+ dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
+ dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
+ dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
+ c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
+ c1 += B * dequantized_weights.s1;
+ c2 += B * dequantized_weights.s2;
+ c3 += B * dequantized_weights.s3;
+
+ // j=1
+ B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
+ B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
+ dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
+ dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
+ dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
+ dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
+ c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
+ c1 += B * dequantized_weights.s1;
+ c2 += B * dequantized_weights.s2;
+ c3 += B * dequantized_weights.s3;
+
+ // j=2
+ B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
+ B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
+ dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
+ dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
+ dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
+ dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
+ c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+ c1 += B * dequantized_weights.s1;
+ c2 += B * dequantized_weights.s2;
+ c3 += B * dequantized_weights.s3;
+
+ // j=3
+ B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
+ B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
+ dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
+ dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
+ dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
+ dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
+ c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+ c1 += B * dequantized_weights.s1;
+ c2 += B * dequantized_weights.s2;
+ c3 += B * dequantized_weights.s3;
+ }
+
+ int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
+
+ // Conditionally check that each store targets a valid location; required when N is not a multiple of 8.
+ // The if statements allow registers to be reused for each store, which
+ // provides a performance boost due to the reduced register footprint and the resulting higher number of concurrent waves.
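+ // e.g. with m = 4096 and gy = gx = 0 the eight stores (if all in range) land at
+ // dst[0..3], dst[4096..4099], ..., dst[7*4096..7*4096+3], one per output column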
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+ idx += m;
+ }
+ if(idx+3 < m*n_no_padding){
+ vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+ }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
new file mode 100644
index 000000000..d59a0c05d
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
@@ -0,0 +1,32 @@
+// 16-bit transpose, loading/storing an 8x8 tile of elements
+
+kernel void kernel_transpose_16(
+ __read_only image1d_buffer_t input,
+ __write_only image1d_buffer_t output,
+ const uint rows,
+ const uint cols
+) {
+
+ const int i = get_global_id(0);
+ const int j = get_global_id(1);
+ const int i_3 = i<<3;
+ const int j_3 = j<<3;
+
+ ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
+ ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
+ ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
+ ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
+ ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
+ ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
+ ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
+ ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
+
+ write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
+ write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
+ write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
+ write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
+ write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
+ write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
+ write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
+ write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
new file mode 100644
index 000000000..914ec0193
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
@@ -0,0 +1,25 @@
+// 32-bit transpose, loading/storing a 4x4 tile of elements
+
+kernel void kernel_transpose_32(
+ __read_only image1d_buffer_t input,
+ __write_only image1d_buffer_t output,
+ const uint rows,
+ const uint cols
+) {
+
+ const int i = get_global_id(0);
+ const int j = get_global_id(1);
+ const int i_2 = i<<2;
+ const int j_2 = j<<2;
+
+ float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
+ float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
+ float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
+ float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
+
+ write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
+ write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
+ write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
+ write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
new file mode 100644
index 000000000..d3bd1fabb
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
@@ -0,0 +1,35 @@
+// 32-bit transpose, loading/storing a 4x4 tile of elements
+// Only used for activations
+// converts to FP16
+// also adds zero padding for prompt lengths that are not a multiple of 8
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
+
+ const int i = get_global_id(0);
+ const int j = get_global_id(1);
+ const int i_2 = i<<2;
+ const int j_2 = j<<2;
+ half4 temp0 = {0,0,0,0}; // initialize outputs to 0
+ half4 temp1 = {0,0,0,0};
+ half4 temp2 = {0,0,0,0};
+ half4 temp3 = {0,0,0,0};
+
+ if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
+ temp0 = read_imageh(input, (j_2+0)*cols+i);
+ }
+ if((j_2+1)*cols+i*4+3 < rows*cols*16){
+ temp1 = read_imageh(input, (j_2+1)*cols+i);
+ }
+ if((j_2+2)*cols+i*4+3 < rows*cols*16){
+ temp2 = read_imageh(input, (j_2+2)*cols+i);
+ }
+ if((j_2+3)*cols+i*4+3 < rows*cols*16){
+ temp3 = read_imageh(input, (j_2+3)*cols+i);
+ }
+
+ write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
+ write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
+ write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
+ write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
+}
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 6b9f0b0d9..84f1328e7 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4488,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
- return ggml_is_contiguous(op->src[0]);
+ {
+ const int mode = ((const int32_t *) op->op_params)[2];
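+ // multimodal (mrope) and vision rope modes are not supported by this backend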
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
+ return ggml_is_contiguous(op->src[0]);
+ }
case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16;
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 515d66b39..1696b6e27 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -245,6 +245,7 @@ struct vk_device_struct {
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
vk_pipeline pipeline_timestep_embedding_f32;
vk_pipeline pipeline_pool2d_f32;
+ vk_pipeline pipeline_rwkv_wkv6_f32;
// [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
@@ -528,6 +529,13 @@ struct vk_op_pool2d_push_constants {
int32_t p0; int32_t p1;
};
+struct vk_op_rwkv_wkv6_push_constants {
+ uint32_t B;
+ uint32_t T;
+ uint32_t C;
+ uint32_t H;
+};
+
// Allow pre-recording command buffers
struct vk_staging_memcpy {
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -1363,7 +1371,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
// Needs to be kept up to date on shader changes
const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1;
const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
- const uint32_t warps = warptile[0] / device->subgroup_size;
+ const uint32_t warps = warptile[0] / warptile[10];
const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
@@ -1377,8 +1385,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
std::cerr << "ggml_vulkan: Compiling shaders";
- // some shaders require the subgroup size to be 16 or larger
+ // some shaders have a minimum subgroup size
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
+ const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
// mulmat
std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
@@ -1445,7 +1454,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
- s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
+ s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
@@ -1864,7 +1873,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
- ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -1878,7 +1887,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
- ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -1892,7 +1901,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
- ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
// dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -2014,6 +2023,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+
for (auto &c : compiles) {
c.wait();
}
@@ -5022,6 +5033,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_pool2d_f32;
}
return nullptr;
+ case GGML_OP_RWKV_WKV6:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_rwkv_wkv6_f32;
+ }
+ return nullptr;
case GGML_OP_LEAKY_RELU:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_leaky_relu_f32;
@@ -5424,6 +5440,134 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun);
}
+static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) {
+ const ggml_tensor * k = dst->src[0];
+ const ggml_tensor * v = dst->src[1];
+ const ggml_tensor * r = dst->src[2];
+ const ggml_tensor * tf = dst->src[3];
+ const ggml_tensor * td = dst->src[4];
+ const ggml_tensor * state = dst->src[5];
+
+ GGML_ASSERT(!ggml_is_quantized(k->type));
+ GGML_ASSERT(!ggml_is_quantized(v->type));
+ GGML_ASSERT(!ggml_is_quantized(r->type));
+ GGML_ASSERT(!ggml_is_quantized(tf->type));
+ GGML_ASSERT(!ggml_is_quantized(td->type));
+ GGML_ASSERT(!ggml_is_quantized(state->type));
+ GGML_ASSERT(dst->buffer != nullptr);
+
+ vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6);
+ GGML_ASSERT(pipeline != nullptr);
+
+ if (dryrun) {
+ ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+ return;
+ }
+
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+ ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
+ ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;
+ ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context;
+ ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context;
+ ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context;
+ ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context;
+
+ ggml_vk_sync_buffers(subctx);
+
+ vk_buffer d_D, d_K, d_V, d_R, d_TF, d_TD, d_State;
+ uint64_t k_offset, v_offset, r_offset, tf_offset, td_offset, state_offset, dst_offset;
+ bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
+
+ if (ctx->device->uma) {
+ ggml_vk_host_get(ctx->device, k->data, d_K, k_offset);
+ ggml_vk_host_get(ctx->device, v->data, d_V, v_offset);
+ ggml_vk_host_get(ctx->device, r->data, d_R, r_offset);
+ ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset);
+ ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset);
+ ggml_vk_host_get(ctx->device, state->data, d_State, state_offset);
+ ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);
+
+ K_uma = d_K != nullptr;
+ V_uma = d_V != nullptr;
+ R_uma = d_R != nullptr;
+ TF_uma = d_TF != nullptr;
+ TD_uma = d_TD != nullptr;
+ STATE_uma = d_State != nullptr;
+ DST_uma = d_D != nullptr;
+ }
+
+ if (!K_uma) {
+ d_K = k_buf_ctx->dev_buffer;
+ k_offset = vk_tensor_offset(k) + k->view_offs;
+ }
+ if (!V_uma) {
+ d_V = v_buf_ctx->dev_buffer;
+ v_offset = vk_tensor_offset(v) + v->view_offs;
+ }
+ if (!R_uma) {
+ d_R = r_buf_ctx->dev_buffer;
+ r_offset = vk_tensor_offset(r) + r->view_offs;
+ }
+ if (!TF_uma) {
+ d_TF = tf_buf_ctx->dev_buffer;
+ tf_offset = vk_tensor_offset(tf) + tf->view_offs;
+ }
+ if (!TD_uma) {
+ d_TD = td_buf_ctx->dev_buffer;
+ td_offset = vk_tensor_offset(td) + td->view_offs;
+ }
+ if (!STATE_uma) {
+ d_State = state_buf_ctx->dev_buffer;
+ state_offset = vk_tensor_offset(state) + state->view_offs;
+ }
+ if (!DST_uma) {
+ d_D = dst_buf_ctx->dev_buffer;
+ dst_offset = vk_tensor_offset(dst) + dst->view_offs;
+ }
+
+ const uint64_t k_size = ggml_nbytes(k);
+ const uint64_t v_size = ggml_nbytes(v);
+ const uint64_t r_size = ggml_nbytes(r);
+ const uint64_t tf_size = ggml_nbytes(tf);
+ const uint64_t td_size = ggml_nbytes(td);
+ const uint64_t state_size = ggml_nbytes(state);
+ const uint64_t dst_size = ggml_nbytes(dst);
+
+ std::array<uint32_t, 3> elements = {
+ (uint32_t)(pc.B * pc.H),
+ 1,
+ 1
+ };
+
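+ // one workgroup per (sequence, head) pair; wkv6.comp derives batch_id and
+ // head_id from gl_WorkGroupID.x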
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
+ vk_subbuffer{ d_K, k_offset, k_size },
+ vk_subbuffer{ d_V, v_offset, v_size },
+ vk_subbuffer{ d_R, r_offset, r_size },
+ vk_subbuffer{ d_TF, tf_offset, tf_size },
+ vk_subbuffer{ d_TD, td_offset, td_size },
+ vk_subbuffer{ d_State, state_offset, state_size },
+ vk_subbuffer{ d_D, dst_offset, dst_size }
+ }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
+}
+
+static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+ const size_t seq_length = dst->src[0]->ne[3];
+ const size_t n_embed = dst->ne[0];
+ const size_t n_heads = dst->src[0]->ne[2];
+ const size_t n_seqs = dst->src[5]->ne[1];
+
+ ggml_vk_op_f32_rwkv6(
+ ctx, subctx, dst,
+ {
+ (uint32_t)n_seqs,
+ (uint32_t)seq_length,
+ (uint32_t)n_embed,
+ (uint32_t)n_heads,
+ },
+ dryrun
+ );
+}
+
static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
int * op_params = (int *)dst->op_params;
@@ -6569,6 +6713,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_IM2COL:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_POOL_2D:
+ case GGML_OP_RWKV_WKV6:
case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
break;
@@ -6768,6 +6913,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_FLASH_ATTN_EXT:
ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun);
+ break;
+
+ case GGML_OP_RWKV_WKV6:
+ ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun);
+
break;
default:
return false;
@@ -6848,6 +6998,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_IM2COL:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_POOL_2D:
+ case GGML_OP_RWKV_WKV6:
case GGML_OP_LEAKY_RELU:
case GGML_OP_REPEAT:
buf = tensor->buffer;
@@ -7687,7 +7838,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_REPEAT:
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
case GGML_OP_ROPE:
- return ggml_is_contiguous(op->src[0]);
+ {
+ const int mode = ((const int32_t *) op->op_params)[2];
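+ // neither the multimodal (mrope) nor the vision rope variant is supported here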
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
+ return ggml_is_contiguous(op->src[0]);
+ }
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@@ -7715,6 +7875,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_IM2COL:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_POOL_2D:
+ case GGML_OP_RWKV_WKV6:
case GGML_OP_LEAKY_RELU:
return true;
default:
@@ -8291,7 +8452,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
const float * op_params = (const float *)tensor->op_params;
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
- } else {
+ } else if (tensor->op == GGML_OP_RWKV_WKV6) {
+ tensor_clone = ggml_rwkv_wkv6(ggml_ctx, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3],
+ tensor->src[4], tensor->src[5]);
+ }
+ else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ABORT("fatal error");
}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
index 6e20b6411..a25808e16 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
@@ -32,7 +32,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
void soft_max(uint num_iters) {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
- const uint rowy = rowx % p.KY;
+ const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0;
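+ // p.KY can be zero (e.g. when no mask is supplied); guard the modulo in that case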
if (rowx >= p.nrows_x) {
return;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index c48a228ae..7a0d7285d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -479,6 +479,8 @@ void process_shaders() {
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+ string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
for (auto &c : compiles) {
c.wait();
}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp
new file mode 100644
index 000000000..35cc6c45f
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp
@@ -0,0 +1,87 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#define BLOCK_SIZE 64
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(push_constant) uniform Parameters {
+ uint B;
+ uint T;
+ uint C;
+ uint H;
+};
+
+layout(binding = 0) readonly buffer KBuf { A_TYPE k[]; };
+layout(binding = 1) readonly buffer VBuf { A_TYPE v[]; };
+layout(binding = 2) readonly buffer RBuf { A_TYPE r[]; };
+layout(binding = 3) readonly buffer TimeFBuf { A_TYPE tf[]; };
+layout(binding = 4) readonly buffer TimeDBuf { A_TYPE td[]; };
+layout(binding = 5) readonly buffer StateBuf { A_TYPE state_in[]; };
+layout(binding = 6) buffer DstBuf { A_TYPE dst[]; };
+
+shared A_TYPE _k[BLOCK_SIZE], _r[BLOCK_SIZE], _tf[BLOCK_SIZE], _td[BLOCK_SIZE];
+
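+// Each invocation owns one channel (tid) of one head of one sequence and keeps a
+// head_size-wide slice of the recurrent state in registers. For every token t it
+// computes
+//     y[t]     = sum_j r[j] * (tf[j] * k[j] * v + state[j])
+//     state[j] = state[j] * td[j] + k[j] * v
+// which is the wkv6 recurrence as implemented in the loop below.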
+void main() {
+ const uint head_size = BLOCK_SIZE;
+ const uint batch_id = gl_WorkGroupID.x / H;
+ const uint head_id = gl_WorkGroupID.x % H;
+ const uint tid = gl_LocalInvocationID.x;
+
+ const uint state_size = C * head_size;
+ const uint n_seq_tokens = T / B;
+
+ if (batch_id >= B || head_id >= H) {
+ return;
+ }
+
+ A_TYPE state[BLOCK_SIZE];
+ [[unroll]] for (uint i = 0; i < head_size; i++) {
+ state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
+ + i * head_size + tid];
+ }
+
+ barrier();
+ _tf[tid] = tf[head_id * head_size + tid];
+ barrier();
+
+ const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
+ const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
+
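+ // WKV6 recurrence per token, with i = this thread's channel (tid) and j running over the head:
+ // y_i = sum_j r_j * (tf_j * k_j * v_i + state[j][i])
+ // state[j][i] = state[j][i] * td_j + k_j * v_i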
+ for (uint t = start_t; t < end_t; t += C) {
+ barrier();
+ _k[tid] = k[t];
+ _r[tid] = r[t];
+ _td[tid] = td[t];
+ barrier();
+
+ const A_TYPE v_val = v[t];
+ A_TYPE y = 0.0;
+
+ [[unroll]] for (uint j = 0; j < head_size; j += 4) {
+ vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
+ vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
+ vec4 tf_vec = vec4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
+ vec4 td_vec = vec4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
+ vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);
+
+ vec4 kv = k_vec * v_val;
+
+ vec4 temp = tf_vec * kv + s_vec;
+ y += dot(r_vec, temp);
+
+ s_vec = s_vec * td_vec + kv;
+ state[j] = s_vec.x;
+ state[j+1] = s_vec.y;
+ state[j+2] = s_vec.z;
+ state[j+3] = s_vec.w;
+ }
+
+ dst[t] = y;
+ }
+
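+ // write the updated per-head state back after the T*C attention outputs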
+ [[unroll]] for (uint i = 0; i < head_size; i++) {
+ dst[T * C + batch_id * state_size + head_id * head_size * head_size
+ + i * head_size + tid] = state[i];
+ }
+}
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 058941c7a..0efd2b2eb 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3517,15 +3517,18 @@ static struct ggml_tensor * ggml_rope_impl(
GGML_ASSERT(c->ne[0] >= n_dims / 2);
}
+ int sections[4] = {0, 0, 0, 0};
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+ int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
+ memcpy(params + 11, &sections, sizeof(int)*4);
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
@@ -3547,6 +3550,53 @@ struct ggml_tensor * ggml_rope(
);
}
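+// multimodal RoPE (M-RoPE): b provides 4 position ids per token and `sections` describes how
+// the n_dims rotary dimensions are split between those position streams; the section sizes are
+// stored in op_params[11..14] next to the regular rope parameters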
+struct ggml_tensor * ggml_rope_multi(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ int n_dims,
+ int sections[4],
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow) {
+ // Multimodal Rotary Position Embedding
+ GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+ GGML_ASSERT(ggml_is_vector(b));
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
+ GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+ if (c) {
+ GGML_ASSERT(c->type == GGML_TYPE_F32);
+ GGML_ASSERT(c->ne[0] >= n_dims / 2);
+ }
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+ memcpy(params + 5, &freq_base, sizeof(float));
+ memcpy(params + 6, &freq_scale, sizeof(float));
+ memcpy(params + 7, &ext_factor, sizeof(float));
+ memcpy(params + 8, &attn_factor, sizeof(float));
+ memcpy(params + 9, &beta_fast, sizeof(float));
+ memcpy(params + 10, &beta_slow, sizeof(float));
+ memcpy(&params[11], sections, sizeof(int)*4);
+ ggml_set_op_params(result, params, sizeof(params));
+
+ result->op = GGML_OP_ROPE;
+ result->src[0] = a;
+ result->src[1] = b;
+ result->src[2] = c;
+
+ return result;
+}
+
struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -5987,12 +6037,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co
struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
- return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
+ return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
}
struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
- return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
+ return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
}
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
@@ -6439,7 +6489,7 @@ struct gguf_context {
void * data;
};
-static size_t gguf_type_size(enum gguf_type type) {
+size_t gguf_type_size(enum gguf_type type) {
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
return GGUF_TYPE_SIZE[type];
}
@@ -6567,13 +6617,7 @@ struct gguf_context * gguf_init_empty(void) {
return ctx;
}
-struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
- FILE * file = ggml_fopen(fname, "rb");
- if (!file) {
- fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
- return NULL;
- }
-
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
// offset from start of file
size_t offset = 0;
@@ -6586,7 +6630,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
- fclose(file);
return NULL;
}
}
@@ -6597,7 +6640,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
if (!ctx) {
fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
- fclose(file);
return NULL;
}
@@ -6615,7 +6657,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (ctx->header.version == 1) {
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6628,7 +6669,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
fprintf(stderr, "%s: failed to read header\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6638,12 +6678,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
{
const uint64_t n_kv = ctx->header.n_kv;
- ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
- if (!ctx->kv) {
- fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
- fclose(file);
- gguf_free(ctx);
- return NULL;
+ if (n_kv > 0) {
+ ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
+ if (!ctx->kv) {
+ fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
+ gguf_free(ctx);
+ return NULL;
+ }
}
for (uint64_t i = 0; i < n_kv; ++i) {
@@ -6690,7 +6731,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// prevent from integer overflow in the malloc below
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6698,7 +6738,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
if (!kv->value.arr.data) {
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6710,7 +6749,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// prevent from integer overflow in the malloc below
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6718,7 +6756,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
if (!kv->value.arr.data) {
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6749,7 +6786,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6760,7 +6796,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
if (!ctx->infos) {
fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6796,7 +6831,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6839,7 +6873,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// this tensor type support have been removed:
fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
__func__, info->name.data, (int) info->type, ggml_type_name(info->type));
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6847,7 +6880,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (ne % ggml_blck_size(info->type) != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6879,7 +6911,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
*params.ctx = ggml_init(pdata);
if (*params.ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__);
- fclose(file);
gguf_free(ctx);
return NULL;
}
@@ -6898,7 +6929,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
- fclose(file);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
@@ -6937,7 +6967,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
- fclose(file);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
@@ -6946,11 +6975,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ggml_set_no_alloc(ctx_data, params.no_alloc);
}
- fclose(file);
-
return ctx;
}
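+// note: gguf_init_from_file_impl() reads from an already-open stream and never closes it;
+// the file-path wrapper below owns the FILE handle and the single fclose()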
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+ FILE * file = ggml_fopen(fname, "rb");
+ if (!file) {
+ fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
+ return NULL;
+ }
+
+ struct gguf_context * result = gguf_init_from_file_impl(file, params);
+ fclose(file);
+ return result;
+}
+
void gguf_free(struct gguf_context * ctx) {
if (ctx == NULL) {
return;
@@ -7410,13 +7449,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
// fwrite(val, sizeof(char), size, file);
//}
-struct gguf_buf {
- void * data;
- size_t size;
- size_t offset;
-};
-
-static struct gguf_buf gguf_buf_init(size_t size) {
+struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
/*buf.size =*/ size,
@@ -7426,7 +7459,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
return buf;
}
-static void gguf_buf_free(struct gguf_buf buf) {
+void gguf_buf_free(struct gguf_buf buf) {
if (buf.data) {
GGML_FREE(buf.data);
}
@@ -7464,7 +7497,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
buf->offset += el_size;
}
-static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 4c8710b39..c2c7cad14 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -131,6 +131,7 @@ class Keys:
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
+ DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
FREQ_BASE = "{arch}.rope.freq_base"
SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor"
@@ -226,6 +227,7 @@ class MODEL_ARCH(IntEnum):
QWEN = auto()
QWEN2 = auto()
QWEN2MOE = auto()
+ QWEN2VL = auto()
PHI2 = auto()
PHI3 = auto()
PLAMO = auto()
@@ -247,6 +249,7 @@ class MODEL_ARCH(IntEnum):
OLMOE = auto()
OPENELM = auto()
ARCTIC = auto()
+ DEEPSEEK = auto()
DEEPSEEK2 = auto()
CHATGLM = auto()
BITNET = auto()
@@ -388,6 +391,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
+ MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo",
@@ -409,6 +413,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic",
+ MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet",
@@ -772,6 +777,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
+ MODEL_ARCH.QWEN2VL: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
MODEL_ARCH.QWEN2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -1141,6 +1160,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
+ MODEL_ARCH.DEEPSEEK: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ ],
MODEL_ARCH.DEEPSEEK2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -1363,6 +1405,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
+ MODEL_ARCH.DEEPSEEK: [
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ ],
MODEL_ARCH.DEEPSEEK2: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index e8e61abf8..e17a4e831 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -145,11 +145,10 @@ class GGUFReader:
count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize)
end_offs = offset + itemsize * count
- return (
- self.data[offset:end_offs]
- .view(dtype = dtype)[:count]
- .newbyteorder(override_order or self.byte_order)
- )
+ arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+ if override_order is None:
+ return arr
+ return arr.view(arr.dtype.newbyteorder(override_order))
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
if field.name in self.fields:
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7a55d1296..65a64e10d 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -751,6 +751,9 @@ class GGUFWriter:
def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
+ def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
+ self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
+
def add_rope_freq_base(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f0a7b6478..573d0282e 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -306,7 +306,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
- "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
+ "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
),
# AWQ-activation gate
@@ -338,7 +338,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
- "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
+ "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
),
# Feed-forward down
@@ -379,7 +379,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
- "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
+ "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
),
MODEL_TENSOR.ATTN_Q_NORM: (
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 10e94876c..9c3956256 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
-version = "0.11.0"
+version = "0.13.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML "]
packages = [
diff --git a/include/llama.h b/include/llama.h
index eebbacb80..efbb27d21 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -108,9 +108,11 @@ extern "C" {
};
enum llama_rope_type {
- LLAMA_ROPE_TYPE_NONE = -1,
- LLAMA_ROPE_TYPE_NORM = 0,
- LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+ LLAMA_ROPE_TYPE_NONE = -1,
+ LLAMA_ROPE_TYPE_NORM = 0,
+ LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+ LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
+ LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -1137,16 +1139,12 @@ extern "C" {
const char * grammar_str,
const char * grammar_root);
+ /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
- int32_t n_vocab, // llama_n_vocab()
- llama_token special_eos_id, // llama_token_eos()
- llama_token linefeed_id, // llama_token_nl()
- int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat, // 1.0 = disabled
- float penalty_freq, // 0.0 = disabled
- float penalty_present, // 0.0 = disabled
- bool penalize_nl, // consider newlines as a repeatable token
- bool ignore_eos); // ignore the end-of-sequence token
+ int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
+ float penalty_repeat, // 1.0 = disabled
+ float penalty_freq, // 0.0 = disabled
+ float penalty_present); // 0.0 = disabled
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index 143d98729..e40d1cc6d 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -20,11 +20,13 @@ if [ -n "$GGML_CUDA" ]; then
cmake_opts="-DGGML_CUDA=ON"
fi
+dir="build-bench"
+
function run {
- rm -fr build > /dev/null
- cmake -B build -S . $cmake_opts > /dev/null
- cmake --build build -t llama-bench > /dev/null
- build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+ rm -fr ${dir} > /dev/null
+ cmake -B ${dir} -S . $cmake_opts > /dev/null
+ cmake --build ${dir} -t llama-bench > /dev/null
+ ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
}
git checkout $1 > /dev/null
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 47eae44f7..b4ac38bbf 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-74d66b63eaf207a24f3e93bb922aba131cbf2906
+e6d93f40dffe8733d5d72f1d8fa6b3ca27ae899f
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index fd8ca8a9e..bebff77cf 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
// penalties
struct llama_sampler_penalties {
- const int32_t n_vocab;
- const llama_token special_eos_id;
- const llama_token linefeed_id;
-
const int32_t penalty_last_n;
const float penalty_repeat;
const float penalty_freq;
const float penalty_present;
- const bool penalize_nl;
- const bool ignore_eos;
-
ring_buffer<llama_token> prev;
+
+ // a frequency map to count token occurrences
+ std::unordered_map<llama_token, int> token_count;
};
static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
return;
}
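+ // keep an incremental frequency count of the last penalty_last_n tokens so that
+ // llama_sampler_penalties_apply() no longer has to rebuild the map on every call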
+ ctx->token_count[token]++;
+
+ // if the ring buffer is full, remove the oldest token
+ if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+ const auto old = ctx->prev.front();
+
+ ctx->token_count[old]--;
+ if (ctx->token_count[old] == 0) {
+ ctx->token_count.erase(old);
+ }
+ }
+
ctx->prev.push_back(token);
+
+#if 0
+ // sanity check
+ std::unordered_map<llama_token, int> tmp;
+ for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+ tmp[ctx->prev.rat(i)]++;
+ }
+
+ assert(ctx->token_count == tmp);
+#endif
}
static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
- if (ctx->ignore_eos) {
- assert(ctx->special_eos_id >= 0);
-
- // optimistically check if the candidates are not yet sorted/shuffled/truncated
- if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
- cur_p->data[ctx->special_eos_id].logit = -INFINITY;
- } else {
- // else, search for the special EOS token
- for (size_t i = 0; i < cur_p->size; ++i) {
- if (cur_p->data[i].id == ctx->special_eos_id) {
- cur_p->data[i].logit = -INFINITY;
- break;
- }
- }
- }
- }
-
if ((ctx->penalty_last_n == 0) ||
(ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
return;
}
- bool nl_found = false;
- size_t nl_idx = 0;
- float nl_logit = -INFINITY;
- if (!ctx->penalize_nl) {
- assert(ctx->linefeed_id >= 0);
-
- // optimistically check if the candidates are not yet sorted/shuffled/truncated
- if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
- nl_found = true;
- nl_idx = ctx->linefeed_id;
- nl_logit = cur_p->data[ctx->linefeed_id].logit;
- } else {
- // else, search for the linefeed token
- for (size_t i = 0; i < cur_p->size; ++i) {
- if (cur_p->data[i].id == ctx->linefeed_id) {
- nl_found = true;
- nl_idx = i;
- nl_logit = cur_p->data[i].logit;
- break;
- }
- }
- }
- }
-
- // Create a frequency map to count occurrences of each token in last_tokens
- // TODO: optimize this by maintaining the token count in the sampler context
- using llama_token_cnt = std::unordered_map<llama_token, int>;
- llama_token_cnt token_count;
-
- for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) {
- token_count[ctx->prev.rat(i)]++;
- }
-
// Apply frequency and presence penalties to the cur_p
for (size_t i = 0; i < cur_p->size; ++i) {
- const auto token_iter = token_count.find(cur_p->data[i].id);
- if (token_iter == token_count.end()) {
+ const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+ if (token_iter == ctx->token_count.end()) {
continue;
}
const int count = token_iter->second;
+ assert(count > 0 && count <= ctx->penalty_last_n);
+
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
}
cur_p->sorted = false;
-
- if (!ctx->penalize_nl && nl_found) {
- // restore the logit of the newline token if it was penalized
- cur_p->data[nl_idx].logit = nl_logit;
- }
}
static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
ctx->prev.clear();
+ ctx->token_count.clear();
}
static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
auto * result = llama_sampler_init_penalties(
- ctx->n_vocab,
- ctx->special_eos_id,
- ctx->linefeed_id,
ctx->penalty_last_n,
ctx->penalty_repeat,
ctx->penalty_freq,
- ctx->penalty_present,
- ctx->penalize_nl,
- ctx->ignore_eos);
+ ctx->penalty_present);
// copy the state
{
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
};
struct llama_sampler * llama_sampler_init_penalties(
- int32_t n_vocab,
- llama_token special_eos_id,
- llama_token linefeed_id,
int32_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
- float penalty_present,
- bool penalize_nl,
- bool ignore_eos) {
- if (linefeed_id == LLAMA_TOKEN_NULL) {
- penalize_nl = true;
- }
-
- if (special_eos_id == LLAMA_TOKEN_NULL) {
- ignore_eos = false;
- }
-
+ float penalty_present) {
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler {
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
- /* .n_vocab = */ n_vocab,
- /* .special_eos_id = */ special_eos_id,
- /* .linefeed_id = */ linefeed_id,
/* .penalty_last_n = */ penalty_last_n,
/* .penalty_repeat = */ penalty_repeat,
/* .penalty_freq = */ penalty_freq,
/* .penalty_present = */ penalty_present,
- /* .penalize_nl = */ penalize_nl,
- /* .ignore_eos = */ ignore_eos,
+ /* .prev = */ ring_buffer<llama_token>(penalty_last_n),
+ /* .token_count = */ {},
},
};
}
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
- size_t word_len = word.size(), str_len = str.size();
+ size_t word_len = word.size();
+ size_t str_len = str.size();
size_t pos = -1;
while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
bool match = true;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8c9aaf5a0..e38e59853 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -738,7 +738,7 @@ struct llm_tokenizer_wpm_session {
std::vector<std::string> words(1, "");
for (const uint32_t cpt : cpts_nfd) {
- const auto flags = unicode_cpt_flags(cpt);
+ const auto flags = unicode_cpt_flags_from_cpt(cpt);
if (flags.is_whitespace) {
if (words.back().size()) { // finish previous word if any
diff --git a/src/llama.cpp b/src/llama.cpp
index 49ef5b78a..b7b04a41d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -163,6 +163,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
+ LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
@@ -183,6 +184,7 @@ enum llm_arch {
LLM_ARCH_OLMOE,
LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK,
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM,
LLM_ARCH_BITNET,
@@ -217,6 +219,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
@@ -237,6 +240,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" },
@@ -308,6 +312,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
@@ -424,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -898,6 +904,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_QWEN2VL,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_QWEN2MOE,
{
@@ -1288,6 +1311,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
+ {
+ LLM_ARCH_DEEPSEEK,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
{
LLM_ARCH_DEEPSEEK2,
{
@@ -1579,6 +1629,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE,
+ LLM_CHAT_TEMPLATE_GIGACHAT,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
};
static llm_arch llm_arch_from_string(const std::string & name) {
@@ -2474,11 +2526,12 @@ struct llama_hparams {
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
- float rope_attn_factor = 1.0f;
- float rope_freq_base_train;
- float rope_freq_scale_train;
- uint32_t n_ctx_orig_yarn;
- float rope_yarn_log_mul;
+ float rope_attn_factor = 1.0f;
+ float rope_freq_base_train;
+ float rope_freq_scale_train;
+ uint32_t n_ctx_orig_yarn;
+ float rope_yarn_log_mul;
+ int rope_sections[4];
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -2535,6 +2588,9 @@ struct llama_hparams {
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
+ if (!std::equal(std::begin(this->rope_sections),
+ std::end(this->rope_sections),
+ std::begin(other.rope_sections))) return true;
if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3378,6 +3434,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output
bool is_encoding = false;
+ // TODO: find a better way to accommodate multi-dimensional position encoding methods
+ // number of position ids each token gets, 1 for each token in most cases.
+ // when using m-rope, each token carries several position ids that together encode a multi-dimensional coordinate.
+ int n_pos_per_token = 1;
+
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -5747,6 +5808,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_QWEN2VL:
+ {
+ std::array<int, 4> section_dims;
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+ std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
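+ // rope_sections is loaded from the "<arch>.rope.dimension_sections" KV and consumed
+ // by ggml_rope_multi() in build_qwen2vl()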
+ }
+ // fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6057,6 +6125,19 @@ static void llm_load_hparams(
model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+ switch (hparams.n_layer) {
+ case 28: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DEEPSEEK2:
{
bool is_lite = (hparams.n_layer == 27);
@@ -6403,6 +6484,7 @@ static void llm_load_vocab(
tokenizer_pre == "phi-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "gigachat" ||
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
@@ -7054,6 +7136,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+ if (model.arch == LLM_ARCH_DEEPSEEK) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ }
+
if (model.arch == LLM_ARCH_DEEPSEEK2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -8167,6 +8256,7 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_QWEN2:
+ case LLM_ARCH_QWEN2VL:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8827,6 +8917,55 @@ static bool llm_load_tensors(
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
}
} break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
case LLM_ARCH_DEEPSEEK2:
{
const bool is_lite = (hparams.n_layer == 27);
@@ -12556,6 +12695,124 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_qwen2vl() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
+ cb(lctx.inp_pos, "inp_pos", -1);
+ ggml_set_input(lctx.inp_pos);
+ struct ggml_tensor * inp_pos = lctx.inp_pos;
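+ // inp_pos holds n_tokens * 4 position ids (M-RoPE), consumed by ggml_rope_multi() below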
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+ int sections[4];
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_multi(
+ ctx0,
+ ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_multi(
+ ctx0,
+ ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -15063,6 +15320,161 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_deepseek() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ ggml_tensor * moe_out =
+ llm_build_moe_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, hparams.expert_weights_scale,
+ cb, il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_deepseek2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16657,6 +17069,11 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2();
} break;
+ case LLM_ARCH_QWEN2VL:
+ {
+ lctx.n_pos_per_token = 4;
+ result = llm.build_qwen2vl();
+ } break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
@@ -16745,6 +17162,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_arctic();
} break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+ result = llm.build_deepseek();
+ } break;
case LLM_ARCH_DEEPSEEK2:
{
result = llm.build_deepseek2();
@@ -16875,8 +17296,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
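+ // each token may carry more than one position id (n_pos_per_token > 1 for M-RoPE models)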
+ auto n_pos = lctx.n_pos_per_token;
+ ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
}
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -19976,6 +20397,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_COMMAND_R:
case LLM_ARCH_OLMO:
case LLM_ARCH_ARCTIC:
+ case LLM_ARCH_DEEPSEEK:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GRANITE:
@@ -20009,6 +20431,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX;
+ case LLM_ARCH_QWEN2VL:
+ return LLAMA_ROPE_TYPE_MROPE;
+
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
@@ -21838,6 +22263,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
} else if (tmpl_contains("<|start_of_role|>")) {
return LLM_CHAT_TEMPLATE_GRANITE;
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -22161,6 +22588,32 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|start_of_role|>assistant<|end_of_role|>\n";
}
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+ // GigaChat template
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+ // Handle system message if present
+ if (has_system) {
+ ss << "" << chat[0]->content << "<|message_sep|>";
+ } else {
+ ss << "";
+ }
+
+ // Process remaining messages
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (role == "user") {
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+ << "available functions<|role_sep|>[]<|message_sep|>";
+ } else if (role == "assistant") {
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+ }
+ }
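+ // the rendered conversation looks roughly like:
+ // <s>[system]<|message_sep|>user<|role_sep|>...<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>...<|message_sep|>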
+
+ // Add generation prompt if needed
+ if (add_ass) {
+ ss << "assistant<|role_sep|>";
+ }
} else {
// template not supported
return -1;
@@ -22180,15 +22633,15 @@ int32_t llama_chat_apply_template(
std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
if (tmpl == nullptr) {
GGML_ASSERT(model != nullptr);
- // load template from model
- std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
- std::string template_key = "tokenizer.chat_template";
- int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
- if (res < 0) {
+
+ // load template from model, if available
+ const auto & it = model->gguf_kv.find("tokenizer.chat_template");
+ if (it != model->gguf_kv.end() && it->second.size() > 0) {
+ curr_tmpl = it->second;
+ }
+ else {
// worst case: there is no information about template, we will use chatml by default
- curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
- } else {
- curr_tmpl = std::string(model_template.data(), model_template.size());
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
}
}
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 3d4592635..8ed6b1a51 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
throw std::invalid_argument("failed to convert utf8 to codepoint");
}
-//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
// std::vector result;
-// if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-// result.emplace_back(cp);
+// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+// result.emplace_back(cpt);
// return result;
// }
-// if (0x10000 <= cp && cp <= 0x10ffff) {
-// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+// if (0x10000 <= cpt && cpt <= 0x10ffff) {
+// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
// return result;
// }
// throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
// return result;
//}
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
- std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+ std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
assert (unicode_ranges_flags.begin()[0].first == 0);
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
};
size_t _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
};
size_t _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
// interface
//
-std::string unicode_cpt_to_utf8(uint32_t cp) {
+std::string unicode_cpt_to_utf8(uint32_t cpt) {
std::string result;
- if (/* 0x00 <= cp && */ cp <= 0x7f) {
- result.push_back(cp);
+ if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
+ result.push_back(cpt);
return result;
}
- if (0x80 <= cp && cp <= 0x7ff) {
- result.push_back(0xc0 | ((cp >> 6) & 0x1f));
- result.push_back(0x80 | (cp & 0x3f));
+ if (0x80 <= cpt && cpt <= 0x7ff) {
+ result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
+ result.push_back(0x80 | (cpt & 0x3f));
return result;
}
- if (0x800 <= cp && cp <= 0xffff) {
- result.push_back(0xe0 | ((cp >> 12) & 0x0f));
- result.push_back(0x80 | ((cp >> 6) & 0x3f));
- result.push_back(0x80 | (cp & 0x3f));
+ if (0x800 <= cpt && cpt <= 0xffff) {
+ result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
+ result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+ result.push_back(0x80 | (cpt & 0x3f));
return result;
}
- if (0x10000 <= cp && cp <= 0x10ffff) {
- result.push_back(0xf0 | ((cp >> 18) & 0x07));
- result.push_back(0x80 | ((cp >> 12) & 0x3f));
- result.push_back(0x80 | ((cp >> 6) & 0x3f));
- result.push_back(0x80 | (cp & 0x3f));
+ if (0x10000 <= cpt && cpt <= 0x10ffff) {
+ result.push_back(0xf0 | ((cpt >> 18) & 0x07));
+ result.push_back(0x80 | ((cpt >> 12) & 0x3f));
+ result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+ result.push_back(0x80 | (cpt & 0x3f));
return result;
}
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
return result;
}
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
- static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
static const auto cpt_flags = unicode_cpt_flags_array();
- return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+ return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
}
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
- static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
if (utf8.empty()) {
return undef; // undefined
}
size_t offset = 0;
- return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+ return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
}
std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
return map.at(utf8);
}
-uint32_t unicode_tolower(uint32_t cp) {
+uint32_t unicode_tolower(uint32_t cpt) {
// binary search
- auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+ auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
return pair.first < value;
});
- if (it != unicode_map_lowercase.end() && it->first == cp) {
+ if (it != unicode_map_lowercase.end() && it->first == cpt) {
return it->second;
}
- return cp; // Return the original code point if no lowercase mapping is found
+ return cpt; // Return the original code point if no lowercase mapping is found
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
- { "\\p{N}", codepoint_flags::NUMBER },
- { "\\p{L}", codepoint_flags::LETTER },
- { "\\p{P}", codepoint_flags::PUNCTUATION },
+ { "\\p{N}", unicode_cpt_flags::NUMBER },
+ { "\\p{L}", unicode_cpt_flags::LETTER },
+ { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
};
static const std::map<int, int> k_ucat_cpt = {
- { codepoint_flags::NUMBER, 0xD1 },
- { codepoint_flags::LETTER, 0xD2 },
- { codepoint_flags::PUNCTUATION, 0xD3 },
+ { unicode_cpt_flags::NUMBER, 0xD1 },
+ { unicode_cpt_flags::LETTER, 0xD2 },
+ { unicode_cpt_flags::PUNCTUATION, 0xD3 },
};
static const std::map<int, std::string> k_ucat_map = {
- { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
- { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
- { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+ { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
+ { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
+ { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
};
// compute collapsed codepoints only if needed by at least one regex
bool need_collapse = false;
- for (auto & regex_expr : regex_exprs) {
+ for (const auto & regex_expr : regex_exprs) {
// search for unicode categories
for (const auto & ucat : k_ucat_enum) {
if (std::string::npos != regex_expr.find(ucat.first)) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue;
}
- const auto flags = unicode_cpt_flags(cpts[i]);
+ const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
if (flags.is_whitespace) {
//NOTE: C++ std::regex \s does not match 0x85; Rust and Python regexes do.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
std::vector<size_t> bpe_offsets = { cpts.size() };
- for (auto & regex_expr : regex_exprs) {
+ for (const auto & regex_expr : regex_exprs) {
// first, see if we have an efficient custom regex implementation
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
// with the corresponding collapsed representation
bool use_collapsed = false;
- for (auto & ucat : k_ucat_enum) {
+ for (const auto & ucat : k_ucat_enum) {
if (std::string::npos != regex_expr.find(ucat.first)) {
use_collapsed = true;
break;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
// std::wregex \s does not match non-ASCII whitespaces, using 0x0B as fallback
std::wstring wtext(cpts.begin(), cpts.end());
for (size_t i = 0; i < wtext.size(); ++i) {
- if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+ if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
wtext[i] = 0x0B;
}
}
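
The hunks above are a pure rename (cp -> cpt, codepoint_flags -> unicode_cpt_flags); the UTF-8 encoder keeps the standard 1/2/3/4-byte ranges. For reference, a self-contained sketch of the same encoding logic (illustrative only, not part of the patch; the helper name is made up):

#include <cstdint>
#include <cstdio>
#include <string>

// Encode one Unicode code point as UTF-8, mirroring the ranges used in unicode_cpt_to_utf8.
static std::string cpt_to_utf8_sketch(uint32_t cpt) {
    std::string out;
    if (cpt <= 0x7f) {
        out.push_back(char(cpt));
    } else if (cpt <= 0x7ff) {
        out.push_back(char(0xc0 | ((cpt >> 6) & 0x1f)));
        out.push_back(char(0x80 | ( cpt       & 0x3f)));
    } else if (cpt <= 0xffff) {
        out.push_back(char(0xe0 | ((cpt >> 12) & 0x0f)));
        out.push_back(char(0x80 | ((cpt >>  6) & 0x3f)));
        out.push_back(char(0x80 | ( cpt        & 0x3f)));
    } else if (cpt <= 0x10ffff) {
        out.push_back(char(0xf0 | ((cpt >> 18) & 0x07)));
        out.push_back(char(0x80 | ((cpt >> 12) & 0x3f)));
        out.push_back(char(0x80 | ((cpt >>  6) & 0x3f)));
        out.push_back(char(0x80 | ( cpt        & 0x3f)));
    }
    return out; // empty for out-of-range input; error handling is left out of this sketch
}

int main() {
    printf("U+00E9  -> %zu bytes\n", cpt_to_utf8_sketch(0x00E9).size());  // 2
    printf("U+1F600 -> %zu bytes\n", cpt_to_utf8_sketch(0x1F600).size()); // 4
    return 0;
}
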
diff --git a/src/unicode.h b/src/unicode.h
index 008532a24..c27098df7 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -4,9 +4,7 @@
#include <string>
#include <vector>
-// TODO: prefix all symbols with "llama_"
-
-struct codepoint_flags {
+struct unicode_cpt_flags {
enum {
UNDEFINED = 0x0001,
NUMBER = 0x0002, // regex: \p{N}
@@ -35,7 +33,7 @@ struct codepoint_flags {
uint16_t is_nfd : 1;
// decode from uint16
- inline codepoint_flags(const uint16_t flags=0) {
+ inline unicode_cpt_flags(const uint16_t flags = 0) {
*reinterpret_cast<uint16_t *>(this) = flags;
}
@@ -50,18 +48,19 @@ struct codepoint_flags {
size_t unicode_len_utf8(char src);
-std::string unicode_cpt_to_utf8(uint32_t cp);
-uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+std::string unicode_cpt_to_utf8 (uint32_t cpt);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
-codepoint_flags unicode_cpt_flags(const uint32_t cp);
-codepoint_flags unicode_cpt_flags(const std::string & utf8);
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
std::string unicode_byte_to_utf8(uint8_t byte);
-uint8_t unicode_utf8_to_byte(const std::string & utf8);
+uint8_t unicode_utf8_to_byte(const std::string & utf8);
-uint32_t unicode_tolower(uint32_t cp);
+uint32_t unicode_tolower(uint32_t cpt);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index daeed4564..2b5e5fd4a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -129,6 +129,7 @@ llama_target_and_test(test-arg-parser.cpp)
llama_target_and_test(test-chat-template.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-gguf.cpp)
llama_target_and_test(test-backend-ops.cpp)
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9dd41260a..ccdd3fb57 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2201,7 +2201,15 @@ struct test_rope : public test_case {
ggml_set_name(a, "a");
}
- ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+ ggml_tensor * pos;
+ if (is_mrope || is_vision) {
+ pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
+ } else {
+ pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+ }
ggml_set_name(pos, "pos");
ggml_tensor * freq = nullptr;
@@ -2210,7 +2218,20 @@ struct test_rope : public test_case {
ggml_set_name(freq, "freq");
}
- ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+ ggml_tensor * out;
+ if (is_mrope) {
+ if (is_vision) {
+ GGML_ASSERT(n_dims/4 > 0);
+ int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only uses the first two dimensions for the image (x, y) coordinates
+ out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+ } else {
+ GGML_ASSERT(n_dims/3 > 0);
+ int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
+ out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+ }
+ } else {
+ out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+ }
ggml_set_name(out, "out");
return out;
@@ -2220,11 +2241,12 @@ struct test_rope : public test_case {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) {
// pos
- std::vector<int> data(ne_a[2]);
- for (int i = 0; i < ne_a[2]; i++) {
+ const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
+ std::vector<int> data(num_pos_ids);
+ for (int i = 0; i < num_pos_ids; i++) {
data[i] = rand() % n_ctx;
}
- ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
+ ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
} else {
if (t->ne[0] == n_dims/2) {
// frequency factors in the range [0.9f, 1.1f]
@@ -3527,8 +3549,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
for (ggml_type type_dst : all_types) {
- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
}
}
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
@@ -3813,6 +3835,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
}
+ if (all) {
+ test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B)
+ test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B)
+ test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl ViT)
+ }
+
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
}
}
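
The new M-RoPE/Vision test cases above allocate ne_a[2] * 4 position ids because multi-rope carries four position streams per token (the rope_sections entries split the rotated dimensions among them). A minimal standalone sketch of that position layout, assuming the plane-by-plane ordering used by the tests (pos[i + n_tokens*j] is the id of token i in stream j); sizes and values here are made up:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens  = 2; // corresponds to ne_a[2] in test_rope
    const int n_streams = 4; // m-rope: four position ids per token

    std::vector<int32_t> pos(n_tokens * n_streams);
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_streams; ++j) {
            pos[i + n_tokens*j] = 100 + i + j; // arbitrary demo positions
        }
    }

    for (int j = 0; j < n_streams; ++j) {
        printf("stream %d:", j);
        for (int i = 0; i < n_tokens; ++i) {
            printf(" %d", (int) pos[i + n_tokens*j]);
        }
        printf("\n");
    }
    return 0;
}
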
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index aa140b569..30a910ad5 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -75,6 +75,8 @@ int main(void) {
"{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n",
// mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
"{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
+ // ai-sage/GigaChat-20B-A3B-instruct
+ "{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
@@ -129,6 +131,8 @@ int main(void) {
"[INST]You are a helpful assistant\n\nHello[/INST]Hi there[INST]Who are you[/INST] I am an assistant [INST]Another question[/INST]",
// mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
"[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there[INST] Who are you[/INST] I am an assistant [INST] Another question[/INST]",
+ // ai-sage/GigaChat-20B-A3B-instruct
+ "You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
};
std::vector<char> formatted_chat(1024);
int32_t res;
@@ -190,6 +194,7 @@ int main(void) {
assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+ assert(fmt_sys("gigachat") == "You are a helpful assistant<|message_sep|>");
// test llama_chat_format_single for user message
@@ -214,6 +219,7 @@ int main(void) {
assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates
assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n");
assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
+ assert(fmt_single("gigachat") == "user<|role_sep|>How are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>");
printf("Test chat templates: OK\n");
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
new file mode 100644
index 000000000..e5b4cb7b8
--- /dev/null
+++ b/tests/test-gguf.cpp
@@ -0,0 +1,1303 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "../ggml/src/ggml-impl.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <random>
+#include <string>
+#include <vector>
+
+constexpr int offset_has_kv = 1000;
+constexpr int offset_has_tensors = 2000;
+constexpr int offset_has_data = 3000;
+
+enum handcrafted_file_type {
+ HANDCRAFTED_HEADER_BAD_MAGIC = 10,
+ HANDCRAFTED_HEADER_BAD_VERSION_1 = 20,
+ HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30,
+ HANDCRAFTED_HEADER_BAD_N_TENSORS = 40,
+ HANDCRAFTED_HEADER_BAD_N_KV = 50,
+ HANDCRAFTED_HEADER_EMPTY = 800,
+
+ HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv,
+ HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv,
+ HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv,
+ HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv,
+ HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv,
+
+ HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_ALIGNMENT = 80 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors,
+
+ HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data,
+ HANDCRAFTED_DATA_BAD_ALIGNMENT = 20 + offset_has_data,
+ HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data,
+ HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
+};
+
+std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
+ switch (hft) {
+ case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC";
+ case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
+ case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE";
+ case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV";
+ case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS";
+ case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY";
+
+ case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE";
+ case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE";
+ case HANDCRAFTED_KV_BAD_VALUE_SIZE: return "KV_BAD_VALUE_SIZE";
+ case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY";
+ case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV";
+
+ case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE";
+ case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS";
+ case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE";
+ case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG";
+ case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE";
+ case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET";
+ case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME";
+ case HANDCRAFTED_TENSORS_BAD_ALIGNMENT: return "TENSORS_BAD_ALIGNMENT";
+ case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS";
+ case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN";
+
+ case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA";
+ case HANDCRAFTED_DATA_BAD_ALIGNMENT: return "DATA_BAD_ALIGNMENT";
+ case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS";
+ case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN";
+ }
+ GGML_ABORT("fatal error");
+}
+
+static bool expect_context_not_null(const enum handcrafted_file_type hft) {
+ if (hft < offset_has_kv) {
+ return hft >= HANDCRAFTED_HEADER_EMPTY;
+ }
+ if (hft < offset_has_tensors) {
+ return hft >= HANDCRAFTED_KV_SUCCESS;
+ }
+ if (hft < offset_has_data) {
+ return hft >= HANDCRAFTED_TENSORS_SUCCESS;
+ }
+ return hft >= HANDCRAFTED_DATA_SUCCESS;
+}
+
+typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;
+
+std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
+ std::vector<tensor_config_t> tensor_configs;
+ tensor_configs.reserve(100);
+
+ for (int i = 0; i < 100; ++i) {
+ const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+ if (ggml_type_size(type) == 0) {
+ continue;
+ }
+
+ std::array<int64_t, GGML_MAX_DIMS> shape = {1, 1, 1, 1};
+ shape[0] = (1 + rng() % 10) * ggml_blck_size(type);
+ const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+ for (int i = 1; i < n_dims; ++i) {
+ shape[i] = 1 + rng() % 10;
+ }
+
+ tensor_configs.push_back(std::make_pair(type, shape));
+ }
+
+ return tensor_configs;
+}
+
+std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+ kv_types.reserve(100);
+
+ for (int i = 0; i < 100; ++i) {
+ const gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
+
+ if (type == GGUF_TYPE_ARRAY) {
+ const gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT);
+ if (type_arr == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ kv_types.push_back(std::make_pair(type, type_arr));
+ continue;
+ }
+
+ kv_types.push_back(std::make_pair(type, gguf_type(-1)));
+ }
+ std::shuffle(kv_types.begin(), kv_types.end(), rng);
+
+ return kv_types;
+}
+
+static void helper_write(const void * data, const size_t nbytes, FILE * file) {
+ GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
+}
+
+static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
+ FILE * file = tmpfile();
+
+ std::mt19937 rng(seed);
+
+ if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) {
+ const char bad_magic[4] = {'F', 'U', 'G', 'G'};
+ helper_write(bad_magic, sizeof(bad_magic), file);
+ } else {
+ helper_write(GGUF_MAGIC, 4, file);
+ }
+
+ if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
+ const uint32_t version = 1;
+ helper_write(&version, sizeof(version), file);
+ } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
+ const uint32_t version = GGUF_VERSION + 1;
+ helper_write(&version, sizeof(version), file);
+ } else {
+ const uint32_t version = GGUF_VERSION;
+ helper_write(&version, sizeof(version), file);
+ }
+
+ std::vector<tensor_config_t> tensor_configs;
+ if (hft >= offset_has_tensors) {
+ tensor_configs = get_tensor_configs(rng);
+ }
+
+ if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) {
+ const uint64_t n_tensors = -1;
+ helper_write(&n_tensors, sizeof(n_tensors), file);
+ } else {
+ const uint64_t n_tensors = tensor_configs.size();
+ helper_write(&n_tensors, sizeof(n_tensors), file);
+ }
+
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+ if (hft >= offset_has_kv) {
+ kv_types = get_kv_types(rng);
+ }
+ {
+ uint64_t n_kv = kv_types.size();
+ if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+ n_kv += 1;
+ } else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
+ n_kv = -1;
+ }
+ helper_write(&n_kv, sizeof(n_kv), file);
+ }
+
+ if (hft < offset_has_kv) {
+ for (int i = 0; i < extra_bytes; ++i) {
+ const char tmp = 0;
+ helper_write(&tmp, sizeof(tmp), file);
+ }
+ rewind(file);
+ return file;
+ }
+
+ for (int i = 0; i < int(kv_types.size()); ++i) {
+ const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].first);
+ const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].second);
+
+ const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i));
+
+ if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
+ const uint64_t n = -1;
+ helper_write(&n, sizeof(n), file);
+ } else {
+ const uint64_t n = key.length();
+ helper_write(&n, sizeof(n), file);
+ }
+ helper_write(key.data(), key.length(), file);
+
+ {
+ const int32_t type32 = int32_t(type);
+ helper_write(&type32, sizeof(type32), file);
+ }
+
+ uint32_t data[16];
+ for (int j = 0; j < 16; ++j) {
+ data[j] = rng();
+ if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+ data[j] |= 0x01010101; // avoid random null-termination of string
+ }
+ }
+
+ if (type == GGUF_TYPE_STRING) {
+ const uint64_t n = rng() % sizeof(data);
+ helper_write(&n, sizeof(n), file);
+ helper_write(data, n, file);
+ continue;
+ }
+
+ if (type == GGUF_TYPE_ARRAY) {
+ {
+ const int32_t type32 = int32_t(type_arr);
+ helper_write(&type32, sizeof(type32), file);
+ }
+ if (type_arr == GGUF_TYPE_STRING) {
+ const uint64_t nstr = rng() % (16 + 1);
+ helper_write(&nstr, sizeof(nstr), file);
+ for (uint64_t istr = 0; istr < nstr; ++istr) {
+ const uint64_t n = rng() % (sizeof(uint32_t) + 1);
+ helper_write(&n, sizeof(n), file);
+ helper_write(&data[istr], n, file);
+ }
+ continue;
+ }
+ const size_t type_size = gguf_type_size(type_arr);
+ const uint64_t n = (rng() % sizeof(data)) / type_size;
+ helper_write(&n, sizeof(n), file);
+ helper_write(&data, n*type_size, file);
+ continue;
+ }
+
+ size_t type_size = hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type);
+ if (hft == HANDCRAFTED_KV_BAD_VALUE_SIZE) {
+ type_size += rng() % 3;
+ }
+ helper_write(data, type_size, file);
+ }
+
+ if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+ const std::string key = "general.alignment";
+ {
+ const uint64_t n = key.length();
+ helper_write(&n, sizeof(n), file);
+ }
+ helper_write(key.data(), key.length(), file);
+
+ const int32_t type = gguf_type(GGUF_TYPE_UINT32);
+ helper_write(&type, sizeof(type), file);
+
+ const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT + 1;
+ helper_write(&alignment, sizeof(alignment), file);
+ }
+
+ if (hft < offset_has_tensors) {
+ for (int i = 0; i < extra_bytes; ++i) {
+ const char tmp = 0;
+ helper_write(&tmp, sizeof(tmp), file);
+ }
+ rewind(file);
+ return file;
+ }
+
+ uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
+ if (hft == HANDCRAFTED_TENSORS_BAD_ALIGNMENT || hft == HANDCRAFTED_DATA_BAD_ALIGNMENT) {
+ alignment -= 1;
+ } else if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+ alignment += 1;
+ }
+
+ uint64_t offset = 0;
+ for (int i = 0; i < int(tensor_configs.size()); ++i) {
+ const ggml_type type = tensor_configs[i].first;
+ const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+ std::string name = "my_tensor";
+ if (hft != HANDCRAFTED_TENSORS_DUPLICATE_NAME) {
+ name += "_" + std::to_string(i);
+ }
+ if (hft == HANDCRAFTED_TENSORS_BAD_NAME_SIZE) {
+ name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors";
+ GGML_ASSERT(name.length() >= GGML_MAX_NAME);
+ }
+ {
+ const uint64_t n = name.length();
+ helper_write(&n, sizeof(n), file);
+ }
+ helper_write(name.data(), name.length(), file);
+
+ uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1;
+ for (int i = GGML_MAX_DIMS-1; i >= 1; --i) {
+ if (shape[i] != 1) {
+ n_dims = i + 1;
+ break;
+ }
+ }
+ if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) {
+ const uint32_t n_dims_bad = GGML_MAX_DIMS + 1;
+ helper_write(&n_dims_bad, sizeof(n_dims_bad), file);
+ } else {
+ helper_write(&n_dims, sizeof(n_dims), file);
+ }
+
+ if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) {
+ for (uint32_t j = 0; j < n_dims; ++j) {
+ const int64_t bad_dim = -1;
+ helper_write(&bad_dim, sizeof(bad_dim), file);
+ }
+ } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){
+ for (uint32_t j = 0; j < n_dims; ++j) {
+ const int64_t big_dim = 4*int64_t(INT32_MAX);
+ helper_write(&big_dim, sizeof(big_dim), file);
+ }
+ } else {
+ helper_write(shape.data(), n_dims*sizeof(int64_t), file);
+ }
+
+ {
+ const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? -1 : int32_t(type);
+ helper_write(&type32, sizeof(type32), file);
+ }
+
+ if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) {
+ const uint64_t bad_offset = -1;
+ helper_write(&bad_offset, sizeof(bad_offset), file);
+ } else {
+ helper_write(&offset, sizeof(offset), file);
+ }
+
+ int64_t ne = shape[0];
+ for (uint32_t i = 1; i < n_dims; ++i) {
+ ne *= shape[i];
+ }
+ offset += GGML_PAD(ggml_row_size(type, ne), alignment);
+ }
+
+ const uint32_t alignment_overshoot = ftell(file) % alignment;
+ if (alignment_overshoot != 0) {
+ for (size_t i = alignment_overshoot; i < alignment; ++i) {
+ const char pad = 0;
+ helper_write(&pad, sizeof(pad), file);
+ }
+ }
+
+ if (hft >= offset_has_data) {
+ rng.seed(seed + 1);
+ uint64_t nbytes = offset;
+ if (hft == HANDCRAFTED_DATA_NOT_ENOUGH_DATA) {
+ nbytes -= 1;
+ }
+ for (uint64_t i = 0; i < nbytes; ++i) {
+ const uint8_t random_byte = i % 256;
+ helper_write(&random_byte, sizeof(random_byte), file);
+ }
+ }
+
+ for (int i = 0; i < extra_bytes; ++i) {
+ const char tmp = 0;
+ helper_write(&tmp, sizeof(tmp), file);
+ }
+ rewind(file);
+ return file;
+}
+
+static bool handcrafted_check_header(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_kv, const bool has_tensors, const bool alignment_defined) {
+ if (!gguf_ctx) {
+ return false;
+ }
+
+ std::mt19937 rng(seed);
+
+ std::vector<tensor_config_t> tensor_configs;
+ if (has_tensors) {
+ tensor_configs = get_tensor_configs(rng);
+ }
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+ if (has_kv) {
+ kv_types = get_kv_types(rng);
+ }
+
+ bool ok = true;
+
+ if (gguf_get_version(gguf_ctx) != GGUF_VERSION) {
+ ok = false;
+ }
+ if (gguf_get_n_tensors(gguf_ctx) != int(tensor_configs.size())) {
+ ok = false;
+ }
+ if (gguf_get_n_kv(gguf_ctx) != int(alignment_defined ? kv_types.size() + 1 : kv_types.size())) {
+ ok = false;
+ }
+
+ return ok;
+}
+
+static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_tensors, const bool alignment_defined) {
+ if (!gguf_ctx) {
+ return false;
+ }
+
+ std::mt19937 rng(seed);
+
+ std::vector<tensor_config_t> tensor_configs;
+ if (has_tensors) {
+ tensor_configs = get_tensor_configs(rng);
+ }
+
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types = get_kv_types(rng);
+
+ bool ok = true;
+
+ for (int i = 0; i < int(kv_types.size()); ++i) {
+ const enum gguf_type type = gguf_type(kv_types[i].first);
+ const enum gguf_type type_arr = gguf_type(kv_types[i].second);
+
+ const std::string key = "my_key_" + std::to_string(i);
+
+ uint32_t data[16];
+ for (int j = 0; j < 16; ++j) {
+ data[j] = rng();
+ if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+ data[j] |= 0x01010101; // avoid random null-termination of string
+ }
+ }
+
+ const char * data8 = reinterpret_cast<const char *>(data);
+ const int id = gguf_find_key(gguf_ctx, key.c_str());
+
+ if (type == GGUF_TYPE_STRING) {
+ const char * str = gguf_get_val_str(gguf_ctx, id);
+ const uint64_t n = strlen(str);
+ const uint64_t n_expected = rng() % sizeof(data);
+ if (n != n_expected) {
+ ok = false;
+ continue;
+ }
+ if (!std::equal(str, str + n, data8)) {
+ ok = false;
+ }
+ continue;
+ }
+
+ if (type == GGUF_TYPE_ARRAY) {
+ const size_t type_size = gguf_type_size(type_arr);
+ const uint64_t arr_n = gguf_get_arr_n(gguf_ctx, id);
+
+ if (type_arr == GGUF_TYPE_STRING) {
+ const uint64_t nstr_expected = rng() % (16 + 1);
+ if (arr_n != nstr_expected) {
+ ok = false;
+ continue;
+ }
+ for (uint64_t istr = 0; istr < nstr_expected; ++istr) {
+ const char * str = gguf_get_arr_str(gguf_ctx, id, istr);
+ const uint64_t n = strlen(str);
+ const uint64_t n_expected = rng() % (sizeof(uint32_t) + 1);
+
+ if (n != n_expected) {
+ ok = false;
+ continue;
+ }
+ const char * str_expected = reinterpret_cast<const char *>(&data[istr]);
+ if (strncmp(str, str_expected, n) != 0) {
+ ok = false;
+ continue;
+ }
+ }
+ continue;
+ }
+
+ const uint64_t arr_n_expected = (rng() % sizeof(data)) / type_size;
+ if (arr_n != arr_n_expected) {
+ ok = false;
+ continue;
+ }
+
+ const char * data_gguf = reinterpret_cast<const char *>(gguf_get_arr_data(gguf_ctx, id));
+ if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) {
+ ok = false;
+ }
+ continue;
+ }
+
+ const char * data_gguf = reinterpret_cast<const char *>(gguf_get_val_data(gguf_ctx, id));
+ if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) {
+ ok = false;
+ }
+ }
+
+ const uint32_t expected_alignment = alignment_defined ? GGUF_DEFAULT_ALIGNMENT + 1 : GGUF_DEFAULT_ALIGNMENT;
+ if (gguf_get_alignment(gguf_ctx) != expected_alignment) {
+ ok = false;
+ }
+
+ return ok;
+}
+
+static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsigned int seed) {
+ if (!gguf_ctx) {
+ return false;
+ }
+
+ std::mt19937 rng(seed);
+
+ std::vector<tensor_config_t> tensor_configs = get_tensor_configs(rng);
+
+ // Call get_kv_types to get the same RNG state:
+ get_kv_types(rng);
+
+ bool ok = true;
+
+ const int id_alignment = gguf_find_key(gguf_ctx, "general.alignment");
+ const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT;
+
+ uint64_t expected_offset = 0;
+ for (int i = 0; i < int(tensor_configs.size()); ++i) {
+ const ggml_type type = tensor_configs[i].first;
+ const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+ const std::string name = "my_tensor_" + std::to_string(i);
+ const int id = gguf_find_tensor(gguf_ctx, name.c_str());
+
+ if (id >= 0) {
+ if (std::string(gguf_get_tensor_name(gguf_ctx, id)) != name) {
+ ok = false;
+ }
+
+ if (gguf_get_tensor_type(gguf_ctx, id) != type) {
+ ok = false;
+ }
+ } else {
+ ok = false;
+ continue;
+ }
+
+ const size_t offset = gguf_get_tensor_offset(gguf_ctx, id);
+
+ if (offset != expected_offset) {
+ ok = false;
+ }
+
+ int64_t ne = shape[0];
+ for (size_t j = 1; j < GGML_MAX_DIMS; ++j) {
+ ne *= shape[j];
+ }
+ expected_offset += GGML_PAD(ggml_row_size(type, ne), alignment);
+ }
+
+ return ok;
+}
+
+static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const unsigned int seed, FILE * file) {
+ if (!gguf_ctx) {
+ return false;
+ }
+
+ std::mt19937 rng(seed);
+
+ std::vector<tensor_config_t> tensor_configs = get_tensor_configs(rng);
+
+ bool ok = true;
+
+ const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
+
+ for (int i = 0; i < int(tensor_configs.size()); ++i) {
+ const ggml_type type = tensor_configs[i].first;
+ const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+ int64_t ne = shape[0];
+ for (size_t j = 1; j < GGML_MAX_DIMS; ++j) {
+ ne *= shape[j];
+ }
+ const size_t size = ggml_row_size(type, ne);
+
+ const std::string name = "my_tensor_" + std::to_string(i);
+ const size_t offset = gguf_get_tensor_offset(gguf_ctx, gguf_find_tensor(gguf_ctx, name.c_str()));
+
+ std::vector<uint8_t> data(size);
+ GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0);
+ GGML_ASSERT(fread(data.data(), 1, size, file) == size);
+
+ for (size_t j = 0; j < size; ++j) {
+ const uint8_t expected_byte = (j + offset) % 256;
+ if (data[j] != expected_byte) {
+ ok = false;
+ }
+ }
+ }
+
+ return ok;
+}
+
+static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
+ int npass = 0;
+ int ntest = 0;
+
+ const std::vector<handcrafted_file_type> hfts = {
+ HANDCRAFTED_HEADER_BAD_MAGIC,
+ HANDCRAFTED_HEADER_BAD_VERSION_1,
+ // HANDCRAFTED_HEADER_BAD_VERSION_FUTURE, // FIXME
+ HANDCRAFTED_HEADER_BAD_N_KV,
+ HANDCRAFTED_HEADER_BAD_N_TENSORS,
+ HANDCRAFTED_HEADER_EMPTY,
+
+ HANDCRAFTED_KV_BAD_KEY_SIZE,
+ HANDCRAFTED_KV_BAD_TYPE,
+ HANDCRAFTED_KV_BAD_VALUE_SIZE,
+ // HANDCRAFTED_KV_DUPLICATE_KEY, // FIXME
+ HANDCRAFTED_KV_SUCCESS,
+
+ HANDCRAFTED_TENSORS_BAD_NAME_SIZE,
+ HANDCRAFTED_TENSORS_BAD_N_DIMS,
+ HANDCRAFTED_TENSORS_BAD_SHAPE,
+ HANDCRAFTED_TENSORS_NE_TOO_BIG,
+ HANDCRAFTED_TENSORS_BAD_TYPE,
+ // HANDCRAFTED_TENSORS_BAD_OFFSET, // FIXME
+ HANDCRAFTED_TENSORS_DUPLICATE_NAME,
+ // HANDCRAFTED_TENSORS_BAD_ALIGNMENT, // FIXME
+ HANDCRAFTED_TENSORS_SUCCESS,
+ HANDCRAFTED_TENSORS_CUSTOM_ALIGN,
+
+ HANDCRAFTED_DATA_NOT_ENOUGH_DATA,
+ // HANDCRAFTED_DATA_BAD_ALIGNMENT, // FIXME
+ HANDCRAFTED_DATA_SUCCESS,
+ HANDCRAFTED_DATA_CUSTOM_ALIGN,
+ };
+
+ for (enum handcrafted_file_type hft : hfts) {
+ printf("%s: handcrafted_file_type=%s\n", __func__, handcrafted_file_type_name(hft).c_str());
+ FILE * file = get_handcrafted_file(seed, hft);
+
+#ifdef _WIN32
+ if (!file) {
+ printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
+ printf("%s: skipping tests");
+ continue;
+ }
+#else
+ GGML_ASSERT(file);
+#endif // _WIN32
+
+ struct ggml_context * ctx = nullptr;
+ struct gguf_init_params gguf_params = {
+ /*no_alloc =*/ false,
+ /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
+ };
+ struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
+
+ if (expect_context_not_null(hft)) {
+ printf("%s: - context_not_null: ", __func__);
+ } else {
+ printf("%s: - context_null: ", __func__);
+ }
+ if (bool(gguf_ctx) == expect_context_not_null(hft)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ if (false && hft >= offset_has_data && !expect_context_not_null(hft)) { // FIXME
+ printf("%s: - no_dangling_ggml_context_pointer: ", __func__);
+ if (ctx) {
+ printf("\033[1;31mFAIL\033[0m\n");
+ } else {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ }
+ ntest++;
+ }
+
+ if (false && expect_context_not_null(hft)) { // FIXME
+ FILE * file_eb = get_handcrafted_file(seed, hft, /*extra_bytes =*/ 1);
+ struct gguf_context * gguf_ctx_eb = gguf_init_from_file_impl(file_eb, gguf_params);
+
+ printf("%s: - context_null_with_extra_bytes: ", __func__);
+ if (gguf_ctx_eb) {
+ printf("\033[1;31mFAIL\033[0m\n");
+ } else {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ }
+ ntest++;
+
+ gguf_free(gguf_ctx_eb);
+ fclose(file_eb);
+ }
+
+ const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN;
+
+ if (expect_context_not_null(hft)) {
+ printf("%s: - check_header: ", __func__);
+ if (handcrafted_check_header(gguf_ctx, seed, hft >= offset_has_kv, hft >= offset_has_tensors, alignment_defined)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+ }
+
+ if (expect_context_not_null(hft) && hft >= offset_has_kv) {
+ printf("%s: - check_kv: ", __func__);
+ if (handcrafted_check_kv(gguf_ctx, seed, hft >= offset_has_tensors, alignment_defined)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+ }
+
+ if (expect_context_not_null(hft) && hft >= offset_has_tensors) {
+ printf("%s: - check_tensors: ", __func__);
+ if (handcrafted_check_tensors(gguf_ctx, seed)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+ }
+
+ if (expect_context_not_null(hft) && hft >= offset_has_data) {
+ printf("%s: - check_tensor_data: ", __func__);
+ if (handcrafted_check_tensor_data(gguf_ctx, seed, file)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+ }
+
+ if (gguf_ctx) {
+ ggml_free(ctx);
+ gguf_free(gguf_ctx);
+ }
+ fclose(file);
+ printf("\n");
+ }
+
+ return std::make_pair(npass, ntest);
+}
+
+struct random_gguf_context_result {
+ struct gguf_context * gguf_ctx;
+ struct ggml_context * ctx;
+ ggml_backend_buffer_t buffer;
+};
+
+static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t backend, const unsigned int seed) {
+ std::mt19937 rng(seed);
+
+ struct gguf_context * gguf_ctx = gguf_init_empty();
+
+ for (int i = 0; i < 256; ++i) {
+ const std::string key = "my_key_" + std::to_string(rng() % 1024);
+ const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
+
+ if (type == GGUF_TYPE_STRING || type == GGUF_TYPE_ARRAY) {
+ continue; // FIXME memory leak
+ }
+
+ switch (type) {
+ case GGUF_TYPE_UINT8: gguf_set_val_u8 (gguf_ctx, key.c_str(), rng() % (1 << 7)); break;
+ case GGUF_TYPE_INT8: gguf_set_val_i8 (gguf_ctx, key.c_str(), rng() % (1 << 7) - (1 << 6)); break;
+ case GGUF_TYPE_UINT16: gguf_set_val_u16 (gguf_ctx, key.c_str(), rng() % (1 << 15)); break;
+ case GGUF_TYPE_INT16: gguf_set_val_i16 (gguf_ctx, key.c_str(), rng() % (1 << 15) - (1 << 14)); break;
+ case GGUF_TYPE_UINT32: gguf_set_val_u32 (gguf_ctx, key.c_str(), rng()); break;
+ case GGUF_TYPE_INT32: gguf_set_val_i32 (gguf_ctx, key.c_str(), rng() - (1 << 30)); break;
+ case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024 - 512); break;
+ case GGUF_TYPE_BOOL: gguf_set_val_bool(gguf_ctx, key.c_str(), rng() % 2 == 0); break;
+ case GGUF_TYPE_STRING: gguf_set_val_str (gguf_ctx, key.c_str(), std::to_string(rng()).c_str()); break;
+ case GGUF_TYPE_UINT64: gguf_set_val_u64 (gguf_ctx, key.c_str(), rng()); break;
+ case GGUF_TYPE_INT64: gguf_set_val_i64 (gguf_ctx, key.c_str(), rng() - (1 << 30)); break;
+ case GGUF_TYPE_FLOAT64: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024 - 512); break;
+ case GGUF_TYPE_ARRAY: {
+ const enum gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT);
+ const uint64_t ne = rng() % 1024;
+
+ switch (type_arr) {
+ case GGUF_TYPE_UINT8:
+ case GGUF_TYPE_INT8:
+ case GGUF_TYPE_UINT16:
+ case GGUF_TYPE_INT16:
+ case GGUF_TYPE_UINT32:
+ case GGUF_TYPE_INT32:
+ case GGUF_TYPE_FLOAT32:
+ case GGUF_TYPE_BOOL:
+ case GGUF_TYPE_UINT64:
+ case GGUF_TYPE_INT64:
+ case GGUF_TYPE_FLOAT64: {
+ const size_t nbytes = ne*gguf_type_size(type_arr);
+ std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
+ for (size_t j = 0; j < random_data.size(); ++j) {
+ random_data[j] = rng();
+ }
+ gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne);
+ } break;
+ case GGUF_TYPE_STRING: {
+ std::vector<std::string> data_cpp(ne);
+ std::vector<const char *> data_c(ne);
+ for (size_t j = 0; j < data_cpp.size(); ++j) {
+ data_cpp[j] = std::to_string(rng());
+ data_c[j] = data_cpp[j].c_str();
+ }
+ gguf_set_arr_str(gguf_ctx, key.c_str(), data_c.data(), ne);
+ } break;
+ case GGUF_TYPE_ARRAY: {
+ break; // not supported
+ }
+ case GGUF_TYPE_COUNT:
+ default: {
+ GGML_ABORT("fatal error");
+ } break;
+ }
+ } break;
+ case GGUF_TYPE_COUNT:
+ default: {
+ GGML_ABORT("fatal error");
+ } break;
+ }
+ }
+
+ struct ggml_init_params ggml_params = {
+ /*.mem_size =*/ 256*ggml_tensor_overhead(),
+ /*.mem_buffer =*/ nullptr,
+ /*.no_alloc =*/ true,
+ };
+ struct ggml_context * ctx = ggml_init(ggml_params);
+
+ for (int i = 0; i < 256; ++i) {
+ const std::string name = "my_tensor_" + std::to_string(i);
+ const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+ const size_t type_size = ggml_type_size(type);
+
+ if (type_size == 0) {
+ continue;
+ }
+
+ const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+ int64_t ne[GGML_MAX_DIMS];
+ ne[0] = (1 + rng() % 10) * ggml_blck_size(type);
+ for (int j = 1; j < n_dims; ++j) {
+ ne[j] = 1 + rng() % 10;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(ctx, type, n_dims, ne);
+ ggml_set_name(tensor, name.c_str());
+ }
+
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ const size_t nbytes = ggml_nbytes(t);
+ std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
+ for (size_t j = 0; j < random_data.size(); ++j) {
+ random_data[j] = rng();
+ }
+ ggml_backend_tensor_set(t, random_data.data(), 0, nbytes);
+
+ gguf_add_tensor(gguf_ctx, t);
+ }
+
+ return {gguf_ctx, ctx, buf};
+}
+
+static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other) {
+ bool ok = true;
+
+ const int n_kv = gguf_get_n_kv(ctx);
+ for (int id = 0; id < n_kv; ++id) {
+ const char * name = gguf_get_key(ctx, id);
+
+ const int idx_other = gguf_find_key(other, name);
+ if (idx_other < 0) {
+ ok = false;
+ continue;
+ }
+
+ const gguf_type type = gguf_get_kv_type(ctx, id);
+ if (type != gguf_get_kv_type(other, idx_other)) {
+ ok = false;
+ continue;
+ }
+
+ if (type == GGUF_TYPE_ARRAY) {
+ const int arr_n = gguf_get_arr_n(ctx, id);
+ if (arr_n != gguf_get_arr_n(other, idx_other)) {
+ ok = false;
+ continue;
+ }
+
+ const gguf_type type_arr = gguf_get_arr_type(ctx, id);
+ if (type_arr != gguf_get_arr_type(other, idx_other)) {
+ ok = false;
+ continue;
+ }
+
+ if (type_arr == GGUF_TYPE_STRING) {
+ for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
+ const std::string str = gguf_get_arr_str(ctx, id, arr_i);
+ const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i);
+ if (str != str_other) {
+ ok = false;
+ }
+ }
+ continue;
+ }
+
+ const char * data = reinterpret_cast<const char *>(gguf_get_arr_data(ctx, id));
+ const char * data_other = reinterpret_cast<const char *>(gguf_get_arr_data(other, idx_other));
+ if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) {
+ ok = false;
+ }
+ continue;
+ }
+
+ if (type == GGUF_TYPE_STRING) {
+ const std::string str = gguf_get_val_str(ctx, id);
+ const std::string str_other = gguf_get_val_str(other, idx_other);
+ if (str != str_other) {
+ ok = false;
+ }
+ continue;
+ }
+
+ const char * data = reinterpret_cast<const char *>(gguf_get_val_data(ctx, id));
+ const char * data_other = reinterpret_cast<const char *>(gguf_get_val_data(other, idx_other));
+ if (!std::equal(data, data + gguf_type_size(type), data_other)) {
+ ok = false;
+ }
+ }
+
+ return ok;
+}
+
+static bool all_tensors_in_other(const gguf_context * ctx, const gguf_context * other) {
+ bool ok = true;
+
+ const int n_tensors = gguf_get_n_tensors(ctx);
+ for (int id = 0; id < n_tensors; ++id) {
+ const std::string name = gguf_get_tensor_name(ctx, id);
+
+ const int idx_other = gguf_find_tensor(other, name.c_str());
+ if (id != idx_other) {
+ ok = false;
+ if (idx_other < 0) {
+ continue;
+ }
+ }
+
+ const ggml_type type = gguf_get_tensor_type(ctx, id);
+ if (type != gguf_get_tensor_type(other, id)) {
+ ok = false;
+ }
+
+ const size_t offset = gguf_get_tensor_offset(ctx, id);
+ if (offset != gguf_get_tensor_offset(other, id)) {
+ ok = false;
+ }
+ }
+
+ return ok;
+}
+
+static bool same_tensor_data(const struct ggml_context * orig, const struct ggml_context * read) {
+ bool ok = true;
+
+ struct ggml_tensor * t_orig = ggml_get_first_tensor(orig);
+ struct ggml_tensor * t_read = ggml_get_first_tensor(read);
+ while (t_orig) {
+ if (!t_read) {
+ ok = false;
+ break;
+ }
+
+ const size_t nbytes = ggml_nbytes(t_orig);
+ if (ggml_nbytes(t_read) != nbytes) {
+ ok = false;
+ break;
+ }
+ std::vector<uint8_t> data_orig(nbytes);
+ ggml_backend_tensor_get(t_orig, data_orig.data(), 0, nbytes);
+ if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast<const uint8_t *>(t_read->data))) {
+ ok = false;
+ }
+
+ t_orig = ggml_get_next_tensor(orig, t_orig);
+ t_read = ggml_get_next_tensor(read, t_read);
+ }
+ if (t_read) {
+ ok = false;
+ }
+
+ return ok;
+}
+
+static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
+ FILE * file = tmpfile();
+#ifdef _WIN32
+ if (!file) {
+ printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
+ printf("%s: skipping tests");
+ return std::make_pair(0, 0);
+ }
+#else
+ GGML_ASSERT(file);
+#endif // _WIN32
+
+ if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+ return std::make_pair(0, 0); // FIXME
+ }
+
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+ printf("%s: device=%s, backend=%s, only_meta=%s\n",
+ __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
+
+ int npass = 0;
+ int ntest = 0;
+
+ struct gguf_context * gguf_ctx_0;
+ struct ggml_context * ctx_0;
+ ggml_backend_buffer_t bbuf;
+ {
+ struct random_gguf_context_result result = get_random_gguf_context(backend, seed);
+ gguf_ctx_0 = result.gguf_ctx;
+ ctx_0 = result.ctx;
+ bbuf = result.buffer;
+ }
+
+ struct gguf_buf gbuf = gguf_buf_init(16 * 1024);
+ gguf_write_to_buf(gguf_ctx_0, &gbuf, only_meta);
+ helper_write(gbuf.data, gbuf.offset, file);
+ rewind(file);
+
+ struct ggml_context * ctx_1 = nullptr;
+ struct gguf_init_params gguf_params = {
+ /*no_alloc =*/ false,
+ /*ctx =*/ only_meta ? nullptr : &ctx_1,
+ };
+ struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
+
+ printf("%s: same_version: ", __func__);
+ if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: same_n_kv: ", __func__);
+ if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: same_n_tensors: ", __func__);
+ if (gguf_get_n_tensors(gguf_ctx_0) == gguf_get_n_tensors(gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_orig_kv_in_read: ", __func__);
+ if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_read_kv_in_orig: ", __func__);
+ if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_orig_tensors_in_read: ", __func__);
+ if (all_tensors_in_other(gguf_ctx_0, gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_read_tensors_in_orig: ", __func__);
+ if (all_tensors_in_other(gguf_ctx_1, gguf_ctx_0)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ if (!only_meta) {
+ printf("%s: same_tensor_data: ", __func__);
+ if (same_tensor_data(ctx_0, ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+ }
+
+ ggml_backend_buffer_free(bbuf);
+ ggml_free(ctx_0);
+ ggml_free(ctx_1);
+ gguf_free(gguf_ctx_0);
+ gguf_free(gguf_ctx_1);
+ gguf_buf_free(gbuf);
+ ggml_backend_free(backend);
+ GGML_ASSERT(fclose(file) == 0);
+
+ printf("\n");
+ return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_gguf_set_kv(ggml_backend_dev_t dev, const unsigned int seed) {
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+ printf("%s: device=%s, backend=%s\n", __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend));
+
+ int npass = 0;
+ int ntest = 0;
+
+ struct gguf_context * gguf_ctx_0;
+ struct ggml_context * ctx_0;
+ ggml_backend_buffer_t bbuf_0;
+ {
+ struct random_gguf_context_result result = get_random_gguf_context(backend, seed);
+ gguf_ctx_0 = result.gguf_ctx;
+ ctx_0 = result.ctx;
+ bbuf_0 = result.buffer;
+ }
+
+ struct gguf_context * gguf_ctx_1;
+ struct ggml_context * ctx_1;
+ ggml_backend_buffer_t bbuf_1;
+ {
+ struct random_gguf_context_result result = get_random_gguf_context(backend, seed + 1);
+ gguf_ctx_1 = result.gguf_ctx;
+ ctx_1 = result.ctx;
+ bbuf_1 = result.buffer;
+ }
+
+ struct gguf_context * gguf_ctx_2 = gguf_init_empty();
+
+ gguf_set_kv(gguf_ctx_1, gguf_ctx_0);
+ gguf_set_kv(gguf_ctx_2, gguf_ctx_0);
+
+ printf("%s: same_n_kv: ", __func__);
+ if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_2)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_kv_0_in_1: ", __func__);
+ if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_kv_0_in_2: ", __func__);
+ if (all_kv_in_other(gguf_ctx_0, gguf_ctx_2)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ gguf_set_kv(gguf_ctx_0, gguf_ctx_1);
+
+ printf("%s: same_n_kv_after_double_copy: ", __func__);
+ if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ printf("%s: all_kv_1_in_0_after_double_copy: ", __func__);
+ if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) {
+ printf("\033[1;32mOK\033[0m\n");
+ npass++;
+ } else {
+ printf("\033[1;31mFAIL\033[0m\n");
+ }
+ ntest++;
+
+ ggml_backend_buffer_free(bbuf_0);
+ ggml_backend_buffer_free(bbuf_1);
+ ggml_free(ctx_0);
+ ggml_free(ctx_1);
+ gguf_free(gguf_ctx_0);
+ gguf_free(gguf_ctx_1);
+ gguf_free(gguf_ctx_2);
+ ggml_backend_free(backend);
+
+ printf("\n");
+ return std::make_pair(npass, ntest);
+}
+
+static void print_usage() {
+ printf("usage: test-gguf [seed]\n");
+ printf(" if no seed is unspecified then a random seed is used\n");
+}
+
+int main(int argc, char ** argv) {
+ if (argc > 2) {
+ print_usage();
+ return 1;
+ }
+
+ std::random_device rd;
+ const unsigned int seed = argc < 2 ? rd() : std::stoi(argv[1]);
+
+ // Initialize ggml backends early so the prints aren't interleaved with the test results:
+ ggml_backend_dev_count();
+ fprintf(stderr, "\n");
+
+ int npass = 0;
+ int ntest = 0;
+ {
+ std::pair<int, int> result = test_handcrafted_file(seed);
+ npass += result.first;
+ ntest += result.second;
+ }
+
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+
+ for (bool only_meta : {true, false}) {
+ std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
+ npass += result.first;
+ ntest += result.second;
+ }
+
+ {
+ std::pair<int, int> result = test_gguf_set_kv(dev, seed);
+ npass += result.first;
+ ntest += result.second;
+ }
+ }
+
+ printf("%d/%d tests passed\n", npass, ntest);
+ if (npass != ntest) {
+ printf("\033[1;31mFAIL\033[0m\n");
+ return 1;
+ }
+ printf("\033[1;32mOK\033[0m\n");
+ return 0;
+}
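
The new test-gguf.cpp exercises the GGUF reader both with handcrafted (mostly malformed) files and with random write/read round trips. For orientation, a much smaller sketch of the key/value API the test relies on (assuming, as the test does, that the gguf_* declarations come from ggml.h; keys and values here are arbitrary):

#include "ggml.h"

#include <cstdio>

int main() {
    struct gguf_context * ctx = gguf_init_empty();

    // write a couple of key/value pairs
    gguf_set_val_u32(ctx, "general.alignment", 32);
    gguf_set_val_str(ctx, "general.name", "demo");

    // read one of them back by key
    const int id = gguf_find_key(ctx, "general.alignment");
    if (id >= 0) {
        printf("alignment = %u\n", gguf_get_val_u32(ctx, id));
    }
    printf("n_kv = %d\n", (int) gguf_get_n_kv(ctx));

    gguf_free(ctx);
    return 0;
}
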
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 4656b30f0..322b8bb99 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
struct ggml_tensor * x;
// rope f32
- for (int m = 0; m < 3; ++m) {
+ for (int m = 0; m < 5; ++m) {
const int ndims = 4;
const int64_t n_rot = 128;
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
const int n_past_0 = 100;
const int n_past_2 = 33;
- struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
- struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
- struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-
- for (int i = 0; i < ne[2]; ++i) {
- ((int32_t *) p0->data)[i] = n_past_0 + i;
- ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
- ((int32_t *) p2->data)[i] = n_past_2 + i;
- }
-
- // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
- const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
-
+ struct ggml_tensor * r0;
+ struct ggml_tensor * r1;
+ struct ggml_tensor * r2;
x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+ int mode = -1;
- // 100, 101, 102, ..., 172
- struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
- // -67, -67, -67, ..., -67
- struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+ if (m < 3) {
+ struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+ struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+ struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
- // 33, 34, 35, ..., 105
- struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+ for (int i = 0; i < ne[2]; ++i) {
+ ((int32_t *) p0->data)[i] = n_past_0 + i;
+ ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+ ((int32_t *) p2->data)[i] = n_past_2 + i;
+ }
+ // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+ mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+
+ // 100, 101, 102, ..., 172
+ r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
+ // -67, -67, -67, ..., -67
+ r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+ // 33, 34, 35, ..., 105
+ r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+ } else {
+ // testing multi-dimension rope position embedding mode
+ struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+ struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+ struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+ int sections[4] = {16, 24, 24, 0};
+ mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+
+ for (int i = 0; i < ne[2]; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+ ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+ ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+ }
+ }
+
+ // [[100, 101, 102, ..., 172],
+ // [101, 102, 103, ..., 173],
+ // [102, 103, 104, ..., 174]]
+ r0 = ggml_rope_multi(
+ ctx0, x, p0, nullptr,
+ n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+ // [[-67, -67, -67, ..., -67]
+ // [-67, -67, -67, ..., -67]
+ // [-67, -67, -67, ..., -67]]
+ r1 = ggml_rope_multi(
+ ctx0, r0, p1, nullptr,
+ n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+ // [[33, 34, 35, ..., 105]
+ // [34, 35, 36, ..., 106]
+ // [35, 36, 37, ..., 107]]
+ r2 = ggml_rope_multi(
+ ctx0, x, p2, nullptr,
+ n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+ }
ggml_cgraph * gf = ggml_new_graph(ctx0);
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index e5c9e75e4..c0dcb4848 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -145,7 +145,7 @@ static void test_penalties(
sampler_tester tester(probs, probs_expected);
const size_t n_vocab = probs.size();
- auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
+ auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
for (size_t i = 0; i < last_tokens.size(); i++) {
llama_sampler_accept(sampler, last_tokens[i]);