Merge branch 'master' into xsn/server_pytest

2024-11-26 11:39:31 +01:00 · 2024-11-26 11:39:31 +01:00 · 3a504ae88e
commit 3a504ae88e
parent 58cbcd2371 7066b4cce2
125 changed files with 4927 additions and 2273 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,161 @@
+---
+Language:        Cpp
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: Left
+AlignConsecutiveAssignments: AcrossComments
+AlignConsecutiveBitFields: AcrossComments
+AlignConsecutiveDeclarations: AcrossComments
+AlignConsecutiveMacros: AcrossComments
+# AlignConsecutiveShortCaseStatements: AcrossComments
+AlignEscapedNewlines: Left # LeftWithLastLine
+AlignOperands:   Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 1
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: true
+BinPackArguments: true
+BinPackParameters: true # OnePerLine
+BitFieldColonSpacing: Both
+BreakBeforeBraces: Custom # Attach
+BraceWrapping:
+  AfterCaseLabel:  true
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+# BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Never
+BreakBeforeBinaryOperators: None
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: false
+# BreakBinaryOperations: Never
+BreakConstructorInitializers: AfterColon
+# BreakFunctionDefinitionParameters: false
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+# BreakTemplateDeclarations: Yes
+ColumnLimit:     120
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineBeforeAccessModifier: Leave
+EmptyLineAfterAccessModifier: Never
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+InsertBraces:    true # NOTE: may lead to incorrect formatting
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: CurrentLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Middle
+QualifierAlignment: Left
+#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+ReferenceAlignment: Middle
+ReflowComments:  false # IndentOnly
+SeparateDefinitionBlocks: Always
+SortIncludes:    CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        c++17
+TabWidth:        4
+UseTab:          Never
+WhitespaceSensitiveMacros: ['STRINGIZE']
+...
+
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@ -1,50 +0,0 @@
-name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
-title: "Bug: "
-labels: ["bug-unconfirmed", "low severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@ -0,0 +1,77 @@
+name: Bug (compilation)
+description: Something goes wrong when trying to compile llama.cpp.
+title: "Compile bug: "
+labels: ["bug-unconfirmed", "compilation"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for bug reports where the compilation of llama.cpp fails.
+        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
+        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
+        by clearing `~/.cache/ccache` (on Linux).
+  - type: textarea
+    id: commit
+    attributes:
+      label: Git commit
+      description: Which commit are you trying to compile?
+      placeholder: |
+        $git rev-parse HEAD
+        84a07a17b1b08cf2b9747c633a2372782848a27f
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        multiple: true
+    validations:
+      required: true
+  - type: textarea
+    id: info
+    attributes:
+      label: Problem description & steps to reproduce
+      description: >
+        Please give us a summary of the problem and tell us how to reproduce it.
+        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
+      placeholder: >
+        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
+        Here are the exact commands that I used: ...
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@ -0,0 +1,101 @@
+name: Bug (model use)
+description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
+title: "Eval bug: "
+labels: ["bug-unconfirmed", "model evaluation"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for bug reports where the model evaluation results
+        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
+        If you encountered the issue while using an external UI (e.g. ollama),
+        please reproduce your issue using one of the examples/binaries in this repository.
+        The `llama-cli` binary can be used for simple and reproducible model inference.
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        multiple: true
+    validations:
+      required: true
+  - type: textarea
+    id: hardware
+    attributes:
+      label: Hardware
+      description: Which CPUs/GPUs are you using?
+      placeholder: >
+        e.g. Ryzen 5950X + 2x RTX 4090
+    validations:
+      required: true
+  - type: textarea
+    id: model
+    attributes:
+      label: Models
+      description: >
+        Which model(s) at which quantization were you using when encountering the bug?
+        If you downloaded a GGUF file off of Huggingface, please provide a link.
+      placeholder: >
+        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
+    validations:
+      required: false
+  - type: textarea
+    id: info
+    attributes:
+      label: Problem description & steps to reproduce
+      description: >
+        Please give us a summary of the problem and tell us how to reproduce it.
+        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
+        that information would be very much appreciated by us.
+      placeholder: >
+        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
+        When I use -ngl 0 it works correctly.
+        Here are the exact commands that I used: ...
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@ -0,0 +1,81 @@
+name: Bug (misc.)
+description: Something is not working the way it should (and it's not covered by any of the above cases).
+title: "Misc. bug: "
+labels: ["bug-unconfirmed"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for miscellaneous bugs that don't fit into any other category.
+        If you encountered the issue while using an external UI (e.g. ollama),
+        please reproduce your issue using one of the examples/binaries in this repository.
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which version of our software is affected? (You can use `--version` to get a version string.)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: dropdown
+    id: module
+    attributes:
+      label: Which llama.cpp modules do you know to be affected?
+      multiple: true
+      options:
+        - Documentation/Github
+        - libllama (core library)
+        - llama-cli
+        - llama-server
+        - llama-bench
+        - llama-quantize
+        - Python/Bash scripts
+        - Test code
+        - Other (Please specify in the next section)
+    validations:
+      required: false
+  - type: textarea
+    id: info
+    attributes:
+      label: Problem description & steps to reproduce
+      description: >
+        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@ -1,50 +0,0 @@
-name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
-title: "Bug: "
-labels: ["bug-unconfirmed", "medium severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@ -1,5 +1,5 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for llama.cpp.
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@ -1,50 +0,0 @@
-name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
-title: "Bug: "
-labels: ["bug-unconfirmed", "high severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@ -1,5 +1,5 @@
 name: Research
-description: Track new technical research area
+description: Track new technical research area.
 title: "Research: "
 labels: ["research 🔬"]
 body:
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@ -1,50 +0,0 @@
-name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
-title: "Bug: "
-labels: ["bug-unconfirmed", "critical severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@ -1,5 +1,5 @@
 name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
+description: Used to track refactoring opportunities.
 title: "Refactor: "
 labels: ["refactor"]
 body:
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -952,7 +952,7 @@ jobs:

    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
      - name: Clone
@ -962,7 +962,8 @@ jobs:
          fetch-depth: 0

      - name: Install
-        run:  scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

      - name: Build
        id: cmake_build
@ -981,26 +982,34 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

-      - name: Pack artifacts
+      - name: Build the release package
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
        run: |
          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin

-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -10,12 +10,10 @@
 name: Publish Docker image

 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -29,7 +27,6 @@ permissions:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false

    runs-on: ubuntu-latest
    env:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -163,8 +163,11 @@ if (GGML_TARGET_DEFINES)
    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 configure_package_config_file(
--- a/18
+++ b/18
@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
+	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@ -251,7 +252,7 @@ endif
 #

 # keep standard at C11 and C++11
-MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
@ -290,6 +291,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	MK_CPPFLAGS += -D_GNU_SOURCE
+	MK_LDFLAGS  += -ldl
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@ -730,10 +732,10 @@ GLSLC_CMD  = glslc
 _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
 _ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
 _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
-_ggml_vk_input_dir = ggml/src/vulkan-shaders
+_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
 _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)

-ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
 	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@

 $(_ggml_vk_header): $(_ggml_vk_source)
@ -745,8 +747,8 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
 		--target-hpp $(_ggml_vk_header) \
 		--target-cpp $(_ggml_vk_source)

-vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
-	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

 endif # GGML_VULKAN

@ -966,6 +968,7 @@ OBJ_COMMON = \
 	$(DIR_COMMON)/console.o \
 	$(DIR_COMMON)/ngram-cache.o \
 	$(DIR_COMMON)/sampling.o \
+	$(DIR_COMMON)/speculative.o \
 	$(DIR_COMMON)/build-info.o \
 	$(DIR_COMMON)/json-schema-to-grammar.o

@ -1165,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/Package.swift
+++ b/Package.swift
@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
    contentsOf: [
        .define("GGML_USE_ACCELERATE"),
-        .define("GGML_USE_METAL")
+        .define("GGML_USE_METAL"),
+        .define("GGML_USE_CPU")
    ]
 )
 #endif
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@ -3,12 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

+set(GGML_STATIC @GGML_STATIC@)
+set(GGML_NATIVE @GGML_NATIVE@)
+set(GGML_LTO    @GGML_LTO@)
+set(GGML_CCACHE @GGML_CCACHE@)
+set(GGML_AVX    @GGML_AVX@)
+set(GGML_AVX2   @GGML_AVX2@)
+set(GGML_AVX512 @GGML_AVX512@)
+set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
+set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
+set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
+set(GGML_AMX_TILE @GGML_AMX_TILE@)
+set(GGML_AMX_INT8 @GGML_AMX_INT8@)
+set(GGML_AMX_BF16 @GGML_AMX_BF16@)
+set(GGML_FMA  @GGML_FMA@)
+set(GGML_LASX @GGML_LASX@)
+set(GGML_LSX  @GGML_LSX@)
+set(GGML_RVV  @GGML_RVV@)
+set(GGML_SVE  @GGML_SVE@)
+
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
+set(GGML_OPENMP  @GGML_OPENMP@)
+set(GGML_CPU_HBM @GGML_CPU_HBM@)
+set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
+
+set(GGML_CUDA_FORCE_MMQ    @GGML_CUDA_FORCE_MMQ@)
+set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
+set(GGML_CUDA_F16          @GGML_CUDA_F16@)
+set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
+set(GGML_CUDA_NO_PEER_COPY  @GGML_CUDA_NO_PEER_COPY@)
+set(GGML_CUDA_NO_VMM        @GGML_CUDA_NO_VMM@)
+set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
+set(GGML_CUDA_GRAPHS        @GGML_CUDA_GRAPHS@)
+
+set(GGML_HIP_UMA @GGML_HIP_UMA@)
+
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
-set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
-set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
-set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
-set(GGML_OPENMP @GGML_OPENMP@)
+set(GGML_VULKAN_DEBUG         @GGML_VULKAN_DEBUG@)
+set(GGML_VULKAN_MEMORY_DEBUG  @GGML_VULKAN_MEMORY_DEBUG@)
+set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
+set(GGML_VULKAN_PERF      @GGML_VULKAN_PERF@)
+set(GGML_VULKAN_VALIDATE  @GGML_VULKAN_VALIDATE@)
+set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
+
+set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
+set(GGML_METAL_NDEBUG   @GGML_METAL_NDEBUG@)
+set(GGML_METAL_SHADER_DEBUG  @GGML_METAL_SHADER_DEBUG@)
+set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
+set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
+set(GGML_METAL_STD @GGML_METAL_STD@)
+
+set(GGML_SYCL_F16    @GGML_SYCL_F16@)
+set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
+set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
+

@PACKAGE_INIT@

@ -20,6 +68,7 @@ find_package(Threads REQUIRED)

 set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
 set(_llama_link_deps "")
+set(_llama_link_opts "")
 foreach(_ggml_lib ggml ggml-base)
    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
    find_library(${_ggml_lib_var} ${_ggml_lib}
@ -49,41 +98,63 @@ foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
    endif()
 endforeach()

-if (APPLE AND GGML_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
-endif()
+if (NOT LLAMA_SHARED_LIB)
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+        list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
+    endif()

-if (GGML_BLAS)
-    find_package(BLAS REQUIRED)
-endif()
+    if (GGML_OPENMP)
+        find_package(OpenMP REQUIRED)
+        list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()

-if (GGML_CUDA)
-    find_package(CUDAToolkit REQUIRED)
-endif()
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+        list(APPEND _llama_link_deps memkind)
+    endif()

-if (GGML_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-endif()
+    if (GGML_BLAS)
+        find_package(BLAS REQUIRED)
+        list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
+        list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
+    endif()

-if (GGML_VULKAN)
-    find_package(Vulkan REQUIRED)
-endif()
+    if (GGML_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    endif()

-if (GGML_HIP)
-    find_package(hip REQUIRED)
-    find_package(hipblas REQUIRED)
-    find_package(rocblas REQUIRED)
-endif()
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+        find_library(METAL_FRAMEWORK    Metal REQUIRED)
+        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+        list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
+                                     ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()

-if (GGML_SYCL)
-    find_package(IntelSYCL REQUIRED)
-    find_package(MKL REQUIRED)
-endif()
+    if (GGML_VULKAN)
+        find_package(Vulkan REQUIRED)
+        list(APPEND _llama_link_deps Vulkan::Vulkan)
+    endif()

-if (GGML_OPENMP)
-    find_package(OpenMP REQUIRED)
+    if (GGML_HIP)
+        find_package(hip     REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(rocblas REQUIRED)
+        list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND _llama_link_deps DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_package(IntelSYCL REQUIRED)
+            find_package(MKL       REQUIRED)
+            list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
 endif()

 find_library(llama_LIBRARY llama
@ -97,6 +168,7 @@ set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+        INTERFACE_LINK_OPTIONS   "${_llama_link_opts}"
        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -66,6 +66,8 @@ add_library(${TARGET} STATIC
    ngram-cache.h
    sampling.cpp
    sampling.h
+    speculative.cpp
+    speculative.h
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        }
    }

-    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams,       nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
+    postprocess_cpu_params(params.speculative.cpuparams,       &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & antiprompt : params.antiprompt) {
            string_process_escapes(antiprompt);
        }
-        for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
    }
@ -297,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
    print_options(specific_options);
 }

+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params
@ -323,13 +345,16 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
    common_params_context ctx_arg(params);
    ctx_arg.print_usage = print_usage;
    ctx_arg.ex          = ex;

    std::string sampler_type_chars;
    std::string sampler_type_names;
-    for (const auto & sampler : params.sparams.samplers) {
+    for (const auto & sampler : params.sampling.samplers) {
        sampler_type_chars += common_sampler_type_to_chr(sampler);
        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
    }
@ -407,26 +432,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ));
-    add_opt(common_arg(
-        {"-td", "--threads-draft"}, "N",
-        "number of threads to use during generation (default: same as --threads)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.n_threads = value;
-            if (params.draft_cpuparams.n_threads <= 0) {
-                params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-tbd", "--threads-batch-draft"}, "N",
-        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.n_threads = value;
-            if (params.draft_cpuparams_batch.n_threads <= 0) {
-                params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-C", "--cpu-mask"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
@ -515,108 +520,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.cpuparams_batch.poll = value;
        }
    ));
-    add_opt(common_arg(
-        {"-Cd", "--cpu-mask-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crd", "--cpu-range-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid range");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-batch-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-batch-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-batch-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: --poll-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--draft"}, "N",
-        string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
-        [](common_params & params, int value) {
-            params.n_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(common_arg(
-        {"-ps", "--p-split"}, "N",
-        string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
-        [](common_params & params, const std::string & value) {
-            params.p_split = std::stof(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-lcs", "--lookup-cache-static"}, "FNAME",
        "path to static lookup cache to use for lookup decoding (not updated by generation)",
@ -701,7 +604,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
        [](common_params & params) {
            params.no_perf = true;
-            params.sparams.no_perf = true;
+            params.sampling.no_perf = true;
        }
    ).set_env("LLAMA_ARG_NO_PERF"));
    add_opt(common_arg(
@ -883,155 +786,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
        [](common_params & params, const std::string & value) {
            const auto sampler_names = string_split<std::string>(value, ';');
-            params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"-s", "--seed"}, "SEED",
-        string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
        [](common_params & params, const std::string & value) {
-            params.sparams.seed = std::stoul(value);
+            params.sampling.seed = std::stoul(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--sampling-seq"}, "SEQUENCE",
        string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
        [](common_params & params, const std::string & value) {
-            params.sparams.samplers = common_sampler_types_from_chars(value);
+            params.sampling.samplers = common_sampler_types_from_chars(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--ignore-eos"},
        "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
        [](common_params & params) {
-            params.sparams.ignore_eos = true;
+            params.sampling.ignore_eos = true;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
+        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
        [](common_params & params) {
-            params.sparams.penalize_nl = true;
+            params.sampling.penalize_nl = true;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--temp"}, "N",
-        string_format("temperature (default: %.1f)", (double)params.sparams.temp),
+        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
        [](common_params & params, const std::string & value) {
-            params.sparams.temp = std::stof(value);
-            params.sparams.temp = std::max(params.sparams.temp, 0.0f);
+            params.sampling.temp = std::stof(value);
+            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--top-k"}, "N",
-        string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
        [](common_params & params, int value) {
-            params.sparams.top_k = value;
+            params.sampling.top_k = value;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--top-p"}, "N",
-        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
        [](common_params & params, const std::string & value) {
-            params.sparams.top_p = std::stof(value);
+            params.sampling.top_p = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--min-p"}, "N",
-        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
        [](common_params & params, const std::string & value) {
-            params.sparams.min_p = std::stof(value);
+            params.sampling.min_p = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
-        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
        [](common_params & params, const std::string & value) {
-            params.sparams.xtc_probability = std::stof(value);
+            params.sampling.xtc_probability = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--xtc-threshold"}, "N",
-        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
        [](common_params & params, const std::string & value) {
-            params.sparams.xtc_threshold = std::stof(value);
+            params.sampling.xtc_threshold = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--typical"}, "N",
-        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
        [](common_params & params, const std::string & value) {
-            params.sparams.typ_p = std::stof(value);
+            params.sampling.typ_p = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--repeat-last-n"}, "N",
-        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
        [](common_params & params, int value) {
-            params.sparams.penalty_last_n = value;
-            params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
+            params.sampling.penalty_last_n = value;
+            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--repeat-penalty"}, "N",
-        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
        [](common_params & params, const std::string & value) {
-            params.sparams.penalty_repeat = std::stof(value);
+            params.sampling.penalty_repeat = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--presence-penalty"}, "N",
-        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
        [](common_params & params, const std::string & value) {
-            params.sparams.penalty_present = std::stof(value);
+            params.sampling.penalty_present = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--frequency-penalty"}, "N",
-        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
        [](common_params & params, const std::string & value) {
-            params.sparams.penalty_freq = std::stof(value);
+            params.sampling.penalty_freq = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-multiplier"}, "N",
-        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
        [](common_params & params, const std::string & value) {
-            params.sparams.dry_multiplier = std::stof(value);
+            params.sampling.dry_multiplier = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-base"}, "N",
-        string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
        [](common_params & params, const std::string & value) {
            float potential_base = std::stof(value);
            if (potential_base >= 1.0f)
            {
-                params.sparams.dry_base = potential_base;
+                params.sampling.dry_base = potential_base;
            }
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-allowed-length"}, "N",
-        string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
        [](common_params & params, int value) {
-            params.sparams.dry_allowed_length = value;
+            params.sampling.dry_allowed_length = value;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-penalty-last-n"}, "N",
-        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
        [](common_params & params, int value) {
-            params.sparams.dry_penalty_last_n = value;
+            params.sampling.dry_penalty_last_n = value;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-sequence-breaker"}, "STRING",
        string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
-            params.sparams.dry_sequence_breakers.empty() ? "none" :
-            std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
-                params.sparams.dry_sequence_breakers.end(),
-                std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
                [](const std::string& a, const std::string& b) {
                    std::string formatted_b = (b == "\n") ? "\\n" : b;
                    return a + ", '" + formatted_b + "'";
@ -1040,51 +943,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            static bool defaults_cleared = false;

            if (!defaults_cleared) {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
                defaults_cleared = true;
            }

            if (value == "none") {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
            } else {
-                params.sparams.dry_sequence_breakers.emplace_back(value);
+                params.sampling.dry_sequence_breakers.emplace_back(value);
            }
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dynatemp-range"}, "N",
-        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
        [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_range = std::stof(value);
+            params.sampling.dynatemp_range = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dynatemp-exp"}, "N",
-        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
        [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_exponent = std::stof(value);
+            params.sampling.dynatemp_exponent = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--mirostat"}, "N",
        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
-        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
+        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
        [](common_params & params, int value) {
-            params.sparams.mirostat = value;
+            params.sampling.mirostat = value;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--mirostat-lr"}, "N",
-        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
        [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_eta = std::stof(value);
+            params.sampling.mirostat_eta = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--mirostat-ent"}, "N",
-        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
        [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_tau = std::stof(value);
+            params.sampling.mirostat_tau = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
@ -1100,7 +1003,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            try {
                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                    const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                    params.sparams.logit_bias.push_back({key, bias});
+                    params.sampling.logit_bias.push_back({key, bias});
                } else {
                    throw std::invalid_argument("invalid input format");
                }
@ -1111,9 +1014,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_sparam());
    add_opt(common_arg(
        {"--grammar"}, "GRAMMAR",
-        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
        [](common_params & params, const std::string & value) {
-            params.sparams.grammar = value;
+            params.sampling.grammar = value;
        }
    ).set_sparam());
    add_opt(common_arg(
@ -1127,7 +1030,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
-                std::back_inserter(params.sparams.grammar)
+                std::back_inserter(params.sampling.grammar)
            );
        }
    ).set_sparam());
@ -1135,7 +1038,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-j", "--json-schema"}, "SCHEMA",
        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
        [](common_params & params, const std::string & value) {
-            params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
        }
    ).set_sparam());
    add_opt(common_arg(
@ -1433,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        "number of layers to store in VRAM",
@ -1444,17 +1371,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
-    add_opt(common_arg(
-        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](common_params & params, int value) {
-            params.n_gpu_layers_draft = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-sm", "--split-mode"}, "{none,layer,row}",
        "how to split the model across multiple GPUs, one of:\n"
@ -1468,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            } else if (arg_next == "layer") {
                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
            } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                params.split_mode = LLAMA_SPLIT_MODE_ROW;
            } else {
                throw std::invalid_argument("invalid value");
@ -1593,13 +1505,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
-    add_opt(common_arg(
-        {"-md", "--model-draft"}, "FNAME",
-        "draft model for speculative decoding (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.model_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
@ -2037,5 +1942,176 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_LOG_TIMESTAMPS"));

+    // speculative parameters
+    add_opt(common_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.n_threads = value;
+            if (params.speculative.cpuparams.n_threads <= 0) {
+                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.n_threads = value;
+            if (params.speculative.cpuparams_batch.n_threads <= 0) {
+                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-batch-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: --poll-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-max", "--draft", "--draft-n"}, "N",
+        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+        [](common_params & params, int value) {
+            params.speculative.n_max = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-min", "--draft-n-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+        [](common_params & params, int value) {
+            params.speculative.n_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-p-split"}, "P",
+        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-p-min"}, "P",
+        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_min = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-cd", "--ctx-size-draft"}, "N",
+        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+        [](common_params & params, int value) {
+            params.speculative.n_ctx = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](common_params & params, int value) {
+            params.speculative.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-md", "--model-draft"}, "FNAME",
+        "draft model for speculative decoding (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+
    return ctx_arg;
 }
--- a/common/common.cpp
+++ b/common/common.cpp
@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
                    [](const unsigned char c) { return !std::isprint(c); }),
                detokenized.end());

-        buf << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
+        buf << "\n"          << std::to_string(i)
+            << ", token '"   << detokenized << "'"
+            << ", pos "      << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id "   << std::to_string(batch.seq_id[i][0])
+            << ", logits "   << std::to_string(batch.logits[i]);
    }

    buf << " ]";
@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) {
        common_lora_adapters_apply(lctx, iparams.lora_adapters);
    }

-    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sparams.ignore_eos = false;
+        params.sampling.ignore_eos = false;
    }

    if (params.warmup) {
@ -979,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
    }
 }

-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
    auto mparams = llama_model_default_params();

+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
@ -1490,6 +1493,66 @@ void common_batch_add(
    batch.n_tokens++;
 }

+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
+
 //
 // Vocab utils
 //
--- a/common/common.h
+++ b/common/common.h
@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
    struct llama_lora_adapter * adapter;
 };

+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@ -101,8 +103,8 @@ enum dimre_method {
    DIMRE_METHOD_MEAN,
 };

-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev             = 64;    // number of previous tokens to remember
@ -153,21 +155,30 @@ struct common_sampler_params {
    std::string print() const;
 };

+struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_ctx        =     0; // draft context size
+    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float   p_split      =  0.1f; // speculative decoding split probability
+    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding                          // NOLINT
+};
+
 struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =  4096; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
-    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            =     1; // number of parallel sequences to decode
    int32_t n_sequences           =     1; // number of sequences to decode
-    float   p_split               =  0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
@ -180,25 +191,29 @@ struct common_params {
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold

+    // offload params
+    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
+    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;

    std::string model                = ""; // model path                                                    // NOLINT
-    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
@ -451,7 +466,7 @@ struct common_init_result {

 struct common_init_result     common_init_from_params(common_params & params);

-struct llama_model_params     common_model_params_to_llama  (const common_params & params);
+struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

@ -461,7 +476,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

+//
 // Batch utils
+//

 void common_batch_clear(struct llama_batch & batch);

@ -472,6 +489,16 @@ void common_batch_add(
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);

+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -99,7 +99,7 @@ struct ring_buffer {
 };

 struct common_sampler {
-    common_sampler_params params;
+    common_params_sampling params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;
@ -125,7 +125,7 @@ struct common_sampler {
    }
 };

-std::string common_sampler_params::print() const {
+std::string common_params_sampling::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
    return std::string(result);
 }

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;
@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    return cur_p.data[cur_p.selected].id;
 }

+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@ -36,7 +36,7 @@ struct common_sampler;

 // llama_sampler API overloads

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

 void common_sampler_free(struct common_sampler * gsmpl);

@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 //
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

 // helpers
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@ -0,0 +1,270 @@
+#include "speculative.h"
+
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+
+#include <cstring>
+
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
+struct common_speculative {
+    struct llama_context * ctx;
+    struct common_sampler * smpl;
+
+    llama_batch batch;
+    llama_tokens prompt;
+};
+
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_dft) {
+    auto * result = new common_speculative {
+        /* .ctx    = */ ctx_dft,
+        /* .smpl   = */ nullptr,
+        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt = */ {},
+    };
+
+    // TODO: optimize or pass from outside?
+#if 0
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 40;
+        params.top_p = 0.9;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+            COMMON_SAMPLER_TYPE_TOP_P,
+            COMMON_SAMPLER_TYPE_INFILL,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#else
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 10;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#endif
+
+    return result;
+}
+
+void common_speculative_free(struct common_speculative * spec) {
+    common_sampler_free(spec->smpl);
+
+    llama_batch_free(spec->batch);
+
+    delete spec;
+}
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
+    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
+    const struct llama_model * model_dft = llama_get_model(ctx_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
+                     "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        return false;
+    }
+
+    if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
+        return false;
+    }
+
+    {
+        const int n_vocab_tgt = llama_n_vocab(model_tgt);
+        const int n_vocab_dft = llama_n_vocab(model_dft);
+
+        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
+                         "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return false;
+        }
+
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
+            const char * token_text_dft = llama_token_get_text(model_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                LOG_ERR("%s: draft model vocab must match target model to use speculation but "
+                             "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt_tgt,
+        llama_token id_last) {
+    auto & batch  = spec->batch;
+    auto & ctx    = spec->ctx;
+    auto & smpl   = spec->smpl;
+    auto & prompt = spec->prompt;
+
+    int reuse_i = 0;
+    int reuse_n = 0;
+
+    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+
+    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
+
+    // reuse as much as possible from the old draft context
+    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
+    for (int i = 0; i < (int) prompt.size(); ++i) {
+        int cur = 0;
+        while (i_start + cur < (int) prompt_tgt.size() &&
+               i       + cur < (int) prompt.size() &&
+               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+            cur++;
+        }
+
+        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
+            reuse_i = i;
+            reuse_n = cur;
+        }
+    }
+
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+
+    llama_tokens result;
+    result.reserve(params.n_draft);
+
+    if (reuse_n == 0) {
+        llama_kv_cache_clear(ctx);
+
+        prompt.clear();
+    } else {
+        // this happens when a previous draft has been discarded (for example, due to being too small), but the
+        // target model agreed with it. in this case, we simply pass back the previous results to save compute
+        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
+                result.push_back(prompt[i]);
+
+                if (params.n_draft <= (int) result.size()) {
+                    break;
+                }
+            }
+
+            return result;
+        }
+
+        if (reuse_i > 0) {
+            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+
+            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+        }
+
+        if (reuse_n < (int) prompt.size()) {
+            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+
+            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        }
+    }
+
+    // prepare a batch to evaluate any new tokens in the prompt
+    common_batch_clear(batch);
+
+    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
+        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
+        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+
+        prompt.push_back(prompt_tgt[i]);
+    }
+
+    // we should rarely end-up here during normal decoding
+    if (batch.n_tokens > 0) {
+        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+
+        llama_decode(ctx, batch);
+    }
+
+    const llama_pos n_past = prompt.size();
+
+    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
+
+    common_batch_clear(batch);
+    common_batch_add  (batch, id_last, n_past, { 0 }, true);
+
+    prompt.push_back(id_last);
+
+    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+
+    llama_decode(ctx, batch);
+
+    common_sampler_reset(smpl);
+
+    // sample n_draft tokens from the draft model
+    for (int i = 0; i < params.n_draft; ++i) {
+        common_batch_clear(batch);
+
+        common_sampler_sample(smpl, ctx, 0, true);
+
+        const auto * cur_p = common_sampler_get_candidates(smpl);
+
+        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+        }
+
+        // add drafted token for each sequence
+        const llama_token id = cur_p->data[0].id;
+
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
+        common_sampler_accept(smpl, id, true);
+
+        result.push_back(id);
+
+        if (params.n_draft <= (int) result.size()) {
+            break;
+        }
+
+        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+
+        // evaluate the drafted tokens on the draft model
+        llama_decode(ctx, batch);
+
+        prompt.push_back(id);
+    }
+
+    return result;
+}
--- a/common/speculative.h
+++ b/common/speculative.h
@ -0,0 +1,28 @@
+#pragma once
+
+#include "llama.h"
+#include "common.h"
+
+struct common_speculative;
+
+struct common_speculative_params {
+    int n_draft = 16;  // max drafted tokens
+    int n_reuse = 256;
+
+    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+};
+
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+
+void common_speculative_free(struct common_speculative * spec);
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+               struct common_speculative * spec,
+        struct common_speculative_params   params,
+                      const llama_tokens & prompt,
+                             llama_token   id_last);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -2707,7 +2707,7 @@ class XLMRobertaModel(BertModel):
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
        if precompiled_charsmap:
            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
@ -3040,9 +3040,9 @@ class OlmoModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("Olmo1124ForCausalLM")
-class Olmo1124Model(Model):
-    model_arch = gguf.MODEL_ARCH.OLMO_1124
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2


@Model.register("OlmoeForCausalLM")
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -34,9 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI.

 The following release is verified with good quality:

-|Commit ID|Tag|Release|Verified  Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+|Commit ID|Tag|Release|Verified  Platform| Update date|
+|-|-|-|-|-|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||


 ## News
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
    add_subdirectory(batched-bench)
    add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
    add_subdirectory(gbnf-validator)
    add_subdirectory(gguf-hash)
    add_subdirectory(gguf-split)
@ -27,28 +24,36 @@ else()
    add_subdirectory(imatrix)
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
-    add_subdirectory(llava)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(passkey)
    add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
    add_subdirectory(quantize)
    add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
    if (LLAMA_BUILD_SERVER)
-    add_subdirectory(server)
-    endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
+        add_subdirectory(server)
    endif()
    add_subdirectory(save-load-state)
+    add_subdirectory(run)
    add_subdirectory(simple)
    add_subdirectory(simple-chat)
    add_subdirectory(speculative)
+    add_subdirectory(speculative-simple)
    add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -68,10 +68,10 @@ int main(int argc, char ** argv) {

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

    if (ctx == NULL) {
        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -73,7 +73,7 @@ int main(int argc, char ** argv) {

    common_init();

-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;

    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    LOG("\n");

-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
    if (!smpl) {
        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm

    LOG_INF("\n");

-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
    return smpl;
 }

--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

    // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);

    // verification n-grams
    std::vector<ngram_data> ngrams_cur(G);
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@ -21,7 +21,7 @@ int main(int argc, char ** argv){

    common_init();

-    const int n_draft = params.n_draft;
+    const int n_draft = params.speculative.n_max;

    // init llama.cpp
    llama_backend_init();
@ -40,6 +40,7 @@ int main(int argc, char ** argv){
    common_ngram_cache ngram_cache_context;
    common_ngram_cache ngram_cache_dynamic;
    common_ngram_cache ngram_cache_static;
+
    int64_t t_draft_flat_us = 0;
    int64_t t_draft_us = 0;

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -22,7 +22,7 @@ int main(int argc, char ** argv){
    common_init();

    // max. number of additional tokens to draft if match is found
-    const int n_draft = params.n_draft;
+    const int n_draft = params.speculative.n_max;

    const bool dump_kv_cache = params.dump_kv_cache;

@ -102,7 +102,7 @@ int main(int argc, char ** argv){

    bool has_eos = false;

-    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);

    std::vector<llama_token> draft;

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -100,7 +100,7 @@ int main(int argc, char ** argv) {

    common_init();

-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;

    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
@ -165,6 +165,10 @@ int main(int argc, char ** argv) {

    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
    struct ggml_threadpool_params tpp_batch =
            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
    struct ggml_threadpool_params tpp =
@ -174,7 +178,7 @@ int main(int argc, char ** argv) {

    struct ggml_threadpool * threadpool_batch = NULL;
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
        if (!threadpool_batch) {
            LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
            return 1;
@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
        tpp.paused = true;
    }

-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
    if (!threadpool) {
        LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
        return 1;
@ -890,8 +894,8 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);

    return 0;
 }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.smpl = common_sampler_init(model, params.sparams);
+        client.smpl = common_sampler_init(model, params.sampling);
    }

    std::vector<llama_token> tokens_system;
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -282,8 +282,8 @@ int main(int argc, char ** argv) {
                return a.second > b.second;
            });

-            LOG("Top %d similar chunks:\n", params.sparams.top_k);
-            for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
+            LOG("Top %d similar chunks:\n", params.sampling.top_k);
+            for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
                LOG("similarity: %f\n", similarities[i].second);
--- a/examples/run/CMakeLists.txt
+++ b/examples/run/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET llama-run)
+add_executable(${TARGET} run.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/run/README.md
+++ b/examples/run/README.md
@ -0,0 +1,7 @@
+# llama.cpp/example/run
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
+
+```bash
+./llama-run Meta-Llama-3.1-8B-Instruct.gguf
+...
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -0,0 +1,409 @@
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
+#include <climits>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "llama-cpp.h"
+
+typedef std::unique_ptr<char[]> char_array_ptr;
+
+struct Argument {
+    std::string flag;
+    std::string help_text;
+};
+
+struct Options {
+    std::string model_path, prompt_non_interactive;
+    int ngl = 99;
+    int n_ctx = 2048;
+};
+
+class ArgumentParser {
+   public:
+    ArgumentParser(const char * program_name) : program_name(program_name) {}
+
+    void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
+        string_args[flag] = &var;
+        arguments.push_back({flag, help_text});
+    }
+
+    void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
+        int_args[flag] = &var;
+        arguments.push_back({flag, help_text});
+    }
+
+    int parse(int argc, const char ** argv) {
+        for (int i = 1; i < argc; ++i) {
+            std::string arg = argv[i];
+            if (string_args.count(arg)) {
+                if (i + 1 < argc) {
+                    *string_args[arg] = argv[++i];
+                } else {
+                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
+                    print_usage();
+                    return 1;
+                }
+            } else if (int_args.count(arg)) {
+                if (i + 1 < argc) {
+                    if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
+                        fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
+                        print_usage();
+                        return 1;
+                    }
+                } else {
+                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
+                    print_usage();
+                    return 1;
+                }
+            } else {
+                fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
+                print_usage();
+                return 1;
+            }
+        }
+
+        if (string_args["-m"]->empty()) {
+            fprintf(stderr, "error: -m is required\n");
+            print_usage();
+            return 1;
+        }
+
+        return 0;
+    }
+
+   private:
+    const char * program_name;
+    std::unordered_map<std::string, std::string *> string_args;
+    std::unordered_map<std::string, int *> int_args;
+    std::vector<Argument> arguments;
+
+    int parse_int_arg(const char * arg, int & value) {
+        char * end;
+        const long val = std::strtol(arg, &end, 10);
+        if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
+            value = static_cast<int>(val);
+            return 0;
+        }
+        return 1;
+    }
+
+    void print_usage() const {
+        printf("\nUsage:\n");
+        printf("  %s [OPTIONS]\n\n", program_name);
+        printf("Options:\n");
+        for (const auto & arg : arguments) {
+            printf("  %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
+        }
+
+        printf("\n");
+    }
+};
+
+class LlamaData {
+   public:
+    llama_model_ptr model;
+    llama_sampler_ptr sampler;
+    llama_context_ptr context;
+    std::vector<llama_chat_message> messages;
+
+    int init(const Options & opt) {
+        model = initialize_model(opt.model_path, opt.ngl);
+        if (!model) {
+            return 1;
+        }
+
+        context = initialize_context(model, opt.n_ctx);
+        if (!context) {
+            return 1;
+        }
+
+        sampler = initialize_sampler();
+        return 0;
+    }
+
+   private:
+    // Initializes the model and returns a unique pointer to it
+    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
+        llama_model_params model_params = llama_model_default_params();
+        model_params.n_gpu_layers = ngl;
+
+        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
+        if (!model) {
+            fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        }
+
+        return model;
+    }
+
+    // Initializes the context with the specified parameters
+    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
+        llama_context_params ctx_params = llama_context_default_params();
+        ctx_params.n_ctx = n_ctx;
+        ctx_params.n_batch = n_ctx;
+
+        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
+        if (!context) {
+            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+        }
+
+        return context;
+    }
+
+    // Initializes and configures the sampler
+    llama_sampler_ptr initialize_sampler() {
+        llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+        return sampler;
+    }
+};
+
+// Add a message to `messages` and store its content in `owned_content`
+static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
+                        std::vector<char_array_ptr> & owned_content) {
+    char_array_ptr content(new char[text.size() + 1]);
+    std::strcpy(content.get(), text.c_str());
+    llama_data.messages.push_back({role, content.get()});
+    owned_content.push_back(std::move(content));
+}
+
+// Function to apply the chat template and resize `formatted` if needed
+static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
+    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
+                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
+    if (result > static_cast<int>(formatted.size())) {
+        formatted.resize(result);
+        result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
+                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
+    }
+
+    return result;
+}
+
+// Function to tokenize the prompt
+static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
+                           std::vector<llama_token> & prompt_tokens) {
+    const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
+    prompt_tokens.resize(n_prompt_tokens);
+    if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
+                       true) < 0) {
+        GGML_ABORT("failed to tokenize the prompt\n");
+    }
+
+    return n_prompt_tokens;
+}
+
+// Check if we have enough space in the context to evaluate this batch
+static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
+    const int n_ctx = llama_n_ctx(ctx.get());
+    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+    if (n_ctx_used + batch.n_tokens > n_ctx) {
+        printf("\033[0m\n");
+        fprintf(stderr, "context size exceeded\n");
+        return 1;
+    }
+
+    return 0;
+}
+
+// convert the token to a string
+static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
+    char buf[256];
+    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
+    if (n < 0) {
+        GGML_ABORT("failed to convert token to piece\n");
+    }
+
+    piece = std::string(buf, n);
+    return 0;
+}
+
+static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
+    printf("%s", piece.c_str());
+    fflush(stdout);
+    response += piece;
+}
+
+// helper function to evaluate a prompt and generate a response
+static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+    std::vector<llama_token> prompt_tokens;
+    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
+    if (n_prompt_tokens < 0) {
+        return 1;
+    }
+
+    // prepare a batch for the prompt
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+    llama_token new_token_id;
+    while (true) {
+        check_context_size(llama_data.context, batch);
+        if (llama_decode(llama_data.context.get(), batch)) {
+            GGML_ABORT("failed to decode\n");
+        }
+
+        // sample the next token, check is it an end of generation?
+        new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
+        if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
+            break;
+        }
+
+        std::string piece;
+        if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
+            return 1;
+        }
+
+        print_word_and_concatenate_to_response(piece, response);
+
+        // prepare the next batch with the sampled token
+        batch = llama_batch_get_one(&new_token_id, 1);
+    }
+
+    return 0;
+}
+
+static int parse_arguments(const int argc, const char ** argv, Options & opt) {
+    ArgumentParser parser(argv[0]);
+    parser.add_argument("-m", opt.model_path, "model");
+    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
+    parser.add_argument("-c", opt.n_ctx, "context_size");
+    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
+    if (parser.parse(argc, argv)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+static int read_user_input(std::string & user) {
+    std::getline(std::cin, user);
+    return user.empty();  // Indicate an error or empty input
+}
+
+// Function to generate a response based on the prompt
+static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+    // Set response color
+    printf("\033[33m");
+    if (generate(llama_data, prompt, response)) {
+        fprintf(stderr, "failed to generate response\n");
+        return 1;
+    }
+
+    // End response with color reset and newline
+    printf("\n\033[0m");
+    return 0;
+}
+
+// Helper function to apply the chat template and handle errors
+static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
+                                                   const bool is_user_input, int & output_length) {
+    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
+    if (new_len < 0) {
+        fprintf(stderr, "failed to apply the chat template\n");
+        return -1;
+    }
+
+    output_length = new_len;
+    return 0;
+}
+
+// Helper function to handle user input
+static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
+    if (!prompt_non_interactive.empty()) {
+        user_input = prompt_non_interactive;
+        return true;  // No need for interactive input
+    }
+
+    printf("\033[32m> \033[0m");
+    return !read_user_input(user_input);  // Returns false if input ends the loop
+}
+
+// Function to tokenize the prompt
+static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
+    std::vector<char_array_ptr> owned_content;
+    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
+    int prev_len = 0;
+
+    while (true) {
+        // Get user input
+        std::string user_input;
+        if (!handle_user_input(user_input, prompt_non_interactive)) {
+            break;
+        }
+
+        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
+                    owned_content);
+
+        int new_len;
+        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
+            return 1;
+        }
+
+        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
+        std::string response;
+        if (generate_response(llama_data, prompt, response)) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static void log_callback(const enum ggml_log_level level, const char * text, void *) {
+    if (level == GGML_LOG_LEVEL_ERROR) {
+        fprintf(stderr, "%s", text);
+    }
+}
+
+static bool is_stdin_a_terminal() {
+#if defined(_WIN32)
+    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
+    DWORD mode;
+    return GetConsoleMode(hStdin, &mode);
+#else
+    return isatty(STDIN_FILENO);
+#endif
+}
+
+static std::string read_pipe_data() {
+    std::ostringstream result;
+    result << std::cin.rdbuf();  // Read all data from std::cin
+    return result.str();
+}
+
+int main(int argc, const char ** argv) {
+    Options opt;
+    if (parse_arguments(argc, argv, opt)) {
+        return 1;
+    }
+
+    if (!is_stdin_a_terminal()) {
+        if (!opt.prompt_non_interactive.empty()) {
+            opt.prompt_non_interactive += "\n\n";
+        }
+
+        opt.prompt_non_interactive += read_pipe_data();
+    }
+
+    llama_log_set(log_callback, nullptr);
+    LlamaData llama_data;
+    if (llama_data.init(opt)) {
+        return 1;
+    }
+
+    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
+        return 1;
+    }
+
+    return 0;
+}
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -9,7 +9,7 @@ int main(int argc, char ** argv) {
    common_params params;

    params.prompt = "The quick brown fox";
-    params.sparams.seed = 1234;
+    params.sampling.seed = 1234;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
@ -42,7 +42,7 @@ int main(int argc, char ** argv) {

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));

    // tokenize prompt
    auto tokens = common_tokenize(ctx, params.prompt, true);
@ -106,7 +106,7 @@ int main(int argc, char ** argv) {

    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);

-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));

    printf("\nsecond run: %s", params.prompt.c_str());

@ -169,7 +169,7 @@ int main(int argc, char ** argv) {

    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);

-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));

    printf("\nsingle seq run: %s", params.prompt.c_str());

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -412,7 +412,7 @@ node index.js

    `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`

-    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
+    `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`

    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -81,7 +81,13 @@
              <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
            </svg>
          </button>
-
+          <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
+              <!-- download conversation button -->
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
+                  <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
+                  <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
+            </svg>
+          </button>
          <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
            <!-- edit config button -->
            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
@ -526,6 +532,23 @@
            this.fetchMessages();
          }
        },
+        downloadConv(convId) {
+          const conversation = StorageUtils.getOneConversation(convId);
+          if (!conversation) {
+            alert('Conversation not found.');
+            return;
+          }
+          const conversationJson = JSON.stringify(conversation, null, 2);
+          const blob = new Blob([conversationJson], { type: 'application/json' });
+          const url = URL.createObjectURL(blob);
+          const a = document.createElement('a');
+          a.href = url;
+          a.download = `conversation_${convId}.json`;
+          document.body.appendChild(a);
+          a.click();
+          document.body.removeChild(a);
+          URL.revokeObjectURL(url);
+        },
        async sendMessage() {
          if (!this.inputMsg) return;
          const currConvId = this.viewingConvId;
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2,10 +2,11 @@

 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"

 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@ -110,7 +111,7 @@ struct server_static_file {

 struct slot_params {
    bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@ -121,12 +122,21 @@ struct slot_params {
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

    std::vector<std::string> antiprompt;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 };

 struct server_slot {
    int id;
    int id_task = -1;

+    llama_batch batch_spec;
+
+    llama_context * ctx_dft = nullptr;
+
+    common_speculative * spec = nullptr;
+
    // the index relative to completion multi-task request
    size_t index = 0;

@ -175,7 +185,6 @@ struct server_slot {
    // sampling
    json json_schema;

-    struct common_sampler_params sparams;
    struct common_sampler * smpl = nullptr;

    llama_token sampled;
@ -212,7 +221,7 @@ struct server_slot {
        generated_token_probs.clear();
    }

-    bool has_budget(common_params &global_params) {
+    bool has_budget(const common_params & global_params) {
        if (params.n_predict == -1 && global_params.n_predict == -1) {
            return true; // limitless
        }
@ -232,6 +241,10 @@ struct server_slot {
        return state != SLOT_STATE_IDLE;
    }

+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
    void add_token(const completion_token_output & token) {
        if (!is_processing()) {
            SLT_WRN(*this, "%s", "slot is not processing\n");
@ -591,11 +604,14 @@ struct server_response {
 };

 struct server_context {
+    common_params params_base;
+
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    std::vector<common_lora_adapter_container> loras;

-    common_params params;
+    llama_model * model_dft = nullptr;
+    llama_context_params cparams_dft;

    llama_batch batch = {};

@ -628,27 +644,41 @@ struct server_context {
            model = nullptr;
        }

+        if (model_dft) {
+            llama_free_model(model_dft);
+            model_dft = nullptr;
+        }
+
        // Clear any sampling context
        for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                common_sampler_free(slot.smpl);
-            }
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
+
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
+
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
+
+            llama_batch_free(slot.batch_spec);
        }

        llama_batch_free(batch);
    }

-    bool load_model(const common_params & params_) {
-        params = params_;
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());

-        common_init_result llama_init = common_init_from_params(params);
+        params_base = params;
+
+        common_init_result llama_init = common_init_from_params(params_base);

        model = llama_init.model;
        ctx   = llama_init.context;
        loras = llama_init.lora_adapters;

        if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
            return false;
        }

@ -657,6 +687,41 @@ struct server_context {
        add_bos_token = llama_add_bos_token(model);
        has_eos_token = !llama_add_eos_token(model);

+        if (!params_base.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+
+            auto params_dft = params_base;
+
+            params_dft.devices      = params_base.speculative.devices;
+            params_dft.model        = params_base.speculative.model;
+            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+
+            common_init_result llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+
+                llama_free      (llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
+                return false;
+            }
+
+            cparams_dft = common_context_params_to_llama(params_base);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+
+            // the context is not needed - we will create one for each slot
+            llama_free(llama_init_dft.context);
+        }
+
        return true;
    }

@ -674,20 +739,36 @@ struct server_context {
    }

    void init() {
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

-        SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);

-        for (int i = 0; i < params.n_parallel; i++) {
+        for (int i = 0; i < params_base.n_parallel; i++) {
            server_slot slot;

            slot.id = i;
            slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;
+
+            if (model_dft) {
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
+
+                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
+
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
+            }

            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

-            slot.sparams = params.sparams;
+            slot.params.sampling = params_base.sampling;

            slot.callback_on_release = [this](int) {
                queue_tasks.pop_deferred_task();
@ -707,7 +788,7 @@ struct server_context {
            const int32_t n_batch = llama_n_batch(ctx);

            // only a single seq_id per token is needed
-            batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
+            batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
        }

        metrics.init();
@ -743,7 +824,7 @@ struct server_context {
                }

                // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-                int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+                int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);

                // fraction of the common subsequence length compared to the current slot's prompt length
                float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@ -786,9 +867,11 @@ struct server_context {
    }

    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
-        slot_params default_params;
        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
+        slot_params defaults;
+        defaults.sampling    = params_base.sampling;
+        defaults.speculative = params_base.speculative;
+
        const auto & data = task.data;

        if (data.count("__oaicompat") != 0) {
@ -799,42 +882,48 @@ struct server_context {
            slot.oaicompat_model = "";
        }

-        slot.params.stream              = json_value(data, "stream",             false);
-        slot.params.cache_prompt        = json_value(data, "cache_prompt",       false);
-        slot.params.n_predict           = json_value(data, "n_predict",          json_value(data, "max_tokens", default_params.n_predict));
-        slot.params.n_indent            = json_value(data, "n_indent",           default_params.n_indent);
-        slot.sparams.top_k              = json_value(data, "top_k",              default_sparams.top_k);
-        slot.sparams.top_p              = json_value(data, "top_p",              default_sparams.top_p);
-        slot.sparams.min_p              = json_value(data, "min_p",              default_sparams.min_p);
-        slot.sparams.xtc_probability    = json_value(data, "xtc_probability",    default_sparams.xtc_probability);
-        slot.sparams.xtc_threshold      = json_value(data, "xtc_threshold",      default_sparams.xtc_threshold);
-        slot.sparams.typ_p              = json_value(data, "typical_p",          default_sparams.typ_p);
-        slot.sparams.temp               = json_value(data, "temperature",        default_sparams.temp);
-        slot.sparams.dynatemp_range     = json_value(data, "dynatemp_range",     default_sparams.dynatemp_range);
-        slot.sparams.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  default_sparams.dynatemp_exponent);
-        slot.sparams.penalty_last_n     = json_value(data, "repeat_last_n",      default_sparams.penalty_last_n);
-        slot.sparams.penalty_repeat     = json_value(data, "repeat_penalty",     default_sparams.penalty_repeat);
-        slot.sparams.penalty_freq       = json_value(data, "frequency_penalty",  default_sparams.penalty_freq);
-        slot.sparams.penalty_present    = json_value(data, "presence_penalty",   default_sparams.penalty_present);
-        slot.sparams.dry_multiplier     = json_value(data, "dry_multiplier",     default_sparams.dry_multiplier);
-        slot.sparams.dry_base           = json_value(data, "dry_base",           default_sparams.dry_base);
-        slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
-        slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
-        slot.sparams.mirostat           = json_value(data, "mirostat",           default_sparams.mirostat);
-        slot.sparams.mirostat_tau       = json_value(data, "mirostat_tau",       default_sparams.mirostat_tau);
-        slot.sparams.mirostat_eta       = json_value(data, "mirostat_eta",       default_sparams.mirostat_eta);
-        slot.sparams.penalize_nl        = json_value(data, "penalize_nl",        default_sparams.penalize_nl);
-        slot.params.n_keep              = json_value(data, "n_keep",             default_params.n_keep);
-        slot.params.n_discard           = json_value(data, "n_discard",          default_params.n_discard);
-        slot.sparams.seed               = json_value(data, "seed",               default_sparams.seed);
-        slot.sparams.n_probs            = json_value(data, "n_probs",            default_sparams.n_probs);
-        slot.sparams.min_keep           = json_value(data, "min_keep",           default_sparams.min_keep);
-      //slot.params.t_max_prompt_ms     = json_value(data, "t_max_prompt_ms",    default_params.t_max_prompt_ms); // TODO: implement
-        slot.params.t_max_predict_ms    = json_value(data, "t_max_predict_ms",   default_params.t_max_predict_ms);
+        slot.params.stream           = json_value(data, "stream",             false);
+        slot.params.cache_prompt     = json_value(data, "cache_prompt",       true);
+        slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
+        slot.params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
+        slot.params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);
+        slot.params.n_discard        = json_value(data, "n_discard",          defaults.n_discard);
+      //slot.params.t_max_prompt_ms  = json_value(data, "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
+        slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms",   defaults.t_max_predict_ms);

-        if (slot.sparams.dry_base < 1.0f)
-        {
-           slot.sparams.dry_base = default_sparams.dry_base;
+        slot.params.sampling.top_k              = json_value(data, "top_k",              defaults.sampling.top_k);
+        slot.params.sampling.top_p              = json_value(data, "top_p",              defaults.sampling.top_p);
+        slot.params.sampling.min_p              = json_value(data, "min_p",              defaults.sampling.min_p);
+        slot.params.sampling.xtc_probability    = json_value(data, "xtc_probability",    defaults.sampling.xtc_probability);
+        slot.params.sampling.xtc_threshold      = json_value(data, "xtc_threshold",      defaults.sampling.xtc_threshold);
+        slot.params.sampling.typ_p              = json_value(data, "typical_p",          defaults.sampling.typ_p);
+        slot.params.sampling.temp               = json_value(data, "temperature",        defaults.sampling.temp);
+        slot.params.sampling.dynatemp_range     = json_value(data, "dynatemp_range",     defaults.sampling.dynatemp_range);
+        slot.params.sampling.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  defaults.sampling.dynatemp_exponent);
+        slot.params.sampling.penalty_last_n     = json_value(data, "repeat_last_n",      defaults.sampling.penalty_last_n);
+        slot.params.sampling.penalty_repeat     = json_value(data, "repeat_penalty",     defaults.sampling.penalty_repeat);
+        slot.params.sampling.penalty_freq       = json_value(data, "frequency_penalty",  defaults.sampling.penalty_freq);
+        slot.params.sampling.penalty_present    = json_value(data, "presence_penalty",   defaults.sampling.penalty_present);
+        slot.params.sampling.dry_multiplier     = json_value(data, "dry_multiplier",     defaults.sampling.dry_multiplier);
+        slot.params.sampling.dry_base           = json_value(data, "dry_base",           defaults.sampling.dry_base);
+        slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
+        slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
+        slot.params.sampling.mirostat           = json_value(data, "mirostat",           defaults.sampling.mirostat);
+        slot.params.sampling.mirostat_tau       = json_value(data, "mirostat_tau",       defaults.sampling.mirostat_tau);
+        slot.params.sampling.mirostat_eta       = json_value(data, "mirostat_eta",       defaults.sampling.mirostat_eta);
+        slot.params.sampling.penalize_nl        = json_value(data, "penalize_nl",        defaults.sampling.penalize_nl);
+        slot.params.sampling.seed               = json_value(data, "seed",               defaults.sampling.seed);
+        slot.params.sampling.n_probs            = json_value(data, "n_probs",            defaults.sampling.n_probs);
+        slot.params.sampling.min_keep           = json_value(data, "min_keep",           defaults.sampling.min_keep);
+
+        slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
+        slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
+        slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
+
+        slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
+
+        if (slot.params.sampling.dry_base < 1.0f) {
+           slot.params.sampling.dry_base = defaults.sampling.dry_base;
        }

        // sequence breakers for DRY
@ -843,8 +932,8 @@ struct server_context {
            // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39

            if (data.contains("dry_sequence_breakers")) {
-                slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
-                if (slot.sparams.dry_sequence_breakers.empty()) {
+                slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+                if (slot.params.sampling.dry_sequence_breakers.empty()) {
                    send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
                    return false;
                }
@ -858,14 +947,14 @@ struct server_context {
        }
        if (data.contains("json_schema") && !data.contains("grammar")) {
            try {
-                auto schema          = json_value(data, "json_schema", json::object());
-                slot.sparams.grammar = json_schema_to_grammar(schema);
+                auto schema                  = json_value(data, "json_schema", json::object());
+                slot.params.sampling.grammar = json_schema_to_grammar(schema);
            } catch (const std::exception & e) {
                send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
        } else {
-            slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+            slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
        }

        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@ -875,10 +964,10 @@ struct server_context {
        }

        {
-            slot.sparams.logit_bias.clear();
+            slot.params.sampling.logit_bias.clear();

            if (json_value(data, "ignore_eos", false) && has_eos_token) {
-                slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+                slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
            }

            const auto & logit_bias = data.find("logit_bias");
@ -899,12 +988,12 @@ struct server_context {
                        if (el[0].is_number_integer()) {
                            llama_token tok = el[0].get<llama_token>();
                            if (tok >= 0 && tok < n_vocab) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.params.sampling.logit_bias.push_back({tok, bias});
                            }
                        } else if (el[0].is_string()) {
                            auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                            for (auto tok : toks) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.params.sampling.logit_bias.push_back({tok, bias});
                            }
                        }
                    }
@ -935,16 +1024,16 @@ struct server_context {
                            sampler_names.emplace_back(name);
                        }
                    }
-                    slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+                    slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
                } else if (samplers->is_string()){
                    std::string sampler_string;
                    for (const auto & name : *samplers) {
                        sampler_string += name;
                    }
-                    slot.sparams.samplers = common_sampler_types_from_chars(sampler_string);
+                    slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
                }
            } else {
-                slot.sparams.samplers = default_sparams.samplers;
+                slot.params.sampling.samplers = defaults.sampling.samplers;
            }
        }

@ -953,7 +1042,7 @@ struct server_context {
                common_sampler_free(slot.smpl);
            }

-            slot.smpl = common_sampler_init(model, slot.sparams);
+            slot.smpl = common_sampler_init(model, slot.params.sampling);
            if (slot.smpl == nullptr) {
                // for now, the only error that may happen here is invalid grammar
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@ -961,6 +1050,12 @@ struct server_context {
            }
        }

+        if (slot.ctx_dft) {
+            llama_batch_free(slot.batch_spec);
+
+            slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
+        }
+
        slot.state = SLOT_STATE_STARTED;

        SLT_INF(slot, "%s", "processing task\n");
@ -978,7 +1073,7 @@ struct server_context {

    bool process_token(completion_token_output & result, server_slot & slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
        slot.sampled = result.tok;

        // search stop word and delete it
@ -1043,7 +1138,7 @@ struct server_context {
        }

        // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
            slot.stopped_limit  = true;
            slot.has_next_token = false;

@ -1136,50 +1231,54 @@ struct server_context {

    json get_formated_generation(const server_slot & slot) const {
        std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers) {
+        samplers.reserve(slot.params.sampling.samplers.size());
+        for (const auto & sampler : slot.params.sampling.samplers) {
            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }

        return json {
            {"n_ctx",                     slot.n_ctx},
            {"n_predict",                 slot.n_predict},     // Server configured n_predict
-            {"model",                     params.model_alias},
-            {"seed",                      slot.sparams.seed},
+            {"model",                     params_base.model_alias},
+            {"seed",                      slot.params.sampling.seed},
            {"seed_cur",                  slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
-            {"temperature",               slot.sparams.temp},
-            {"dynatemp_range",            slot.sparams.dynatemp_range},
-            {"dynatemp_exponent",         slot.sparams.dynatemp_exponent},
-            {"top_k",                     slot.sparams.top_k},
-            {"top_p",                     slot.sparams.top_p},
-            {"min_p",                     slot.sparams.min_p},
-            {"xtc_probability",           slot.sparams.xtc_probability},
-            {"xtc_threshold",             slot.sparams.xtc_threshold},
-            {"typical_p",                 slot.sparams.typ_p},
-            {"repeat_last_n",             slot.sparams.penalty_last_n},
-            {"repeat_penalty",            slot.sparams.penalty_repeat},
-            {"presence_penalty",          slot.sparams.penalty_present},
-            {"frequency_penalty",         slot.sparams.penalty_freq},
-            {"dry_multiplier",            slot.sparams.dry_multiplier},
-            {"dry_base",                  slot.sparams.dry_base},
-            {"dry_allowed_length",        slot.sparams.dry_allowed_length},
-            {"dry_penalty_last_n",        slot.sparams.dry_penalty_last_n},
-            {"dry_sequence_breakers",     slot.sparams.dry_sequence_breakers},
-            {"mirostat",                  slot.sparams.mirostat},
-            {"mirostat_tau",              slot.sparams.mirostat_tau},
-            {"mirostat_eta",              slot.sparams.mirostat_eta},
-            {"penalize_nl",               slot.sparams.penalize_nl},
+            {"temperature",               slot.params.sampling.temp},
+            {"dynatemp_range",            slot.params.sampling.dynatemp_range},
+            {"dynatemp_exponent",         slot.params.sampling.dynatemp_exponent},
+            {"top_k",                     slot.params.sampling.top_k},
+            {"top_p",                     slot.params.sampling.top_p},
+            {"min_p",                     slot.params.sampling.min_p},
+            {"xtc_probability",           slot.params.sampling.xtc_probability},
+            {"xtc_threshold",             slot.params.sampling.xtc_threshold},
+            {"typical_p",                 slot.params.sampling.typ_p},
+            {"repeat_last_n",             slot.params.sampling.penalty_last_n},
+            {"repeat_penalty",            slot.params.sampling.penalty_repeat},
+            {"presence_penalty",          slot.params.sampling.penalty_present},
+            {"frequency_penalty",         slot.params.sampling.penalty_freq},
+            {"dry_multiplier",            slot.params.sampling.dry_multiplier},
+            {"dry_base",                  slot.params.sampling.dry_base},
+            {"dry_allowed_length",        slot.params.sampling.dry_allowed_length},
+            {"dry_penalty_last_n",        slot.params.sampling.dry_penalty_last_n},
+            {"dry_sequence_breakers",     slot.params.sampling.dry_sequence_breakers},
+            {"mirostat",                  slot.params.sampling.mirostat},
+            {"mirostat_tau",              slot.params.sampling.mirostat_tau},
+            {"mirostat_eta",              slot.params.sampling.mirostat_eta},
+            {"penalize_nl",               slot.params.sampling.penalize_nl},
            {"stop",                      slot.params.antiprompt},
            {"max_tokens",                slot.params.n_predict}, // User configured n_predict
            {"n_keep",                    slot.params.n_keep},
            {"n_discard",                 slot.params.n_discard},
-            {"ignore_eos",                slot.sparams.ignore_eos},
+            {"ignore_eos",                slot.params.sampling.ignore_eos},
            {"stream",                    slot.params.stream},
-          //{"logit_bias",                slot.sparams.logit_bias},
-            {"n_probs",                   slot.sparams.n_probs},
-            {"min_keep",                  slot.sparams.min_keep},
-            {"grammar",                   slot.sparams.grammar},
+          //{"logit_bias",                slot.params.sampling.logit_bias},
+            {"n_probs",                   slot.params.sampling.n_probs},
+            {"min_keep",                  slot.params.sampling.min_keep},
+            {"grammar",                   slot.params.sampling.grammar},
            {"samplers",                  samplers},
+            {"speculative",               slot.can_speculate()},
+            {"speculative.n_max",         slot.params.speculative.n_max},
+            {"speculative.n_min",         slot.params.speculative.n_min},
+            {"speculative.p_min",         slot.params.speculative.p_min},
        };
    }

@ -1216,7 +1315,7 @@ struct server_context {
            {"index",      slot.index},
        };

-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
            const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
            const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
            const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@ -1249,7 +1348,7 @@ struct server_context {
            {"content",             !slot.params.stream ? slot.generated_text : ""},
            {"id_slot",             slot.id},
            {"stop",                true},
-            {"model",               params.model_alias},
+            {"model",               params_base.model_alias},
            {"tokens_predicted",    slot.n_decoded},
            {"tokens_evaluated",    slot.n_prompt_tokens},
            {"generation_settings", get_formated_generation(slot)},
@ -1265,7 +1364,7 @@ struct server_context {
            {"index",               slot.index},
        };

-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
            std::vector<completion_token_output> probs;
            if (!slot.params.stream && slot.stopped_word) {
                const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@ -1422,10 +1521,10 @@ struct server_context {
                            data.at("input_prefix"),
                            data.at("input_suffix"),
                            data.at("input_extra"),
-                            params.n_batch,
-                            params.n_predict,
+                            params_base.n_batch,
+                            params_base.n_predict,
                            slots[0].n_ctx, // TODO: there should be a better way
-                            params.spm_infill,
+                            params_base.spm_infill,
                            tokenized_prompts[i]
                        );
                        create_task(data, tokens);
@ -1798,7 +1897,7 @@ struct server_context {
        // TODO: simplify and improve
        for (server_slot & slot : slots) {
            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
-                if (!params.ctx_shift) {
+                if (!params_base.ctx_shift) {
                    // this check is redundant (for good)
                    // we should never get here, because generation should already stopped in process_token()
                    slot.release();
@ -1864,7 +1963,7 @@ struct server_context {
        int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;

        // next, batch any pending prompts without exceeding n_batch
-        if (params.cont_batching || batch.n_tokens == 0) {
+        if (params_base.cont_batching || batch.n_tokens == 0) {
            for (auto & slot : slots) {
                // this slot still has a prompt to be processed
                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@ -1917,7 +2016,7 @@ struct server_context {
                                continue;
                            }
                        } else {
-                            if (!params.ctx_shift) {
+                            if (!params_base.ctx_shift) {
                                // if context shift is disabled, we make sure prompt size is smaller than KV size
                                // TODO: there should be a separate parameter that control prompt truncation
                                //       context shift should be applied only during the generation phase
@ -1960,14 +2059,14 @@ struct server_context {

                            if (slot.params.cache_prompt) {
                                // reuse any previously computed tokens that are common with the new prompt
-                                slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                                slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);

                                // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                                if (params.n_cache_reuse > 0) {
+                                if (params_base.n_cache_reuse > 0) {
                                    size_t head_c = slot.n_past; // cache
                                    size_t head_p = slot.n_past; // current prompt

-                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);

                                    while (head_c < slot.cache_tokens.size() &&
                                           head_p < prompt_tokens.size()) {
@ -1980,7 +2079,7 @@ struct server_context {
                                            n_match++;
                                        }

-                                        if (n_match >= (size_t) params.n_cache_reuse) {
+                                        if (n_match >= (size_t) params_base.n_cache_reuse) {
                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                            //for (size_t i = head_p; i < head_p + n_match; i++) {
                                            //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@ -2168,38 +2267,99 @@ struct server_context {
                    continue; // continue loop of slots
                }

-                completion_token_output result;
-                const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                llama_token id;

-                common_sampler_accept(slot.smpl, id, true);
+                {
+                    completion_token_output result;

-                slot.n_decoded += 1;
-                if (slot.n_decoded == 1) {
-                    slot.t_start_generation = ggml_time_us();
-                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                    metrics.on_prompt_eval(slot);
+                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+
+                    slot.i_batch = -1;
+
+                    common_sampler_accept(slot.smpl, id, true);
+
+                    slot.n_decoded += 1;
+                    if (slot.n_decoded == 1) {
+                        slot.t_start_generation = ggml_time_us();
+                        slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                        metrics.on_prompt_eval(slot);
+                    }
+
+                    result.tok = id;
+
+                    const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
+                        result.probs.push_back({
+                            cur_p->data[i].id,
+                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                        });
+                    }
+
+                    if (!process_token(result, slot)) {
+                        // release slot because of stop condition
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        continue;
+                    }
                }

-                result.tok = id;
-
-                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
-
-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                    result.probs.push_back({
-                        cur_p->data[i].id,
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
+                // check if the slot supports speculative decoding
+                if (!slot.can_speculate()) {
+                    continue;
                }

-                if (!process_token(result, slot)) {
-                    // release slot because of stop condition
-                    slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
-                    metrics.on_prediction(slot);
+                struct common_speculative_params params_spec;
+                params_spec.n_draft   = slot.params.speculative.n_max;
+                params_spec.n_reuse   = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
+                params_spec.p_min     = slot.params.speculative.p_min;
+
+                llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+
+                // ignore small drafts
+                if (slot.params.speculative.n_min > (int) draft.size()) {
+                    continue;
                }

-                slot.i_batch = -1;
+                // construct the speculation batch
+                common_batch_clear(slot.batch_spec);
+                common_batch_add  (slot.batch_spec, id, slot.n_past, { slot.id }, true);
+
+                for (size_t i = 0; i < draft.size(); ++i) {
+                    common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
+                }
+
+                llama_decode(ctx, slot.batch_spec);
+
+                // the accepted tokens from the speculation
+                const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
+
+                slot.n_past    += ids.size();
+                slot.n_decoded += ids.size();
+
+                slot.cache_tokens.push_back(id);
+                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+
+                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+
+                for (size_t i = 0; i < ids.size(); ++i) {
+                    completion_token_output result;
+
+                    result.tok = ids[i];
+
+                    if (!process_token(result, slot)) {
+                        // release slot because of stop condition
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        break;
+                    }
+                }
+
+                SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
            }
        }

@ -2697,7 +2857,7 @@ int main(int argc, char ** argv) {
    const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
        json data = {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots",                 ctx_server.params.n_parallel },
+            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "chat_template",               llama_get_chat_template(ctx_server.model) },
        };

@ -2705,7 +2865,7 @@ int main(int argc, char ** argv) {
    };

    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.endpoint_props) {
+        if (!ctx_server.params_base.endpoint_props) {
            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
@ -2718,7 +2878,7 @@ int main(int argc, char ** argv) {
    };

    const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
@ -2824,7 +2984,7 @@ int main(int argc, char ** argv) {

    // TODO: maybe merge this function with "handle_completions_generic"
    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
@ -3001,7 +3161,7 @@ int main(int argc, char ** argv) {
    };

    const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -24,7 +24,6 @@
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

 using json = nlohmann::ordered_json;
-using llama_tokens = std::vector<llama_token>;

 #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
 #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
@ -439,62 +438,6 @@ static std::string gen_chatcmplid() {
 // other common utils
 //

-static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
-    return i;
-}
-
-static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
-    // check for empty sequences
-    if (a.empty() || b.empty()) {
-        return 0;
-    }
-
-    // get the lengths of the input sequences
-    size_t a_len = a.size();
-    size_t b_len = b.size();
-
-    // initialize the maximum length of the longest common subsequence (LCS)
-    size_t max_length = 0;
-
-    // use two rows instead of a 2D matrix to optimize space
-    std::vector<size_t> prev_row(b_len + 1, 0);
-    std::vector<size_t> curr_row(b_len + 1, 0);
-
-    // iterate through the elements of a
-    for (size_t i = 1; i <= a_len; i++) {
-        // iterate through the elements of b
-        for (size_t j = 1; j <= b_len; j++) {
-            // if elements at the current positions match
-            if (a[i - 1] == b[j - 1]) {
-                // if it's the first element of either sequences, set LCS length to 1
-                if (i == 1 || j == 1) {
-                    curr_row[j] = 1;
-                } else {
-                    // increment LCS length by 1 compared to the previous element
-                    curr_row[j] = prev_row[j - 1] + 1;
-                }
-
-                // update max_length if necessary
-                if (curr_row[j] > max_length) {
-                    max_length = curr_row[j];
-                }
-            } else {
-                // reset LCS length if elements don't match
-                curr_row[j] = 0;
-            }
-        }
-
-        // update the previous row for the next iteration
-        prev_row = curr_row;
-    }
-
-    // return the maximum length of the LCS
-    return max_length;
-}
-
 static bool ends_with(const std::string & str, const std::string & suffix) {
    return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
        }
    }, nullptr);

+    // load dynamic backends
+    ggml_backend_load_all();
+
    // initialize the model
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
        }
    }

+    // load dynamic backends
+
+    ggml_backend_load_all();
+
    // initialize the model

    llama_model_params model_params = llama_model_default_params();
--- a/examples/speculative-simple/CMakeLists.txt
+++ b/examples/speculative-simple/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET llama-speculative-simple)
+add_executable(${TARGET} speculative-simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/speculative-simple/README.md
+++ b/examples/speculative-simple/README.md
@ -0,0 +1,12 @@
+# llama.cpp/examples/speculative-simple
+
+Demonstration of basic greedy speculative decoding
+
+```bash
+./bin/llama-speculative-simple \
+    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
+    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
+    -f test.txt -c 0 -ngl 99 --color \
+    --sampling-seq k --top-k 1 -fa --temp 0.0 \
+    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+```
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@ -0,0 +1,274 @@
+#include "arg.h"
+#include "common.h"
+#include "sampling.h"
+#include "speculative.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+        return 1;
+    }
+
+    if (params.n_predict < -1) {
+        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
+        return 1;
+    }
+
+    common_init();
+
+    if (params.speculative.model.empty()) {
+        LOG_ERR("%s: --model-draft is required\n", __func__);
+        return 1;
+    }
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model_tgt = NULL;
+    llama_model * model_dft = NULL;
+
+    llama_context * ctx_tgt = NULL;
+    llama_context * ctx_dft = NULL;
+
+    // load the target model
+    common_init_result llama_init_tgt = common_init_from_params(params);
+
+    model_tgt = llama_init_tgt.model;
+    ctx_tgt   = llama_init_tgt.context;
+
+    // load the draft model
+    params.devices      = params.speculative.devices;
+    params.model        = params.speculative.model;
+    params.n_ctx        = params.speculative.n_ctx;
+    params.n_batch      = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
+    }
+
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+    common_init_result llama_init_dft = common_init_from_params(params);
+
+    model_dft = llama_init_dft.model;
+    ctx_dft   = llama_init_dft.context;
+
+    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
+        return 1;
+    }
+
+    // Tokenize the prompt
+    std::vector<llama_token> inp;
+    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
+
+    if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
+        LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
+
+        return 1;
+    }
+
+    if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
+        LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
+
+        return 1;
+    }
+
+    LOG("\n\n");
+
+    for (auto id : inp) {
+        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
+    }
+
+    // how many tokens to draft each time
+    int n_draft     = params.speculative.n_max;
+    int n_draft_min = params.speculative.n_min;
+
+    float p_min = params.speculative.p_min;
+
+    int n_predict = 0;
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    // used to determine end of generation
+    bool has_eos = false;
+
+    // ================================================
+    // everything until here is standard initialization
+    // the relevant stuff for speculative decoding starts here
+
+    const auto t_enc_start = ggml_time_us();
+
+    // target model sampling context
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+
+    // eval the prompt
+    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+
+    // note: keep the last token separate!
+    llama_token id_last = inp.back();
+
+    // all tokens currently in the target context
+    auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
+
+    int n_past = inp.size() - 1;
+
+    // init the speculator
+    struct common_speculative_params params_spec;
+    params_spec.n_draft = n_draft;
+    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
+    params_spec.p_min   = p_min;
+
+    struct common_speculative * spec = common_speculative_init(ctx_dft);
+
+    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
+
+    const auto t_enc_end = ggml_time_us();
+
+    const auto t_dec_start = ggml_time_us();
+
+    while (true) {
+        // optionally, generate draft tokens that can be appended to the target batch
+        //
+        // this is the most important part of the speculation. the more probable tokens that are provided here
+        // the better the performance will be. in theory, this computation can be performed asynchronously and even
+        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
+        // from a cache or lookup tables.
+        //
+        llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
+
+        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
+
+        // always have a token to evaluate from before - id_last
+        common_batch_clear(batch_tgt);
+        common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);
+
+        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
+        {
+            // do not waste time on small drafts
+            if (draft.size() < n_draft_min) {
+                draft.clear();
+            }
+
+            for (size_t i = 0; i < draft.size(); ++i) {
+                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+            }
+
+            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
+
+            llama_decode(ctx_tgt, batch_tgt);
+        }
+
+        // sample from the full target batch and return the accepted tokens based on the target sampler
+        //
+        // for each token to be accepted, the sampler would have to sample that same token
+        // in such cases, instead of decoding the sampled token as we normally do, we simply continue with the
+        // available logits from the batch and sample the next token until we run out of logits or the sampler
+        // disagrees with the draft
+        //
+        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
+
+        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
+
+        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
+
+        n_past    += ids.size() - 1;
+        n_drafted += batch_tgt.n_tokens - 1;
+        n_accept  += ids.size() - 1;
+
+        // process the accepted tokens and update contexts
+        //
+        // this is the standard token post-processing that we normally do
+        // in this case, we do it for a group of accepted tokens at once
+        //
+        {
+            llama_token id;
+            std::string token_str;
+
+            for (size_t i = 0; i < ids.size(); ++i) {
+                id = ids[i];
+
+                ++n_predict;
+
+                if (llama_token_is_eog(model_tgt, id)) {
+                    has_eos = true;
+                    break;
+                }
+
+                token_str = common_token_to_piece(ctx_tgt, id);
+
+                if (params.use_color && i + 1 < ids.size()) {
+                    LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
+                } else {
+                    LOG("%s", token_str.c_str());
+                }
+            }
+
+            if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+                break;
+            }
+
+            LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
+
+            {
+                LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
+
+                llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+            }
+
+            prompt_tgt.push_back(id_last);
+            prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
+
+            // remember the last accepted token for the next iteration
+            id_last = id;
+        }
+    }
+
+    auto t_dec_end = ggml_time_us();
+
+    const int n_input = inp.size();
+
+    LOG("\n\n");
+
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+
+    LOG_INF("\n");
+    LOG_INF("n_draft   = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    LOG_INF("\n");
+    LOG_INF("draft:\n\n");
+
+    llama_perf_context_print(ctx_dft);
+
+    LOG_INF("\n");
+    LOG_INF("target:\n\n");
+    common_perf_print(ctx_tgt, smpl);
+
+    common_sampler_free(smpl);
+    common_speculative_free(spec);
+
+    llama_free(ctx_tgt);
+    llama_free_model(model_tgt);
+
+    llama_free(ctx_dft);
+    llama_free_model(model_dft);
+
+    llama_backend_free();
+
+    LOG("\n\n");
+
+    return 0;
+}
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -12,7 +12,7 @@
 #include <string>
 #include <vector>

-#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

 struct seq_draft {
@ -33,7 +33,7 @@ int main(int argc, char ** argv) {
    common_params params;

    // needed to get candidate probs even for temp <= 0.0
-    params.sparams.n_probs = 128;
+    params.sampling.n_probs = 128;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
        return 1;
@ -46,7 +46,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.model_draft.empty()) {
+    if (params.speculative.model.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
@ -55,9 +55,9 @@ int main(int argc, char ** argv) {
    const int n_seq_dft = params.n_parallel;

    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
-    const float p_split  = params.p_split;
+    const float p_draft_split = params.speculative.p_split;

-    std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
+    std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
    std::uniform_real_distribution<> u_dist;

    // init llama.cpp
@ -76,13 +76,14 @@ int main(int argc, char ** argv) {
    ctx_tgt = llama_init_tgt.context;

    // load the draft model
-    params.model = params.model_draft;
-    params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.draft_cpuparams.n_threads > 0) {
-        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
+    params.devices = params.speculative.devices;
+    params.model = params.speculative.model;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
    }

-    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    common_init_result llama_init_dft = common_init_from_params(params);
    model_dft = llama_init_dft.model;
    ctx_dft = llama_init_dft.context;
@ -170,7 +171,7 @@ int main(int argc, char ** argv) {
    //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));

    // how many tokens to draft each time
-    int n_draft = params.n_draft;
+    int n_draft = params.speculative.n_max;

    int n_predict = 0;
    int n_drafted = 0;
@ -183,14 +184,14 @@ int main(int argc, char ** argv) {
    bool has_eos = false;

    // target model sampling context (reuse the llama_context's sampling instance)
-    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);

    for (int s = 0; s < n_seq_dft; ++s) {
        // allocate llama_sampler for each draft sequence
-        drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
+        drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
    }

    llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
@ -230,7 +231,7 @@ int main(int argc, char ** argv) {
            // for stochastic sampling, attempt to match the token with the drafted tokens
            {
                bool accept = false;
-                if (params.sparams.temp > 0) {
+                if (params.sampling.temp > 0) {
                    // stochastic verification
                    common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

@ -494,7 +495,7 @@ int main(int argc, char ** argv) {

                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
-                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
+                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
--- a/flake.lock
+++ b/flake.lock
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1731676054,
-        "narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=",
+        "lastModified": 1732014248,
+        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add",
+        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
        "type": "github"
      },
      "original": {
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -33,6 +33,7 @@ else()
 endif()

 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

 #
 # option list
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -190,6 +190,14 @@ extern "C" {
    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
    // Get additional buffer types provided by the device (returns a NULL-terminated array)
    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void                         (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature {
+        const char * name;
+        const char * value;
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);

    //
    // Backend registry
@ -214,6 +222,13 @@ extern "C" {
    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
    GGML_API ggml_backend_t ggml_backend_init_best(void);

+    // Load a backend from a dynamic library and register it
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Unload a backend if loaded dynamically and unregister it
+    GGML_API void               ggml_backend_unload(ggml_backend_reg_t reg);
+    // Load all known backends from dynamic libraries
+    GGML_API void               ggml_backend_load_all(void);
+
    //
    // Backend scheduler
    //
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@ -7,29 +7,6 @@
 extern "C" {
 #endif

-    // Scheduling priorities
-    enum ggml_sched_priority {
-        GGML_SCHED_PRIO_NORMAL,
-        GGML_SCHED_PRIO_MEDIUM,
-        GGML_SCHED_PRIO_HIGH,
-        GGML_SCHED_PRIO_REALTIME
-    };
-
-    // Threadpool params
-    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-    struct ggml_threadpool_params {
-        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-        int                 n_threads;                   // number of threads
-        enum ggml_sched_priority prio;                   // thread priority
-        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
-        bool                strict_cpu;                  // strict cpu placement
-        bool                paused;                      // start in paused state
-    };
-
-    struct ggml_threadpool;     // forward declaration, see ggml.c
-
-    typedef struct ggml_threadpool * ggml_threadpool_t;
-
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
    struct ggml_cplan {
@ -75,14 +52,11 @@ extern "C" {
    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);

-    GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_BACKEND_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);

    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
@ -104,10 +78,10 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -2215,6 +2215,37 @@ extern "C" {

    GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

+    // ggml threadpool
+    // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+    // the goal should be to create an API that other backends can use move everything to the ggml base
+
+    // scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -202,6 +202,10 @@ endif()

 # ggml

+if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
+    message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
+endif()
+
 add_library(ggml-base
            ../include/ggml.h
            ../include/ggml-alloc.h
@ -226,6 +230,31 @@ add_library(ggml

 target_link_libraries(ggml PUBLIC ggml-base)

+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(ggml PRIVATE dl)
+endif()
+
+function(ggml_add_backend_library backend)
+    if (GGML_BACKEND_DL)
+        add_library(${backend} MODULE ${ARGN})
+        # write the shared library to the output directory
+        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+    else()
+        add_library(${backend} ${ARGN})
+        target_link_libraries(ggml PUBLIC ${backend})
+        install(TARGETS ${backend} LIBRARY)
+    endif()
+
+    target_link_libraries(${backend} PRIVATE ggml-base)
+    target_include_directories(${backend} PRIVATE ..)
+
+    if (${BUILD_SHARED_LIBS})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
+        target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
+    endif()
+endfunction()
+
 function(ggml_add_backend backend)
    string(TOUPPER "GGML_${backend}" backend_id)
    if (${backend_id})
@ -236,14 +265,10 @@ function(ggml_add_backend backend)
        # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
        if (${backend_id})
            message(STATUS "Including ${backend} backend")
-            if (${BUILD_SHARED_LIBS})
-                target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
-                target_compile_definitions(${backend_target} PUBLIC  GGML_BACKEND_SHARED)
+            if (NOT GGML_BACKEND_DL)
+                string(TOUPPER "GGML_USE_${backend}" backend_use)
+                target_compile_definitions(ggml PUBLIC ${backend_use})
            endif()
-            install(TARGETS ${backend_target} LIBRARY)
-            target_link_libraries(ggml PUBLIC ${backend_target})
-            string(TOUPPER "GGML_USE_${backend}" backend_use)
-            target_compile_definitions(ggml PUBLIC ${backend_use})
        endif()
    endif()
 endfunction()
@ -256,10 +281,10 @@ ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
 ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
+ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
-ggml_add_backend(MUSA)

 foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
--- a/ggml/src/ggml-amx/CMakeLists.txt
+++ b/ggml/src/ggml-amx/CMakeLists.txt
@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA

    file(GLOB   GGML_SOURCES_AMX "*.cpp")

-    add_library(ggml-amx
-                ${GGML_HEADERS_AMX}
-                ${GGML_SOURCES_AMX})
-
-    target_link_libraries(ggml-amx PRIVATE ggml-base)
-    target_include_directories(ggml-amx PRIVATE . ..)
+    ggml_add_backend_library(ggml-amx
+                             ${GGML_HEADERS_AMX}
+                             ${GGML_SOURCES_AMX}
+                            )

    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
    # TODO: integrate AMX backend into the CPU backend
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {

 ggml_backend_reg_t ggml_backend_amx_reg(void) {
    static struct ggml_backend_reg ggml_backend_amx_reg = {
-        /* .iface   = */ ggml_backend_amx_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_amx_reg_i,
+        /* .context     = */ NULL,
    };

    return &ggml_backend_amx_reg;
@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
 }

 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@ -8,6 +8,8 @@
 extern "C" {
 #endif

+    #define GGML_BACKEND_API_VERSION 1
+
    //
    // Backend buffer type
    //
@ -63,20 +65,20 @@ extern "C" {
        enum ggml_backend_buffer_usage usage;
    };

-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
                   ggml_backend_buffer_type_t buft,
            struct ggml_backend_buffer_i      iface,
                   void *                     context,
                   size_t                     size);

    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

    // multi-buffer
    // buffer that contains a collection of buffers
-    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_API bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_API void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

    //
    // Backend (stream)
@ -199,17 +201,37 @@ extern "C" {
    };

    struct ggml_backend_reg {
-        // int api_version; // TODO: for dynamic loading
+        int api_version; // initialize to GGML_BACKEND_API_VERSION
        struct ggml_backend_reg_i iface;
        void * context;
    };

-
    // Internal backend registry API
-    void ggml_backend_register(ggml_backend_reg_t reg);
-    void ggml_backend_device_register(ggml_backend_dev_t device);
-    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
-    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
+    // Add backend dynamic loading support to the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+
+    #ifdef GGML_BACKEND_DL
+        #ifdef __cplusplus
+        #    define GGML_BACKEND_DL_IMPL(reg_fn)                                 \
+                extern "C" {                                                     \
+                    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+                }                                                                \
+                ggml_backend_reg_t ggml_backend_init(void) {                     \
+                    return reg_fn();                                             \
+                }
+        #else
+        #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+                ggml_backend_reg_t ggml_backend_init(void) {                 \
+                    return reg_fn();                                         \
+                }
+        #endif
+    #else
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)
+    #endif

 #ifdef  __cplusplus
 }
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@ -1,11 +1,29 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
-#include "ggml-cpu.h"
 #include "ggml-impl.h"
+#include <algorithm>
 #include <cstring>
+#include <string>
 #include <vector>

+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#elif defined(__APPLE__)
+#    include <mach-o/dyld.h>
+#    include <dlfcn.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+
 // Backend registry
+#ifdef GGML_USE_CPU
+#include "ggml-cpu.h"
+#endif

 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@ -43,8 +61,13 @@
 #include "ggml-kompute.h"
 #endif

+struct ggml_backend_reg_entry {
+    ggml_backend_reg_t reg;
+    void * handle;
+};
+
 struct ggml_backend_registry {
-    std::vector<ggml_backend_reg_t> backends;
+    std::vector<ggml_backend_reg_entry> backends;
    std::vector<ggml_backend_dev_t> devices;

    ggml_backend_registry() {
@ -75,11 +98,19 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_KOMPUTE
        register_backend(ggml_backend_kompute_reg());
 #endif
-
+#ifdef GGML_USE_CPU
        register_backend(ggml_backend_cpu_reg());
+#endif
    }

-    void register_backend(ggml_backend_reg_t reg) {
+    ~ggml_backend_registry() {
+        while (!backends.empty()) {
+            // use silent since the log system may have been destroyed at this point
+            unload_backend(backends.back().reg, true);
+        }
+    }
+
+    void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
        if (!reg) {
            return;
        }
@ -88,7 +119,7 @@ struct ggml_backend_registry {
        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
-        backends.push_back(reg);
+        backends.push_back({ reg, handle });
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
            register_device(ggml_backend_reg_dev_get(reg, i));
        }
@ -100,6 +131,111 @@ struct ggml_backend_registry {
 #endif
        devices.push_back(device);
    }
+
+    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+#ifdef _WIN32
+        // suppress error dialogs for missing DLLs
+        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+        HMODULE handle = LoadLibraryA(path);
+
+        if (!handle) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
+            }
+            SetErrorMode(old_mode);
+            return nullptr;
+        }
+
+        ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
+
+        SetErrorMode(old_mode);
+
+        if (!backend_init) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
+            }
+            FreeLibrary(handle);
+            return nullptr;
+        }
+#else
+        void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+
+        if (!handle) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
+            }
+            return nullptr;
+        }
+
+        auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
+
+        if (!backend_init) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
+            }
+            dlclose(handle);
+            return nullptr;
+        }
+#endif
+        ggml_backend_reg_t reg = backend_init();
+
+        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
+            if (!silent) {
+                if (!reg) {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                } else {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+                                   __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                }
+            }
+#ifdef _WIN32
+            FreeLibrary(handle);
+#else
+            dlclose(handle);
+#endif
+            return nullptr;
+        }
+
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        register_backend(reg, handle);
+        return reg;
+    }
+
+    void unload_backend(ggml_backend_reg_t reg, bool silent) {
+        auto it = std::find_if(backends.begin(), backends.end(),
+                                [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
+
+        if (it == backends.end()) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: backend not found\n", __func__);
+            }
+            return;
+        }
+
+        if (!silent) {
+            GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
+        }
+
+        // remove devices
+        devices.erase(
+            std::remove_if(devices.begin(), devices.end(),
+                            [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+            devices.end());
+
+        // unload library
+        if (it->handle) {
+#ifdef _WIN32
+            FreeLibrary((HMODULE) it->handle);
+#else
+            dlclose(it->handle);
+#endif
+        }
+
+        // remove backend
+        backends.erase(it);
+    }
 };

 static ggml_backend_registry & get_reg() {
@ -117,23 +253,32 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
 }

 // Backend (reg) enumeration
+static bool striequals(const char * a, const char * b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_count() {
    return get_reg().backends.size();
 }

 ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
    GGML_ASSERT(index < ggml_backend_reg_count());
-    return get_reg().backends[index];
+    return get_reg().backends[index].reg;
 }

 ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
+        if (striequals(ggml_backend_reg_name(reg), name)) {
            return reg;
        }
    }
-    return NULL;
+    return nullptr;
 }

 // Device enumeration
@ -149,11 +294,11 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
+        if (striequals(ggml_backend_dev_name(dev), name)) {
            return dev;
        }
    }
-    return NULL;
+    return nullptr;
 }

 ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
@ -163,14 +308,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
            return dev;
        }
    }
-    return NULL;
+    return nullptr;
 }

 // Convenience functions
 ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
    if (!dev) {
-        return NULL;
+        return nullptr;
    }
    return ggml_backend_dev_init(dev, params);
 }
@ -178,7 +323,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
 ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
    if (!dev) {
-        return NULL;
+        return nullptr;
    }
    return ggml_backend_dev_init(dev, params);
 }
@ -189,7 +334,97 @@ ggml_backend_t ggml_backend_init_best(void) {
        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    }
    if (!dev) {
-        return NULL;
+        return nullptr;
    }
-    return ggml_backend_dev_init(dev, NULL);
+    return ggml_backend_dev_init(dev, nullptr);
+}
+
+// Dynamic loading
+ggml_backend_reg_t ggml_backend_load(const char * path) {
+    return get_reg().load_backend(path, false);
+}
+
+void ggml_backend_unload(ggml_backend_reg_t reg) {
+    get_reg().unload_backend(reg, true);
+}
+
+void ggml_backend_load_all() {
+    std::vector<std::string> search_prefix;
+
+    // add the executable directory to the search path
+    // FIXME: this is convenient for development, but it should probably be disabled in production
+
+#if defined(__APPLE__)
+    // get executable path
+    std::vector<char> path;
+    uint32_t size;
+    while (true) {
+        size = path.size();
+        if (_NSGetExecutablePath(path.data(), &size) == 0) {
+            break;
+        }
+        path.resize(size);
+    }
+    std::string base_path(path.data(), size);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('/');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    search_prefix.push_back(base_path + "/");
+#elif defined(__linux__)
+    std::string base_path = ".";
+    std::vector<char> path(1024);
+    while (true) {
+        // get executable path
+        ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+        if (len == -1) {
+            break;
+        }
+        if (len < (ssize_t) path.size()) {
+            base_path = std::string(path.data(), len);
+            // remove executable name
+            auto last_slash = base_path.find_last_of('/');
+            if (last_slash != std::string::npos) {
+                base_path = base_path.substr(0, last_slash);
+            }
+            break;
+        }
+        path.resize(path.size() * 2);
+    }
+
+    search_prefix.push_back(base_path + "/");
+#endif
+
+    auto & reg = get_reg();
+
+    auto try_load = [&](const std::string & name) {
+        std::string os_name;
+#ifdef _WIN32
+        os_name = "ggml-" + name + ".dll";
+#else
+        os_name = "libggml-" + name + ".so";
+#endif
+        if (reg.load_backend(os_name.c_str(), true)) {
+            return;
+        }
+        for (const auto & prefix : search_prefix) {
+            if (reg.load_backend((prefix + os_name).c_str(), true)) {
+                return;
+            }
+        }
+    };
+
+    try_load("amx");
+    try_load("blas");
+    try_load("cann");
+    try_load("cuda");
+    try_load("hip");
+    try_load("kompute");
+    try_load("metal");
+    try_load("rpc");
+    try_load("sycl");
+    try_load("vulkan");
+    try_load("musa");
+    try_load("cpu");
 }
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 }

 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
 }

 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
@ -884,9 +886,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int * node_backend_id = &tensor_backend_id(node);
-        if (ggml_is_view_op(node->op)) {
-            continue;
-        }
        // do not overwrite user assignments
        if (*node_backend_id == -1) {
            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@ -11,12 +11,9 @@ find_package(BLAS)
 if (BLAS_FOUND)
    message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

-    add_library(ggml-blas
-                ggml-blas.cpp
-                )
-
-    target_link_libraries(ggml-blas PRIVATE ggml-base)
-    target_include_directories(ggml-blas PRIVATE . ..)
+    ggml_add_backend_library(ggml-blas
+                             ggml-blas.cpp
+                            )

    if (${GGML_BLAS_VENDOR} MATCHES "Apple")
        add_compile_definitions(ACCELERATE_NEW_LAPACK)
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {

 ggml_backend_reg_t ggml_backend_blas_reg(void) {
    static struct ggml_backend_reg ggml_backend_blas_reg = {
-        /* .iface   = */ ggml_backend_blas_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_blas_reg_i,
+        /* .context     = */ NULL,
    };

    return &ggml_backend_blas_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
    message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
 endif()

+# Auto-detech Soc type and Soc version, if detect failed, will abort build
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info
+        RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR ${npu_result})
+        message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
+else()
+    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
+endif()
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+
 if (CANN_INSTALL_DIR)
    # Only Support Linux.
    if (NOT UNIX)
@ -34,11 +61,13 @@ if (CANN_INSTALL_DIR)

    file(GLOB GGML_SOURCES_CANN "*.cpp")

-    add_library(ggml-cann ${GGML_SOURCES_CANN})
-    target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
-    target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
+    ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
+    target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
+    target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
    target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)

+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
    message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
    message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
 else()
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@ -21,6 +21,7 @@
 */

 #include "aclnn_ops.h"
+#include "ggml-impl.h"

 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
    aclTensor* tensors[] = {acl_src0, acl_src1};
    aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);

    ACL_CHECK(aclDestroyTensorList(tensorList));
    ACL_CHECK(aclDestroyTensor(acl_dst));
@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];  // kernel
    ggml_tensor* src1 = dst->src[1];  // input

-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
    GGML_TENSOR_BINARY_OP_LOCALS;

    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    const int64_t OH = is_2D ? ne2 : 1;
    const int64_t OW = ne1;

-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
    // memory allocated increased to 3x when is_2D == false
    const int64_t n_bytes_factor = is_2D ? 1 : 3;

@ -2312,6 +2310,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    switch (src0->type) {
        case GGML_TYPE_F32:
+        {
+#ifdef ASCEND_310P
+             // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
            aclrtlaunch_ascendc_get_row_f32(
                24, ctx.stream(), src0->data, src1->data, dst->data,
                ((ggml_tensor*)src0->extra)->ne,
@ -2320,7 +2326,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                ((ggml_tensor*)dst->extra)->nb);
            break;
+        }
        case GGML_TYPE_F16:
+        {
+#ifdef ASCEND_310P
+             // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
            aclrtlaunch_ascendc_get_row_f16(
                24, ctx.stream(), src0->data, src1->data, dst->data,
                ((ggml_tensor*)src0->extra)->ne,
@ -2329,6 +2344,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                ((ggml_tensor*)dst->extra)->nb);
            break;
+        }
        case GGML_TYPE_Q4_0:
            aclrtlaunch_ascendc_get_row_q4_0(
                24, ctx.stream(), src0->data, src1->data, dst->data,
@ -2841,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
    ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }

+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // TODO: use ascendc
    // Only test with LLAMA model.
    ggml_tensor* src0 = dst->src[0];  // input
    ggml_tensor* src2 = dst->src[2];  // freq_factors

-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
    // param
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    // const int n_past     = ((int32_t *) dst->op_params)[0];
@ -2867,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));

-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
    GGML_ASSERT(n_dims % 2 == 0);
-
    // TODO: ext_factor != 0
    GGML_ASSERT(ext_factor == 0);
    // TODO: freq_scale != 1
    GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

@ -2906,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                     theta_scale, is_neox);

-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;

-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    void* workspaceAddr = nullptr;

-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, -1, -1, 1, 1，1，...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
    }

-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
-    }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-
-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
    }

-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
    ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -1669,12 +1669,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
+                case GGML_TYPE_Q8_0:
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
-                case GGML_TYPE_Q8_0:
-                    // TODO: fix me
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
                case GGML_TYPE_Q4_0:
                    return true;
                default:
@ -1706,9 +1708,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                    return false;
            }
        }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            //TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
        case GGML_OP_DUP:
        case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
@ -1722,17 +1776,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
        case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
@ -2064,16 +2114,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
                dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
                ggml_cann_set_device(i);
                ggml_backend_dev_t dev = new ggml_backend_device {
-                    /* .interface = */ ggml_backend_cann_device_interface,
-                    /* .reg       = */ &reg,
-                    /* .context   = */ dev_ctx
+                    /* .iface   = */ ggml_backend_cann_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ dev_ctx
                };
                ctx->devices.push_back(dev);
            }

            reg = ggml_backend_reg {
-                /* .interface = */ ggml_backend_cann_reg_interface,
-                /* .context   = */ ctx
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_cann_reg_interface,
+                /* .context     = */ ctx
            };
        }

@ -2126,3 +2177,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
    ggml_cann_set_device(device);
    ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@ -1,7 +1,3 @@
-if (NOT SOC_TYPE)
-    set (SOC_TYPE "Ascend910B3")
-endif()
-
 file(GLOB SRC_FILES
    get_row_f32.cpp
    get_row_f16.cpp
@ -13,7 +9,6 @@ file(GLOB SRC_FILES
    dup.cpp
 )

-string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")

@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
    ${SRC_FILES}
 )

+message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@ -5,6 +5,7 @@
 using namespace AscendC;

 #define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>

 template <typename SRC_T, typename DST_T>
 class DupByRows {
@ -19,6 +20,7 @@ class DupByRows {
        // Input has four dims.
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
+        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);

        // param
        num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@ -51,24 +53,36 @@ class DupByRows {

    __aicore__ inline void copy_in() {
        LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
        src_queue.EnQue(src_local);
    }

    __aicore__ inline void copy_out() {
        LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
        DataCopyExtParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = num_elem * sizeof(DST_T);
        DataCopyPad(dst_gm, dst_local, dataCopyParams);
-
+#endif
        dst_queue.FreeTensor(dst_local);
    }

--- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                int64_t *output_ne_ub, size_t *output_nb_ub) {
        // TODO, use template for F16/f32
        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
@ -59,32 +59,42 @@ class GET_ROW_F16 {
    }

    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
        if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(half);
-            DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
        }
+        DataCopy(input_local, input_gm[offset], len);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
        if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
+#endif
        }
        output_queue.FreeTensor(output_local);
    }
@ -150,6 +160,7 @@ class GET_ROW_F16 {
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };

 template <typename T>
--- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@ -13,7 +13,7 @@ class GET_ROW_F32 {
                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                int64_t *output_ne_ub, size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
@ -55,31 +55,40 @@ class GET_ROW_F32 {

    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
        if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
        }
+        DataCopy(input_local, input_gm[offset], len);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
        if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
+#endif
        }
        output_queue.FreeTensor(output_local);
    }
@ -144,6 +153,7 @@ class GET_ROW_F32 {
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };

 template <typename T>
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();

        // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: 310P support quantification
+#else
        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
        // Only mul need compile by group.
        half scale = scale_gm.GetValue(scale_offset);

--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@ -1,14 +1,13 @@
-add_library(ggml-cpu
-            ggml-cpu.c
-            ggml-cpu.cpp
-            ggml-cpu-aarch64.c
-            ggml-cpu-aarch64.h
-            ggml-cpu-quants.c
-            ggml-cpu-quants.h
-            )
+ggml_add_backend_library(ggml-cpu
+                         ggml-cpu.c
+                         ggml-cpu.cpp
+                         ggml-cpu-aarch64.c
+                         ggml-cpu-aarch64.h
+                         ggml-cpu-quants.c
+                         ggml-cpu-quants.h
+                        )

-target_link_libraries(ggml-cpu PRIVATE ggml-base)
-target_include_directories(ggml-cpu PRIVATE . ..)
+target_include_directories(ggml-cpu PRIVATE .)

 if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

 #endif // GGML_USE_OPENMP

-void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
-    p->n_threads  = n_threads;
-    p->prio       = 0;     // default priority (usually means normal or inherited)
-    p->poll       = 50;    // hybrid-polling enabled
-    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
-    p->paused     = false; // threads are ready to go
-    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
-}
-
-struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
-    struct ggml_threadpool_params p;
-    ggml_threadpool_params_init(&p, n_threads);
-    return p;
-}
-
-bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
-    if (p0->n_threads      != p1->n_threads  )    return false;
-    if (p0->prio           != p1->prio       )    return false;
-    if (p0->poll           != p1->poll       )    return false;
-    if (p0->strict_cpu     != p1->strict_cpu )    return false;
-    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
-}
-
 static struct ggml_threadpool * ggml_threadpool_new_impl(
    struct ggml_threadpool_params * tpp,
               struct ggml_cgraph * cgraph,
@ -13896,7 +13873,7 @@ int ggml_cpu_has_vsx(void) {
 }

 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_NEON)
    return ggml_arm_arch_features.has_neon;
 #else
    return 0;
@ -13904,7 +13881,7 @@ int ggml_cpu_has_neon(void) {
 }

 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    return ggml_arm_arch_features.has_sve;
 #else
    return 0;
@ -13912,7 +13889,7 @@ int ggml_cpu_has_sve(void) {
 }

 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
    return ggml_arm_arch_features.has_i8mm;
 #else
    return 0;
@ -13920,7 +13897,7 @@ int ggml_cpu_has_matmul_int8(void) {
 }

 int ggml_cpu_get_sve_cnt(void) {
-#if defined(__ARM_ARCH)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    return ggml_arm_arch_features.sve_cnt;
 #else
    return 0;
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
    return &ggml_backend_cpu_device;
 }

-struct ggml_backend_feature {
-    const char * name;
-    const char * value;
-};
-
-// Not used yet
 // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
-// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API
 static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
    static std::vector<ggml_backend_feature> features = []() {
+        ggml_cpu_init();
+
        std::vector<ggml_backend_feature> features;
        if (ggml_cpu_has_sse3()) {
            features.push_back({ "SSE3", "1" });
@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_avx()) {
            features.push_back({ "AVX", "1" });
        }
+        if (ggml_cpu_has_avx_vnni()) {
+            features.push_back({ "AVX_VNNI", "1" });
+        }
        if (ggml_cpu_has_avx2()) {
            features.push_back({ "AVX2", "1" });
        }
@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_fma()) {
            features.push_back({ "FMA", "1" });
        }
-        if (ggml_cpu_has_avx_vnni()) {
-            features.push_back({ "AVX_VNNI", "1" });
-        }
        if (ggml_cpu_has_avx512()) {
            features.push_back({ "AVX512", "1" });
        }
@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_llamafile()) {
            features.push_back({ "LLAMAFILE", "1" });
        }
+        // TODO: rename this
+    #ifdef GGML_USE_CPU_AARCH64
+        features.push_back({ "AARCH64_REPACK", "1" });
+    #endif

        features.push_back({ nullptr, nullptr });

@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        return (void *)ggml_backend_cpu_get_extra_bufts;
    }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cpu_get_features;
+    }
+    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
+        return (void *)ggml_backend_cpu_set_abort_callback;
+    }
+    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
+        return (void *)ggml_numa_init;
+    }
+    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
+        return (void *)ggml_is_numa;
+    }
+
+    // threadpool - TODO:  move to ggml-base
+    if (strcmp(name, "ggml_threadpool_new") == 0) {
+        return (void *)ggml_threadpool_new;
+    }
+    if (strcmp(name, "ggml_threadpool_free") == 0) {
+        return (void *)ggml_threadpool_free;
+    }
+    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
+        return (void *)ggml_backend_cpu_set_threadpool;
+    }

    return NULL;

@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
    ggml_cpu_init();

    static struct ggml_backend_reg ggml_backend_cpu_reg = {
-        /* .iface   = */ ggml_backend_cpu_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_cpu_reg_i,
+        /* .context     = */ NULL,
    };

    return &ggml_backend_cpu_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND)
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
    endif()

-    add_library(ggml-cuda
-                ${GGML_HEADERS_CUDA}
-                ${GGML_SOURCES_CUDA}
-                )
-
-    target_link_libraries(ggml-cuda PRIVATE ggml-base)
-    target_include_directories(ggml-cuda PRIVATE . ..)
+    ggml_add_backend_library(ggml-cuda
+                             ${GGML_HEADERS_CUDA}
+                             ${GGML_SOURCES_CUDA}
+                            )

    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

--- a/ggml/src/ggml-cuda/argmax.cu
+++ b/ggml/src/ggml-cuda/argmax.cu
@ -1,57 +1,69 @@
-#include "common.cuh"
-#include "argmax.cuh"
-#include "sum.cuh"
-
+#include <algorithm>
 #include <cstdint>

-static __global__ void argmax_f32(
-    const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) {
+#include "argmax.cuh"
+#include "common.cuh"
+#include "sum.cuh"

-    int argmax_thread = 0;
-    const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE;
+static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
+    const int64_t row = blockIdx.x;

-#pragma unroll
-    for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) {
-        const int64_t row = row0 + row1;
+    float maxval = -FLT_MAX;
+    int   argmax = -1;
+    const float * rowx = x + row * ncols;

-        if (row >= nrows) {
-            break;
+    for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
+        const float val = rowx[col];
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
        }
-
-        float maxval = -FLT_MAX;
-        int   argmax = -1;
-
-        for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) {
-            const float val        = x[row*ncols + col];
-            const int   bigger     = val > maxval;
-            const int   not_bigger = bigger ^ 0x00000001;
-
-            maxval = maxval*not_bigger + val*bigger;
-            argmax = argmax*not_bigger + col*bigger;
-        }
-
-#pragma unroll
-        for (int mask = 16; mask > 0; mask >>= 1) {
-            const float val        = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE);
-            const int   col        = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE);
-            const int   bigger     = val > maxval;
-            const int   not_bigger = bigger ^ 0x00000001;
-
-            maxval = maxval*not_bigger + val*bigger;
-            argmax = argmax*not_bigger + col*bigger;
-        }
-
-        const int store = row1 == threadIdx.x;
-        argmax_thread += store*argmax;
    }

-    const int row = row0 + threadIdx.x;
-
-    if (row >= nrows) {
-        return;
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
    }

-    dst[row] = argmax_thread;
+    const int n_warps = blockDim.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    if (n_warps > 1) {
+        constexpr int    max_warps = 1024 / WARP_SIZE;
+        __shared__ float shared_maxval[max_warps];
+        __shared__ int   shared_argmax[max_warps];
+        if (lane_id == 0) {
+            shared_maxval[warp_id] = maxval;
+            shared_argmax[warp_id] = argmax;
+        }
+
+        __syncthreads();
+
+        if (warp_id == 0) {
+            if (lane_id < n_warps) {
+                maxval = shared_maxval[lane_id];
+                argmax = shared_argmax[lane_id];
+            }
+#pragma unroll
+            for (int offset = 16; offset > 0; offset >>= 1) {
+                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+                if (val > maxval) {
+                    maxval = val;
+                    argmax = col;
+                }
+            }
+        }
+    }
+
+    if (warp_id == 0 && lane_id == 0) {
+        dst[row] = argmax;
+    }
 }

 void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -70,10 +82,10 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    cudaStream_t stream = ctx.stream();

-    const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE;
-
-    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const int64_t num_blocks = nrows;
+    const int64_t num_threads = std::min<int64_t>(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE);
+    const dim3 blocks_dim(num_threads, 1, 1);
    const dim3 blocks_num(num_blocks, 1, 1);

-    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows);
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00);
 }
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
    return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
 }

 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
    }
    return a;
 }
@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {

 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
        reinterpret_cast<half&>(a.y) += __high2half(a_other);
    }
    return a;
 #else
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
    }
    return a;
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {

 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
    }
    return x;
 }
@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+   for (int offset = 16; offset > 0; offset >>= 1) {
+       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
   }
   return x;
 #else
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
    return ctx->devices[index];
 }

+static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        std::vector<ggml_backend_feature> features;
+    #define _STRINGIFY(...) #__VA_ARGS__
+    #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
+
+    #ifdef __CUDA_ARCH_LIST__
+        features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_MMQ
+        features.push_back({ "FORCE_MMQ", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_CUBLAS
+        features.push_back({ "FORCE_CUBLAS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_VMM
+        features.push_back({ "NO_VMM", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_PEER_COPY
+        features.push_back({ "NO_PEER_COPY", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_F16
+        features.push_back({ "F16", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_USE_GRAPHS
+        features.push_back({ "USE_GRAPHS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+        features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
+    #endif
+
+    #ifdef GGML_CUDA_FA_ALL_QUANTS
+        features.push_back({ "FA_ALL_QUANTS", "1" });
+    #endif
+
+    #undef _STRINGIFY
+    #undef STRINGIFY
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
 static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);
    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
    if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
        return (void *)ggml_backend_cuda_unregister_host_buffer;
    }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cuda_get_features;
+    }
    return nullptr;
 }

@ -3169,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                dev_ctx->description = prop.name;

                ggml_backend_dev_t dev = new ggml_backend_device {
-                    /* .interface = */ ggml_backend_cuda_device_interface,
-                    /* .reg       = */ &reg,
-                    /* .context   = */ dev_ctx
+                    /* .iface   = */ ggml_backend_cuda_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ dev_ctx
                };
                ctx->devices.push_back(dev);
            }

            reg = ggml_backend_reg {
-                /* .interface = */ ggml_backend_cuda_reg_interface,
-                /* .context   = */ ctx
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_cuda_reg_interface,
+                /* .context     = */ ctx
            };
        }

@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {

    return cuda_backend;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@ -69,8 +69,8 @@ static __global__ void quantize_mmq_q8_1(

    // Exchange max. abs. value between vals_per_scale/4 threads.
 #pragma unroll
-    for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
+    for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
    }

    float sum;
@ -79,8 +79,8 @@ static __global__ void quantize_mmq_q8_1(

        // Exchange calculate sum across vals_per_sum/4 threads.
 #pragma unroll
-        for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) {
-            sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE);
+        for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
+            sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
        }
    }

--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@ -64,12 +64,10 @@ else()
    list(APPEND GGML_SOURCES_ROCM ${SRCS})
 endif()

-add_library(ggml-hip
-            ${GGML_HEADERS_ROCM}
-            ${GGML_SOURCES_ROCM})
-
-target_link_libraries(ggml-hip PRIVATE ggml-base)
-target_include_directories(ggml-hip PRIVATE . ..)
+ggml_add_backend_library(ggml-hip
+                         ${GGML_HEADERS_ROCM}
+                         ${GGML_SOURCES_ROCM}
+                        )

 # TODO: do not use CUDA definitions for HIP
 target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -295,6 +295,9 @@ struct ggml_cgraph {
    enum ggml_cgraph_eval_order order;
 };

+// returns a slice of cgraph with nodes [i0, i1)
+// the slice does not have leafs or gradients
+// if you need the gradients, get them from the original graph
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

 // Memory allocation
--- a/ggml/src/ggml-kompute/CMakeLists.txt
+++ b/ggml/src/ggml-kompute/CMakeLists.txt
@ -6,13 +6,13 @@ if (NOT glslc_executable)
    message(FATAL_ERROR "glslc not found")
 endif()

-add_library(ggml-kompute
-            ggml-kompute.cpp
-            ../../include/ggml-kompute.h
-            )
+ggml_add_backend_library(ggml-kompute
+                         ggml-kompute.cpp
+                         ../../include/ggml-kompute.h
+                        )

 target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
-target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})

 add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)

--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {

 ggml_backend_reg_t ggml_backend_kompute_reg() {
    static ggml_backend_reg reg = {
-        /* .iface   = */ ggml_backend_kompute_reg_i,
-        /* .context = */ nullptr,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_kompute_reg_i,
+        /* .context     = */ nullptr,
    };

    return &reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
--- a/ggml/src/ggml-metal/CMakeLists.txt
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)

 message(STATUS "Metal framework found")

-add_library(ggml-metal
-            ggml-metal.m
-            )
+ggml_add_backend_library(ggml-metal
+                         ggml-metal.m
+                        )

 target_link_libraries(ggml-metal PRIVATE
-                      ggml-base
                      ${FOUNDATION_LIBRARY}
                      ${METAL_FRAMEWORK}
                      ${METALKIT_FRAMEWORK}
                      )

-target_include_directories(ggml-metal PRIVATE . ..)
-
 if (GGML_METAL_NDEBUG)
    add_compile_definitions(GGML_METAL_NDEBUG)
 endif()
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@ -1927,7 +1927,7 @@ static void ggml_metal_encode_node(

                // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                // to the matrix-vector kernel
-                int ne11_mm_min = 1;
+                int ne11_mm_min = 4;

 #if 0
                // the numbers below are measured on M2 Ultra for 7B and 13B models
@ -1951,316 +1951,316 @@ static void ggml_metal_encode_node(
                        }
 #endif

-                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        if ([device supportsFamily:MTLGPUFamilyApple7] &&
-                                !ggml_is_transposed(src0) &&
-                                !ggml_is_transposed(src1) &&
-                                src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 && ne00 >= 64 &&
-                                (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
-                            //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                if ([device supportsFamily:MTLGPUFamilyApple7] &&
+                        !ggml_is_transposed(src0) &&
+                        !ggml_is_transposed(src1) &&
+                        src1t == GGML_TYPE_F32 &&
+                        ne00 % 32 == 0 && ne00 >= 64 &&
+                        (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
+                    //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);

-                            // some Metal matrix data types require aligned pointers
-                            // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
-                                case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
-                                case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
-                                default: break;
-                            }
+                    // some Metal matrix data types require aligned pointers
+                    // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+                        case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+                        case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
+                        default: break;
+                    }

-                            id<MTLComputePipelineState> pipeline = nil;
+                    id<MTLComputePipelineState> pipeline = nil;

-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
-                                case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ABORT("MUL MAT-MAT not implemented");
-                            }
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
+                        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
+                        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
+                        default: GGML_ABORT("MUL MAT-MAT not implemented");
+                    }

-                            ggml_metal_kargs_mul_mm args = {
-                                /*.ne00 =*/ ne00,
-                                /*.ne02 =*/ ne02,
-                                /*.nb01 =*/ nb01,
-                                /*.nb02 =*/ nb02,
-                                /*.nb03 =*/ nb03,
-                                /*.ne12 =*/ ne12,
-                                /*.nb10 =*/ nb10,
-                                /*.nb11 =*/ nb11,
-                                /*.nb12 =*/ nb12,
-                                /*.nb13 =*/ nb13,
-                                /*.ne0  =*/ ne0,
-                                /*.ne1  =*/ ne1,
-                                /*.r2   =*/ r2,
-                                /*.r3   =*/ r3,
-                            };
+                    ggml_metal_kargs_mul_mm args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne02 =*/ ne02,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne12 =*/ ne12,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.nb13 =*/ nb13,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.r2   =*/ r2,
+                        /*.r3   =*/ r3,
+                    };

-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBytes:&args    length:sizeof(args) atIndex:0];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];

-                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                        } else {
-                            int nth0 = 32;
-                            int nth1 = 1;
-                            int nrows = 1;
-                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+                    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                    [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                } else {
+                    int nth0 = 32;
+                    int nth1 = 1;
+                    int nrows = 1;
+                    //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);

-                            id<MTLComputePipelineState> pipeline = nil;
+                    id<MTLComputePipelineState> pipeline = nil;

-                            // use custom matrix x vector kernel
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                    // use custom matrix x vector kernel
+                    switch (src0t) {
+                        case GGML_TYPE_F32:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                                nrows = 4;
+                            } break;
+                        case GGML_TYPE_F16:
+                            {
+                                nth0 = 32;
+                                nth1 = 1;
+                                if (src1t == GGML_TYPE_F32) {
+                                    if (ne11 * ne12 < 4) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
+                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
+                                        nrows = ne11;
+                                    } else {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
                                        nrows = 4;
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        if (src1t == GGML_TYPE_F32) {
-                                            if (ne11 * ne12 < 4) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
-                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
-                                                nrows = ne11;
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } else {
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
-                                            nrows = 4;
-                                        }
-                                    } break;
-                                case GGML_TYPE_BF16:
-                                    {
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        if (src1t == GGML_TYPE_F32) {
-                                            if (ne11 * ne12 < 4) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
-                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
-                                                nrows = ne11;
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } else {
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
-                                            nrows = 4;
-                                        }
-                                    } break;
-                                case GGML_TYPE_Q4_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q8_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q2_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q3_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_K:
-                                    {
-                                        nth0 = 4; //1;
-                                        nth1 = 8; //32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q6_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_M:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_NL:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
-                                    } break;
-                                default:
-                                    {
-                                        GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                        GGML_ABORT("not implemented");
                                    }
-                            };
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
+                                    nrows = 4;
+                                }
+                            } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                nth0 = 32;
+                                nth1 = 1;
+                                if (src1t == GGML_TYPE_F32) {
+                                    if (ne11 * ne12 < 4) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
+                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
+                                        nrows = ne11;
+                                    } else {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
+                                        nrows = 4;
+                                    }
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
+                                    nrows = 4;
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q2_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q3_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            {
+                                nth0 = 4; //1;
+                                nth1 = 8; //32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_M:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
+                            } break;
+                        default:
+                            {
+                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                GGML_ABORT("not implemented");
+                            }
+                    };

-                            ggml_metal_kargs_mul_mv args = {
-                                /*.ne00 =*/ ne00,
-                                /*.ne01 =*/ ne01,
-                                /*.ne02 =*/ ne02,
-                                /*.nb00 =*/ nb00,
-                                /*.nb01 =*/ nb01,
-                                /*.nb02 =*/ nb02,
-                                /*.nb03 =*/ nb03,
-                                /*.ne10 =*/ ne10,
-                                /*.ne11 =*/ ne11,
-                                /*.ne12 =*/ ne12,
-                                /*.nb10 =*/ nb10,
-                                /*.nb11 =*/ nb11,
-                                /*.nb12 =*/ nb12,
-                                /*.nb13 =*/ nb13,
-                                /*.ne0  =*/ ne0,
-                                /*.ne1  =*/ ne1,
-                                /*.r2   =*/ r2,
-                                /*.r3   =*/ r3,
-                            };
+                    ggml_metal_kargs_mul_mv args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne10 =*/ ne10,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.nb13 =*/ nb13,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.r2   =*/ r2,
+                        /*.r3   =*/ r3,
+                    };

-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];

-                            if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
-                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
-                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
-                                const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
-                                const int mem_size = 32*sizeof(float);
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q3_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            } else {
-                                const int64_t ny = (ne11 + nrows - 1)/nrows;
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                        }
+                    if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
+                        src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                        src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                        const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                        const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
+                        const int mem_size = 32*sizeof(float);
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q4_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q3_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q5_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q6_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    } else {
+                        const int64_t ny = (ne11 + nrows - 1)/nrows;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                }
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
@ -4372,19 +4372,45 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
    GGML_UNUSED(index);
 }

+static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
+#if defined(GGML_METAL_EMBED_LIBRARY)
+    { "EMBED_LIBRARY", "1" },
+#endif
+#if defined(GGML_METAL_USE_BF16)
+    { "BF16", "1" },
+#endif
+    { nil, nil },
+};
+
+static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
+    return g_ggml_backend_metal_features;
+
+    GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_metal_get_features;
+    }
+
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
 static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
    /* .get_name         = */ ggml_backend_metal_reg_get_name,
    /* .device_count     = */ ggml_backend_metal_reg_device_count,
    /* .device_get       = */ ggml_backend_metal_reg_device_get,
-    /* .get_proc_address = */ NULL,
+    /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
 };

 ggml_backend_reg_t ggml_backend_metal_reg(void) {
    // TODO: make this thread-safe somehow?
    {
        g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
-            /* .iface   = */ ggml_backend_metal_reg_i,
-            /* .context = */ NULL,
+            /* .api_version = */ GGML_BACKEND_API_VERSION,
+            /* .iface       = */ ggml_backend_metal_reg_i,
+            /* .context     = */ NULL,
        };

        g_ggml_backend_metal_device = (struct ggml_backend_device) {
@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {

    return &g_ggml_backend_metal_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm(
    const int im = tgpig.z;

    // if this block is of 64x32 shape or smaller
-    short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;

    // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;

    simdgroup_T8x8     ma[4];
    simdgroup_float8x8 mb[2];
@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm(
    const int i12 = im%args.ne12;
    const int i13 = im/args.ne12;

-    uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
-    short    offset1 = il/nl;
+    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const short    offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0
+        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;

-    device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
    device const float   * y = (device const float   *)(src1
        + args.nb13*i13
        + args.nb12*i12
-        + args.nb11*(r1 * BLOCK_SIZE_N + thread_col)
+        + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
        + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));

    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
        // load data and store to threadgroup memory
        T4x4 temp_a;
        dequantize_func(x, il, temp_a);
+
        threadgroup_barrier(mem_flags::mem_threadgroup);

        #pragma unroll(16)
@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm(
            +                     (tiitg/THREAD_PER_ROW)%8  + (i&7)*8) = temp_a[i/4][i%4];
        }

-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
+        *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);

        il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
+        x  = (il < 2) ? x + (2 + nl - 1)/nl : x;
        y += BLOCK_SIZE_K;

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // load matrices from threadgroup memory and conduct outer products
-        threadgroup T     * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
+        threadgroup const T     * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
+        threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));

        #pragma unroll(4)
-        for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
            #pragma unroll(4)
            for (short i = 0; i < 4; i++) {
                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
            }
+
            simdgroup_barrier(mem_flags::mem_none);
+
            #pragma unroll(2)
            for (short i = 0; i < 2; i++) {
                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
            }

-            lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
-
            #pragma unroll(8)
            for (short i = 0; i < 8; i++){
                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
            }
+
+            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
        }
    }

    if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
        device float * C = (device float *) dst +
-            (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) + \
-            (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
+            (BLOCK_SIZE_M * r0 + 32*(sgitg &  1)) + \
+            (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;

        for (short i = 0; i < 8; i++) {
            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm(
        // block is smaller than 64x32, we should avoid writing data outside of the matrix
        threadgroup_barrier(mem_flags::mem_threadgroup);
        threadgroup float * temp_str = ((threadgroup float *) shmem) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M;
+                                     + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
        for (short i = 0; i < 8; i++) {
            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
        }
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND)
        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
    endforeach()

-    add_library(ggml-musa
-                ${GGML_HEADERS_MUSA}
-                ${GGML_SOURCES_MUSA})
-
-    target_link_libraries(ggml-musa PRIVATE ggml-base)
-    target_include_directories(ggml-musa PRIVATE . ..)
+    ggml_add_backend_library(ggml-musa
+                             ${GGML_HEADERS_MUSA}
+                             ${GGML_SOURCES_MUSA}
+                            )

    # TODO: do not use CUDA definitions for MUSA
    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@ -14,51 +14,51 @@
 #include <vector>

 struct ggml_opt_dataset {
-    struct ggml_context   * ctx;
-    ggml_backend_buffer_t   buf;
-    struct ggml_tensor    * data;
-    struct ggml_tensor    * labels;
+    struct ggml_context   * ctx    = nullptr;
+    ggml_backend_buffer_t   buf    = nullptr;
+    struct ggml_tensor    * data   = nullptr;
+    struct ggml_tensor    * labels = nullptr;

-    int64_t ndata;
-    int64_t ndata_shard;
-    size_t  nbs_data;
-    size_t  nbs_labels;
+    int64_t ndata       = -1;
+    int64_t ndata_shard = -1;
+    size_t  nbs_data    = -1;
+    size_t  nbs_labels  = -1;

    std::vector<int64_t> permutation;
 };

 struct ggml_opt_context {
-    ggml_backend_sched_t    backend_sched;
-    ggml_cgraph           * allocated_graph;
-    ggml_cgraph           * allocated_graph_copy;
-    struct ggml_context   * ctx_static;
-    struct ggml_context   * ctx_static_cpu;
-    struct ggml_context   * ctx_compute;
-    struct ggml_context   * ctx_copy;
-    ggml_backend_buffer_t   buf_static;
-    ggml_backend_buffer_t   buf_static_cpu;
+    ggml_backend_sched_t    backend_sched        = nullptr;
+    ggml_cgraph           * allocated_graph      = nullptr;
+    ggml_cgraph           * allocated_graph_copy = nullptr;
+    struct ggml_context   * ctx_static           = nullptr;
+    struct ggml_context   * ctx_static_cpu       = nullptr;
+    struct ggml_context   * ctx_compute          = nullptr;
+    struct ggml_context   * ctx_copy             = nullptr;
+    ggml_backend_buffer_t   buf_static           = nullptr;
+    ggml_backend_buffer_t   buf_static_cpu       = nullptr;
    std::mt19937            rng;

-    struct ggml_tensor * inputs;
-    struct ggml_tensor * outputs;
-    struct ggml_tensor * labels;
+    struct ggml_tensor * inputs  = nullptr;
+    struct ggml_tensor * outputs = nullptr;
+    struct ggml_tensor * labels  = nullptr;

-    struct ggml_tensor * loss;
-    struct ggml_tensor * pred;
-    struct ggml_tensor * ncorrect;
+    struct ggml_tensor * loss     = nullptr;
+    struct ggml_tensor * pred     = nullptr;
+    struct ggml_tensor * ncorrect = nullptr;

-    struct ggml_cgraph * gf;
-    struct ggml_cgraph * gb_grad;
-    struct ggml_cgraph * gb_opt;
+    struct ggml_cgraph * gf      = nullptr;
+    struct ggml_cgraph * gb_grad = nullptr;
+    struct ggml_cgraph * gb_opt  = nullptr;

-    int64_t iter;
-    int32_t opt_period;
-    int32_t opt_i;
-    bool    loss_per_datapoint;
+    int64_t iter               = 1;
+    int32_t opt_period         = 1;
+    int32_t opt_i              = 0;
+    bool    loss_per_datapoint = false;

-    ggml_opt_get_optimizer_params get_opt_pars;
-    void * get_opt_pars_ud;
-    struct ggml_tensor * adamw_params;
+    ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+    void * get_opt_pars_ud                     = nullptr;
+    struct ggml_tensor * adamw_params          = nullptr;
 };

 struct ggml_opt_result {
@ -67,8 +67,8 @@ struct ggml_opt_result {
    std::vector<int32_t> pred;
    int64_t              ncorrect = 0;

-    bool loss_per_datapoint = false;
-    int64_t opt_period = -1;
+    int64_t opt_period         = -1;
+    bool    loss_per_datapoint = false;
 };

 // ====== Dataset ======
@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
 }

 struct ggml_opt_params ggml_opt_default_params(
-        ggml_backend_sched_t backend_sched,
-        struct ggml_context * ctx_compute,
-        struct ggml_tensor * inputs,
-        struct ggml_tensor * outputs,
-        enum ggml_opt_loss_type loss_type) {
+        ggml_backend_sched_t      backend_sched,
+        struct ggml_context     * ctx_compute,
+        struct ggml_tensor      * inputs,
+        struct ggml_tensor      * outputs,
+        enum ggml_opt_loss_type   loss_type) {
    return {
        /*backend_sched   =*/ backend_sched,
        /*ctx_compute     =*/ ctx_compute,
@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_
    return new_tensor;
 }

-static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) {
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
    std::map<ggml_tensor *, ggml_tensor *> tensor_map;

-    ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
+    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);

-    for (int i = 0; i < graph->n_leafs; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i]));
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
    }
-    for (int i = 0; i < graph->n_nodes; i++) {
-        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i]));
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
    }
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]);
-        const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]);
-        graph->grads[igrad_dst]     = new_graph->grads[igrad_src];
-        graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src];
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+    for (int i = 0; i < src->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+        dst->grads[igrad_dst]     = src->grads[igrad_src];
+        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
    }

-    return new_graph;
+    return dst;
 }

 static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph

 ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
    ggml_opt_context_t result = new struct ggml_opt_context;
-    result->backend_sched        = params.backend_sched;
-    result->allocated_graph      = nullptr;
-    result->allocated_graph_copy = nullptr;
-    result->ctx_compute          = params.ctx_compute;
-    result->ctx_copy             = nullptr;
-    result->inputs               = params.inputs;
-    result->outputs              = params.outputs;
-    result->iter                 = 1;
-    result->opt_period           = params.opt_period;
-    result->opt_i                = 0;
-    result->get_opt_pars         = params.get_opt_pars;
-    result->get_opt_pars_ud      = params.get_opt_pars_ud;
+    result->backend_sched   = params.backend_sched;
+    result->ctx_compute     = params.ctx_compute;
+    result->inputs          = params.inputs;
+    result->outputs         = params.outputs;
+    result->opt_period      = params.opt_period;
+    result->get_opt_pars    = params.get_opt_pars;
+    result->get_opt_pars_ud = params.get_opt_pars_ud;

    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
    GGML_ASSERT(result->opt_period >= 1);
@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {

    switch (params.loss_type) {
        case GGML_OPT_LOSS_TYPE_MEAN: {
-            result->labels = nullptr;
            result->loss = ggml_sum(result->ctx_static, result->outputs);
            ggml_set_name(result->loss, "loss_sum");
            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
            break;
        }
        case GGML_OPT_LOSS_TYPE_SUM: {
-            result->labels = nullptr;
            result->loss = ggml_sum(result->ctx_static, result->outputs);
            ggml_set_name(result->loss, "loss_sum");
            result->loss_per_datapoint = false;
@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
    }

    if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->gb_grad = nullptr;
-        result->gb_opt  = nullptr;
-
        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gf);
-
        return result;
    }

@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
    ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);

    if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
-        result->gb_opt  = nullptr;
-
        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        result->buf_static_cpu = nullptr;
-
-        ggml_opt_alloc_graph(result, result->gb_grad);
        ggml_graph_reset(result->gb_grad);
-
        return result;
    }

@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {

    result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());

-    ggml_opt_alloc_graph(result, result->gb_opt);
    ggml_graph_reset(result->gb_opt);

    return result;
--- a/ggml/src/ggml-rpc/CMakeLists.txt
+++ b/ggml/src/ggml-rpc/CMakeLists.txt
@ -1,10 +1,8 @@
 message(STATUS "Using RPC backend")

-add_library(ggml-rpc
-            ggml-rpc.cpp)
-
-target_link_libraries(ggml-rpc PRIVATE ggml-base)
-target_include_directories(ggml-rpc PRIVATE . ..)
+ggml_add_backend_library(ggml-rpc
+                         ggml-rpc.cpp
+                        )

 if (WIN32)
    target_link_libraries(ggml-rpc PRIVATE ws2_32)
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {

 ggml_backend_reg_t ggml_backend_rpc_reg(void) {
    static struct ggml_backend_reg ggml_backend_rpc_reg = {
-        /* .iface   = */ ggml_backend_rpc_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_rpc_reg_i,
+        /* .context     = */ NULL,
    };

    return &ggml_backend_rpc_reg;
@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {

    return dev;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@ -16,12 +16,10 @@ endif()
 message(STATUS "SYCL found")
 #todo: AOT

-add_library(ggml-sycl
-            ggml-sycl.cpp
-            ../../include/ggml-sycl.h)
-
-target_link_libraries(ggml-sycl PRIVATE ggml-base)
-target_include_directories(ggml-sycl PRIVATE . ..)
+ggml_add_backend_library(ggml-sycl
+                         ggml-sycl.cpp
+                         ../../include/ggml-sycl.h
+                        )

 if (GGML_SYCL_F16)
    if (GGML_SYCL_TARGET STREQUAL "AMD")
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@ -4637,16 +4637,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                dev_ctx->description = prop.get_name();

                ggml_backend_dev_t dev = new ggml_backend_device {
-                    /* .interface = */ ggml_backend_sycl_device_interface,
-                    /* .reg       = */ &reg,
-                    /* .context   = */ dev_ctx
+                    /* .iface       = */ ggml_backend_sycl_device_interface,
+                    /* .reg         = */ &reg,
+                    /* .context     = */ dev_ctx
                };
                ctx->devices.push_back(dev);
            }

            reg = ggml_backend_reg {
-                /* .interface = */ ggml_backend_sycl_reg_interface,
-                /* .context   = */ ctx
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_sycl_reg_interface,
+                /* .context     = */ ctx
            };
        }

@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
    return sycl_backend;
 }

+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED)
 if (Vulkan_FOUND)
    message(STATUS "Vulkan found")

-    add_library(ggml-vulkan
-                ggml-vulkan.cpp
-                ../../include/ggml-vulkan.h
-                )
+    ggml_add_backend_library(ggml-vulkan
+                             ggml-vulkan.cpp
+                             ../../include/ggml-vulkan.h
+                            )

-    target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan)
-    target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})

    # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
    # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -158,6 +158,7 @@ struct vk_device_struct {
    std::string name;
    uint64_t max_memory_allocation_size;
    bool fp16;
+    bool pipeline_robustness;
    vk::Device device;
    uint32_t vendor_id;
    vk_queue compute_queue;
@ -654,7 +655,7 @@ static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
 static std::condition_variable compile_count_cond;

-static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
    GGML_ASSERT(parameter_count > 0);
    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
@ -724,6 +725,15 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        vk::PipelineCreateFlags(),
        pipeline_shader_create_info,
        pipeline->layout);
+
+    vk::PipelineRobustnessCreateInfoEXT rci;
+
+    if (device->pipeline_robustness && disable_robustness) {
+        rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
+        rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
+        compute_pipeline_create_info.setPNext(&rci);
+    }
+
    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;

    {
@ -1261,7 +1271,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();

    std::vector<std::future<void>> compiles;
-    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align) {
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
        {
            // wait until fewer than N compiles are in progress
            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@ -1271,7 +1281,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
            }
            compile_count++;
        }
-        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
    };

    if (device->fp16) {
@ -1370,45 +1380,45 @@ static void ggml_vk_load_shaders(vk_device& device) {
    // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32",  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32",  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);

    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true);

    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);

    // dequant shaders
    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@ -1591,12 +1601,15 @@ static vk_device ggml_vk_get_device(size_t idx) {

        bool fp16_storage = false;
        bool fp16_compute = false;
+        bool pipeline_robustness = false;

        for (const auto& properties : ext_props) {
            if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
                fp16_storage = true;
            } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
                fp16_compute = true;
+            } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
+                pipeline_robustness = true;
            }
        }

@ -1642,10 +1655,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
        vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
        vk11_features.pNext = &vk12_features;

+        VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features;
+        pl_robustness_features.pNext = nullptr;
+        pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT;
+        pl_robustness_features.pipelineRobustness = VK_FALSE;
+
+        if (pipeline_robustness) {
+            vk12_features.pNext = &pl_robustness_features;
+            device_extensions.push_back("VK_EXT_pipeline_robustness");
+        }
+
        vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

        device->fp16 = device->fp16 && vk12_features.shaderFloat16;

+        device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
+
        if (!vk11_features.storageBuffer16BitAccess) {
            std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
            throw std::runtime_error("Unsupported device");
@ -3190,7 +3215,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&

    if (ne01 > max_groups_x) {
        groups_z = 64;
-        groups_x /= groups_z;
+        groups_x = CEIL_DIV(groups_x, groups_z);
    }

    // compute
@ -3767,7 +3792,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte

    if (ne01 > max_groups_x) {
        groups_z = 64;
-        groups_x /= groups_z;
+        groups_x = CEIL_DIV(groups_x, groups_z);
    }

    // compute
@ -6713,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {

 ggml_backend_reg_t ggml_backend_vk_reg() {
    static ggml_backend_reg reg = {
-        /* .iface   = */ ggml_backend_vk_reg_i,
-        /* .context = */ nullptr,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_vk_reg_i,
+        /* .context     = */ nullptr,
    };

    return &reg;
@ -7340,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
    VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
 }
 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
@ -2,6 +2,15 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
 #endif

+#include "types.comp"
+
+#if defined(A_TYPE_PACKED16)
+layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
+#endif
+#if defined(A_TYPE_PACKED32)
+layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
+#endif
+
 #if defined(DATA_A_F32)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
@ -20,6 +29,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d;
+}
 #endif

 #if defined(DATA_A_Q4_1)
@ -29,6 +43,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return vec2(vui & 0xF, vui >> 4) * d + m;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const float m = float(data_a_packed16[a_offset + ib].m);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m;
+}
 #endif

 #if defined(DATA_A_Q5_0)
@ -39,6 +59,14 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
+    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d;
+}
 #endif

 #if defined(DATA_A_Q5_1)
@ -50,6 +78,15 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const float m = float(data_a_packed16[a_offset + ib].m);
+    const uint uint_qh = data_a_packed16[a_offset + ib].qh;
+    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m;
+}
 #endif

 #if defined(DATA_A_Q8_0)
@ -57,6 +94,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const float d = float(data_a[a_offset + ib].d);
    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
+    uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
+    return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d;
+}
 #endif

 #if defined(DATA_A_IQ4_NL)
@ -65,4 +108,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a_packed16[a_offset + ib].d);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d;
+}
 #endif
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;

+    init_iq4nl_shmem();
+
    const uint tid = gl_LocalInvocationID.x % 64;
    const uint il  = tid/32;
    const uint ir  = tid%32;
--- a/Show more
+++ b/Show more