Merge branch 'master' into gg/metal-opt-mul-mat-id

Georgi Gerganov, 2024-01-02 16:22:47 +02:00
commit c73e598d1c (GPG key ID: BF970631944C16B7; no known key found for this signature in the database)
19 changed files with 719 additions and 336 deletions


@ -8,12 +8,13 @@
pkgsCuda, pkgsCuda,
... ...
}: }:
lib.optionalAttrs (system == "aarch64-linux") { {
packages = legacyPackages =
let let
caps.jetson-xavier = "7.2"; caps.llamaPackagesXavier = "7.2";
caps.jetson-orin = "8.7"; caps.llamaPackagesOrin = "8.7";
caps.jetson-nano = "5.3"; caps.llamaPackagesTX2 = "6.2";
caps.llamaPackagesNano = "5.3";
pkgsFor = pkgsFor =
cap: cap:
@ -27,6 +28,12 @@
}; };
}; };
in in
builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps; builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
packages = lib.optionalAttrs (system == "aarch64-linux") {
jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
};
}; };
} }


@ -515,7 +515,6 @@ jobs:
- name: Build Xcode project - name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
# freeBSD-latest: # freeBSD-latest:
# runs-on: macos-12 # runs-on: macos-12
# steps: # steps:

.github/workflows/nix-ci.yml (new file)

@ -0,0 +1,112 @@
name: Nix CI
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
if: ${{ vars.CACHIX_NAME != '' }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build-aarch64:
if: ${{ vars.CACHIX_NAME != '' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

.github/workflows/nix-flake-update.yml (new file)

@ -0,0 +1,22 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.GITHUB_TOKEN }}


@ -1,23 +0,0 @@
# Make the flake discoverable on https://flakestry.dev
name: "Publish a flake to flakestry"
on:
push:
tags:
- "v?[0-9]+.[0-9]+.[0-9]+"
- "v?[0-9]+.[0-9]+"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
publish-flake:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"

.github/workflows/nix-publish-flake.yml (new file)

@ -0,0 +1,36 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"


@ -1,2 +1,2 @@
torch>=2.0.0 torch>=2.1.1
transformers>=4.32.0 transformers>=4.32.0


@ -59,7 +59,7 @@ class Model:
from safetensors import safe_open from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else: else:
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True)) ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
with ctx as model_part: with ctx as model_part:
for name in model_part.keys(): for name in model_part.keys():


@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
``` ```
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values. The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.


@ -146,6 +146,27 @@ static std::string get_ftype(int ftype) {
} }
} }
//
// image data
//
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
// //
// clip layers // clip layers
// //
@ -204,16 +225,21 @@ struct clip_vision_model {
}; };
struct clip_ctx { struct clip_ctx {
bool has_text_encoder = false; bool has_text_encoder = false;
bool has_vision_encoder = false; bool has_vision_encoder = false;
bool has_llava_projector = false; bool has_llava_projector = false;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
bool use_gelu = false; bool use_gelu = false;
int32_t ftype = 1; int32_t ftype = 1;
struct ggml_context * ctx;
struct gguf_context * ctx_gguf; struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model // memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = NULL; ggml_backend_buffer_t params_buffer = NULL;
@ -222,7 +248,7 @@ struct clip_ctx {
ggml_allocr * compute_alloc = NULL; ggml_allocr * compute_alloc = NULL;
}; };
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); printf("This gguf file seems to have no vision encoder\n");
return nullptr; return nullptr;
@ -243,13 +269,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
//const int projection_dim = hparams.projection_dim; //const int projection_dim = hparams.projection_dim;
const float eps = hparams.eps; const float eps = hparams.eps;
int batch_size = imgs->size; int batch_size = imgs->size;
if(ctx->has_llava_projector) { if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1); GGML_ASSERT(batch_size == 1);
} }
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(), /*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
@ -272,7 +299,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) { for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) { for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k]; data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
} }
} }
} }
@ -413,7 +440,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
ggml_allocr_alloc(ctx->compute_alloc, patches); ggml_allocr_alloc(ctx->compute_alloc, patches);
if (!ggml_allocr_is_measure(ctx->compute_alloc)) { if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
int* patches_data = (int*)malloc(ggml_nbytes(patches)); int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_positions; i++) { for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1; patches_data[i] = i + 1;
} }
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
@ -561,8 +588,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
new_clip->ctx = ggml_init(params); new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx) { if (!new_clip->ctx_data) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__); fprintf(stderr, "%s: ggml_init() failed\n", __func__);
clip_free(new_clip); clip_free(new_clip);
return nullptr; return nullptr;
@ -579,7 +606,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * t = ggml_get_tensor(meta, name); struct ggml_tensor * t = ggml_get_tensor(meta, name);
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t); struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
ggml_set_name(cur, name); ggml_set_name(cur, name);
} }
@ -588,7 +615,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer); ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name); struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
ggml_allocr_alloc(alloc, cur); ggml_allocr_alloc(alloc, cur);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg); fin.seekg(offset, std::ios::beg);
@ -617,20 +644,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// load vision model // load vision model
auto & vision_model = new_clip->vision_model; auto & vision_model = new_clip->vision_model;
auto & hparams = vision_model.hparams; auto & hparams = vision_model.hparams;
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean)); new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std)); new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
} }
if (verbosity >= 2) { if (verbosity >= 2) {
@ -644,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("v_n_layer %d\n", hparams.n_layer); printf("v_n_layer %d\n", hparams.n_layer);
} }
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD); vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD); vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias")); vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
vision_model.layers.resize(hparams.n_layer); vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) { for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il]; auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight")); layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight")); layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight")); layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight")); layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight")); layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight")); layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight")); layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight")); layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias")); layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias")); layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias")); layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias")); layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias")); layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias")); layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias")); layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias")); layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
} }
} }
@ -680,8 +707,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_gguf = ctx; new_clip->ctx_gguf = ctx;
// measure mem requirement and allocate // measure mem requirement and allocate
{ {
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend); new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
clip_image_f32_batch batch; clip_image_f32_batch batch;
batch.size = 1; batch.size = 1;
@ -697,26 +725,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip; return new_clip;
} }
clip_image_u8 * make_clip_image_u8() { struct clip_image_u8 * clip_image_u8_init() {
auto img = new clip_image_u8(); return new clip_image_u8();
return img;
} }
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; } struct clip_image_f32 * clip_image_f32_init() {
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; } return new clip_image_f32();
}
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
img->nx = nx; img->nx = nx;
img->ny = ny; img->ny = ny;
img->size = nx * ny * 3; img->buf.resize(3 * nx * ny);
img->data = new uint8_t[img->size](); memcpy(img->buf.data(), data, img->buf.size());
memcpy(img->data, data, img->size);
} }
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc; int nx, ny, nc;
auto data = stbi_load(fname, &nx, &ny, &nc, 3); auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname); fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
return false; return false;
@ -728,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc; int nx, ny, nc;
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
fprintf(stderr, "%s: failed to decode image bytes\n", __func__); fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
return false; return false;
@ -740,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
// normalize: x = (x - mean) / std // normalize: x = (x - mean) / std
// TODO: implement bicubic interpolation instead of linear. // TODO: implement bicubic interpolation instead of linear.
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) { bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); printf("This gguf file seems to have no vision encoder\n");
return false; return false;
@ -749,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
if (pad2square && img->nx != img->ny) { if (pad2square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny); int longer_side = std::max(img->nx, img->ny);
temp->nx = longer_side; temp->nx = longer_side;
temp->ny = longer_side; temp->ny = longer_side;
temp->size = 3 * longer_side * longer_side; temp->buf.resize(3 * longer_side * longer_side);
temp->data = new uint8_t[temp->size](); const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
// fill with background color // fill with background color
for (size_t i = 0; i < temp->size; i++) { for (size_t i = 0; i < temp->buf.size(); i++) {
temp->data[i] = bc[i % 3]; temp->buf[i] = bc[i % 3];
} }
// copy from the input image // copy from the input image
@ -768,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
for (int x = 0; x < img->nx; x++) { for (int x = 0; x < img->nx; x++) {
const int i = 3 * (y * img->nx + x); const int i = 3 * (y * img->nx + x);
const int j = 3 * (y * temp->nx + x); const int j = 3 * (y * temp->nx + x);
temp->data[j] = img->data[i]; temp->buf[j] = img->buf[i];
temp->data[j+1] = img->data[i+1]; temp->buf[j+1] = img->buf[i+1];
temp->data[j+2] = img->data[i+2]; temp->buf[j+2] = img->buf[i+2];
} }
} }
} else { } else {
temp->nx = img->nx; temp->nx = img->nx;
temp->ny = img->ny; temp->ny = img->ny;
temp->size = img->size; temp->buf.resize(img->buf.size());
temp->data = new uint8_t[temp->size](); memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
} }
const int nx = temp->nx; const int nx = temp->nx;
@ -789,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
res->nx = nx2; res->nx = nx2;
res->ny = ny2; res->ny = ny2;
res->size = 3 * nx2 * ny2; res->buf.resize(3 * nx2 * ny2);
res->data = new float[res->size]();
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size; const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
@ -821,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int j10 = 3 * (y1 * nx + x0) + c; const int j10 = 3 * (y1 * nx + x0) + c;
const int j11 = 3 * (y1 * nx + x1) + c; const int j11 = 3 * (y1 * nx + x1) + c;
const float v00 = temp->data[j00]; const float v00 = temp->buf[j00];
const float v01 = temp->data[j01]; const float v01 = temp->buf[j01];
const float v10 = temp->data[j10]; const float v10 = temp->buf[j10];
const float v11 = temp->data[j11]; const float v11 = temp->buf[j11];
const float v0 = v00 * (1.0f - dx) + v01 * dx; const float v0 = v00 * (1.0f - dx) + v01 * dx;
const float v1 = v10 * (1.0f - dx) + v11 * dx; const float v1 = v10 * (1.0f - dx) + v11 * dx;
@ -835,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int i = 3 * (y * nx3 + x) + c; const int i = 3 * (y * nx3 + x) + c;
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
} }
} }
} }
@ -845,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
} }
void clip_free(clip_ctx * ctx) { void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx); ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf); gguf_free(ctx->ctx_gguf);
delete ctx; delete ctx;
} }
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); printf("This gguf file seems to have no vision encoder\n");
return false; return false;
@ -862,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
return clip_image_batch_encode(ctx, n_threads, &imgs, vec); return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
} }
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n"); printf("This gguf file seems to have no vision encoder\n");
return false; return false;
@ -906,31 +932,32 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
ggml_type type = GGML_TYPE_Q4_1; ggml_type type = GGML_TYPE_Q4_1;
switch (itype) { switch (itype) {
case 2: case 2:
type = GGML_TYPE_Q4_0; type = GGML_TYPE_Q4_0;
break; break;
case 3: case 3:
type = GGML_TYPE_Q4_1; type = GGML_TYPE_Q4_1;
break; break;
case 6: case 6:
type = GGML_TYPE_Q5_0; type = GGML_TYPE_Q5_0;
break; break;
case 7: case 7:
type = GGML_TYPE_Q5_1; type = GGML_TYPE_Q5_1;
break; break;
case 8: case 8:
type = GGML_TYPE_Q8_0; type = GGML_TYPE_Q8_0;
break; break;
default: default:
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
return false; return false;
}; };
auto ctx_clip = clip_model_load(fname_inp, 2); auto * ctx_clip = clip_model_load(fname_inp, 2);
const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx;
auto ctx_out = gguf_init_empty(); const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx_data;
auto * ctx_out = gguf_init_empty();
gguf_set_kv(ctx_out, ctx_src); gguf_set_kv(ctx_out, ctx_src);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", itype); gguf_set_val_u32(ctx_out, "general.file_type", itype);
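The hunks above replace the raw data/size pointer fields of clip_image_u8 and clip_image_f32 with std::vector buffers and rename the constructors to clip_image_u8_init/clip_image_f32_init, so callers no longer delete[] pixel data by hand. A minimal usage sketch of the refactored lifecycle, assuming only the functions visible in this diff; the path and thread count are placeholders:

```cpp
// Sketch of the new clip image lifecycle (init -> load -> preprocess -> encode -> free),
// using the API as declared in this diff. Not taken verbatim from the repository.
#include <vector>
#include "clip.h"

static bool embed_image(clip_ctx * ctx, const char * path, std::vector<float> & embd) {
    clip_image_u8  * img     = clip_image_u8_init();   // RGB uint8; buf is a std::vector now
    clip_image_f32 * img_res = clip_image_f32_init();  // normalized float image (NHWC)

    bool ok = clip_image_load_from_file(path, img) &&
              clip_image_preprocess(ctx, img, img_res, /*pad2square =*/ true);
    if (ok) {
        embd.resize(clip_embd_nbytes(ctx) / sizeof(float));
        ok = clip_image_encode(ctx, /*n_threads =*/ 4, img_res, embd.data());
    }

    // the structs own their buffers, so freeing the wrappers is all that is needed
    clip_image_f32_free(img_res);
    clip_image_u8_free(img);
    return ok;
}
```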


@ -35,31 +35,14 @@ struct clip_vision_hparams {
float eps; float eps;
}; };
/** load mmproj model */ CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
/** free mmproj model */
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
int clip_n_patches(const struct clip_ctx * ctx);
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
// RGB uint8 image CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
struct clip_image_u8 { CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
int nx;
int ny;
uint8_t * data = NULL;
size_t size;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
float * data = NULL;
size_t size;
};
struct clip_image_u8_batch { struct clip_image_u8_batch {
struct clip_image_u8 * data; struct clip_image_u8 * data;
@ -71,21 +54,22 @@ struct clip_image_f32_batch {
size_t size; size_t size;
}; };
struct clip_image_u8 * make_clip_image_u8(); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
struct clip_image_f32 * make_clip_image_f32(); CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
CLIP_API void clip_image_f32_free(clip_image_f32 * img); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
float * vec);
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
#ifdef __cplusplus #ifdef __cplusplus
} }
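Every public declaration in this header now carries the CLIP_API export annotation. For orientation only, this is the conventional shape such an export macro takes when a library is built as a shared object; the guard names used here (CLIP_SHARED, CLIP_BUILD) are illustrative assumptions, not taken from this diff:

```cpp
// Illustrative export-macro pattern, not the exact text of clip.h.
#ifdef CLIP_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef CLIP_BUILD
#            define CLIP_API __declspec(dllexport)
#        else
#            define CLIP_API __declspec(dllimport)
#        endif
#    else
#        define CLIP_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define CLIP_API
#endif
```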


@ -10,7 +10,7 @@
#include "base64.hpp" #include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
clip_image_f32 * img_res = make_clip_image_f32(); clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) { if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__); fprintf(stderr, "%s: unable to preprocess image\n", __func__);
clip_image_f32_free(img_res); clip_image_f32_free(img_res);
@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
} }
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = make_clip_image_u8(); clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img); clip_image_u8_free(img);
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__); fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);


@ -82,7 +82,7 @@ static inline bool is_base64(uint8_t c)
return (isalnum(c) || (c == '+') || (c == '/')); return (isalnum(c) || (c == '+') || (c == '/'));
} }
static std::vector<uint8_t> base64_decode(std::string const &encoded_string) static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{ {
int i = 0; int i = 0;
int j = 0; int j = 0;
@ -209,10 +209,10 @@ struct slot_image
int32_t id; int32_t id;
bool request_encode_image = false; bool request_encode_image = false;
float* image_embedding = nullptr; float * image_embedding = nullptr;
int32_t image_tokens = 0; int32_t image_tokens = 0;
clip_image_u8 img_data; clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image std::string prefix_prompt; // before of this image
}; };
@ -434,10 +434,12 @@ struct llama_client_slot
generated_token_probs.clear(); generated_token_probs.clear();
for (slot_image &img : images) for (slot_image & img : images)
{ {
free(img.image_embedding); free(img.image_embedding);
delete[] img.img_data.data; if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = ""; img.prefix_prompt = "";
} }
@ -851,24 +853,17 @@ struct llama_server_context
{ {
for (const auto &img : *images_data) for (const auto &img : *images_data)
{ {
std::string data_b64 = img["data"].get<std::string>(); const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
slot_image img_sl; slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size(); img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
int width, height, channels; img_sl.img_data = clip_image_u8_init();
std::vector<uint8_t> image_buffer = base64_decode(data_b64); if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
data_b64.clear(); {
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
if (!data) {
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
return false; return false;
} }
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height); LOG_TEE("slot %i - loaded image\n", slot->id);
img_sl.img_data.nx = width;
img_sl.img_data.ny = height;
img_sl.img_data.size = width * height * 3;
img_sl.img_data.data = new uint8_t[width * height * 3]();
memcpy(img_sl.img_data.data, data, width * height * 3);
stbi_image_free(data);
img_sl.request_encode_image = true; img_sl.request_encode_image = true;
slot->images.push_back(img_sl); slot->images.push_back(img_sl);
} }
@ -1143,8 +1138,8 @@ struct llama_server_context
{ {
continue; continue;
} }
clip_image_f32 img_res; clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true)) if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
{ {
LOG_TEE("Error processing the given image"); LOG_TEE("Error processing the given image");
clip_free(clp_ctx); clip_free(clp_ctx);
@ -1159,11 +1154,12 @@ struct llama_server_context
return false; return false;
} }
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding)) if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
{ {
LOG_TEE("Unable to encode image\n"); LOG_TEE("Unable to encode image\n");
return false; return false;
} }
clip_image_f32_free(img_res);
img.request_encode_image = false; img.request_encode_image = false;
} }
@ -2020,6 +2016,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n"); printf(" --log-disable disables logging to a file.\n");
printf("\n"); printf("\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf("\n");
} }
static void server_params_parse(int argc, char **argv, server_params &sparams, static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -2383,6 +2383,49 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
log_set_target(stdout); log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {}); LOG_INFO("logging to file is disabled.", {});
} }
else if (arg == "--override-kv")
{
if (++i >= argc) {
invalid_param = true;
break;
}
char * sep = strchr(argv[i], '=');
if (sep == nullptr || sep - argv[i] >= 128) {
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
params.kv_overrides.push_back(kvo);
}
else else
{ {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -2390,6 +2433,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
exit(1); exit(1);
} }
} }
if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(llama_model_kv_override());
params.kv_overrides.back().key[0] = 0;
}
if (invalid_param) if (invalid_param)
{ {
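The new --override-kv flag parses KEY=TYPE:VALUE strings into llama_model_kv_override entries and terminates the list with an empty key, as the last hunk shows. A small sketch of building the same list programmatically; handing it to the model loader (e.g. through llama_model_params.kv_overrides) is assumed here rather than shown in this diff:

```cpp
// Sketch mirroring the --override-kv parsing above: one typed override entry
// followed by the empty-key sentinel that marks the end of the list.
#include <cstring>
#include <vector>
#include "llama.h"

static std::vector<llama_model_kv_override> make_overrides() {
    std::vector<llama_model_kv_override> overrides;

    llama_model_kv_override kvo;
    std::strncpy(kvo.key, "tokenizer.ggml.add_bos_token", sizeof(kvo.key) - 1);
    kvo.key[sizeof(kvo.key) - 1] = 0;
    kvo.tag        = LLAMA_KV_OVERRIDE_BOOL;   // int, float, or bool, as in the parser
    kvo.bool_value = false;
    overrides.push_back(kvo);

    // sentinel: an empty key terminates the array
    overrides.emplace_back();
    overrides.back().key[0] = 0;

    return overrides;
}
```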

flake.lock (generated)

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1703559957, "lastModified": 1703637592,
"narHash": "sha256-x9PUuMEPGUOMB51zNxrDr2QoHbYWlCS2xhFedm9MC5Q=", "narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "75dd68c36f458c6593c5bbb48abfd3e59bfed380", "rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
"type": "github" "type": "github"
}, },
"original": { "original": {


@ -6,6 +6,29 @@
flake-parts.url = "github:hercules-ci/flake-parts"; flake-parts.url = "github:hercules-ci/flake-parts";
}; };
# Optional binary cache
nixConfig = {
extra-substituters = [
# Populated by the CI in ggerganov/llama.cpp
"https://llama-cpp.cachix.org"
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
# This lets one skip building e.g. the CUDA-enabled openmpi.
# TODO: Replace once nix-community obtains an official one.
"https://cuda-maintainers.cachix.org"
];
# Verify these are the same keys as published on
# - https://app.cachix.org/cache/llama-cpp
# - https://app.cachix.org/cache/cuda-maintainers
extra-trusted-public-keys = [
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
];
};
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl: # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
# #
# ```bash # ```bash
@ -74,26 +97,48 @@
{ {
config, config,
lib, lib,
system,
pkgs, pkgs,
pkgsCuda, pkgsCuda,
pkgsRocm, pkgsRocm,
... ...
}: }:
{ {
# Unlike `.#packages`, legacyPackages may contain values of
# arbitrary types (including nested attrsets) and may even throw
# exceptions. This attribute isn't recursed into by `nix flake
# show` either.
#
# You can add arbitrary scripts to `.devops/nix/scope.nix` and
# access them as `nix build .#llamaPackages.${scriptName}` using
# the same path you would with an overlay.
legacyPackages = {
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
};
# We don't use the overlay here so as to avoid making too many instances of nixpkgs, # We don't use the overlay here so as to avoid making too many instances of nixpkgs,
# cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs # cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
packages = packages =
{ {
default = (pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp; default = config.legacyPackages.llamaPackages.llama-cpp;
} }
// lib.optionalAttrs pkgs.stdenv.isLinux { // lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; }; opencl = config.packages.default.override { useOpenCL = true; };
cuda = (pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp; cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
rocm = (pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cpu = config.packages.default.override { useMpi = true; };
mpi-cuda = config.packages.default.override { useMpi = true; }; mpi-cuda = config.packages.default.override { useMpi = true; };
}
// lib.optionalAttrs (system == "x86_64-linux") {
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
}; };
# Packages exposed in `.#checks` will be built by the CI and by
# `nix flake check`. Currently we expose all packages, but we could
# make more granular choices
checks = config.packages;
}; };
}; };
} }


@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
#if !defined(__ARM_FEATURE_DOTPROD) #if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
} }
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif #endif
#endif #endif
@ -2481,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
// dot product into int32x4_t // dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
@ -2769,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
// dot product into int32x4_t // dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
@ -2936,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
} }
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -3228,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
} }
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@ -3483,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
} }
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -3598,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
// We use this macro instead of a function call because for some reason // We use this macro instead of a function call because for some reason
// the code runs 2-3% slower, even if the function is declared inline // the code runs 2-3% slower, even if the function is declared inline
#define MULTIPLY_ACCUM_WITH_SCALE(index)\ #define MULTIPLY_ACCUM_WITH_SCALE(index)\
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
@ -3973,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
sum += d * (isum1 + isum2); sum += d * (isum1 + isum2);
} }
@ -4256,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
scale += 4; scale += 4;
@ -4273,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
scale += 4; scale += 4;
@ -4757,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
sum += d * isum; sum += d * isum;
@ -5109,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi1 += vaddvq_s32(p1) * scales[2*j+0]; sumi1 += vaddvq_s32(p1) * scales[2*j+0];
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi2 += vaddvq_s32(p2) * scales[2*j+1]; sumi2 += vaddvq_s32(p2) * scales[2*j+1];
} }
@ -5449,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
sumf += d * (sumi1 + sumi2); sumf += d * (sumi1 + sumi2);
@ -5722,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
} }
sumf += d * sumi - dmin * sumi_mins; sumf += d * sumi - dmin * sumi_mins;
@ -6112,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
} }
@ -6399,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4; scale += 4;
@ -6426,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4; scale += 4;
} }
//sum += isum * d_all * y[i].d; //sum += isum * d_all * y[i].d;
@ -6816,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
sum += isum * d_all * y[i].d; sum += isum * d_all * y[i].d;
View file
@ -46,6 +46,8 @@ class Keys:
HEAD_COUNT_KV = "{arch}.attention.head_count_kv" HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
CLAMP_KQV = "{arch}.attention.clamp_kqv" CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_LENGTH = "{arch}.attention.key_length"
VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
View file
@ -333,6 +333,12 @@ class GGUFWriter:
def add_head_count_kv(self, count: int) -> None: def add_head_count_kv(self, count: int) -> None:
self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
def add_key_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)
def add_value_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
def add_max_alibi_bias(self, bias: float) -> None: def add_max_alibi_bias(self, bias: float) -> None:
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
271
llama.cpp
View file
@ -245,6 +245,8 @@ enum llm_kv {
LLM_KV_ATTENTION_HEAD_COUNT_KV, LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS, LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV, LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_KEY_LENGTH,
LLM_KV_ATTENTION_VALUE_LENGTH,
LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
@ -297,6 +299,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
@ -1284,6 +1288,8 @@ struct llama_hparams {
uint32_t n_head_kv; uint32_t n_head_kv;
uint32_t n_layer; uint32_t n_layer;
uint32_t n_rot; uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_ff; uint32_t n_ff;
uint32_t n_expert = 0; uint32_t n_expert = 0;
uint32_t n_expert_used = 0; uint32_t n_expert_used = 0;
@ -1310,6 +1316,8 @@ struct llama_hparams {
if (this->n_head_kv != other.n_head_kv) return true; if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true; if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true; if (this->n_rot != other.n_rot) return true;
if (this->n_embd_head_k != other.n_embd_head_k) return true;
if (this->n_embd_head_v != other.n_embd_head_v) return true;
if (this->n_ff != other.n_ff) return true; if (this->n_ff != other.n_ff) return true;
if (this->n_expert != other.n_expert) return true; if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true; if (this->n_expert_used != other.n_expert_used) return true;
@ -1331,12 +1339,12 @@ struct llama_hparams {
return n_head/n_head_kv; return n_head/n_head_kv;
} }
uint32_t n_embd_head() const { uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
return n_embd/n_head; return n_embd_head_k * n_head_kv;
} }
uint32_t n_embd_gqa() const { uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
return n_embd/n_gqa(); return n_embd_head_v * n_head_kv;
} }
}; };
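The new n_embd_k_gqa()/n_embd_v_gqa() accessors generalize the old n_embd_gqa() to models whose key/value head sizes differ from n_embd/n_head. A worked example with assumed GQA-style hyperparameters (illustrative values only, not taken from any particular model) is sketched below.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Assumed GQA-style hyperparameters (illustrative only).
    const uint32_t n_embd    = 8192;
    const uint32_t n_head    = 64;
    const uint32_t n_head_kv = 8;

    // Defaults used when the optional key_length/value_length KVs are absent.
    const uint32_t n_embd_head_k = n_embd / n_head;           // 128
    const uint32_t n_embd_head_v = n_embd / n_head;           // 128

    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;  // 1024
    const uint32_t n_embd_v_gqa = n_embd_head_v * n_head_kv;  // 1024

    // With equal head sizes this reduces to the old n_embd_gqa() = n_embd / n_gqa().
    assert(n_embd_k_gqa == n_embd / (n_head / n_head_kv));

    printf("K row per token: %u elements, V row per token: %u elements\n",
           (unsigned) n_embd_k_gqa, (unsigned) n_embd_v_gqa);
    return 0;
}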
@ -1645,8 +1653,9 @@ static bool llama_kv_cache_init(
uint32_t n_ctx, uint32_t n_ctx,
int n_gpu_layers, int n_gpu_layers,
bool offload) { bool offload) {
const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_layer = hparams.n_layer; const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t n_layer = hparams.n_layer;
cache.has_shift = false; cache.has_shift = false;
@ -1677,8 +1686,8 @@ static bool llama_kv_cache_init(
const int i_gpu_start = (int) n_layer - n_gpu_layers; const int i_gpu_start = (int) n_layer - n_gpu_layers;
for (int i = 0; i < (int) n_layer; i++) { for (int i = 0; i < (int) n_layer; i++) {
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx); ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i); ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k); cache.k_l.push_back(k);
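With K and V sized independently, each layer's cache tensors hold n_embd_k_gqa*n_ctx and n_embd_v_gqa*n_ctx cells respectively. A back-of-the-envelope sketch of the resulting footprint, under assumed settings (32 layers, 4096-token context, 1024-wide K/V rows, f16 cells), is shown below.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Assumed settings (illustrative only): 32 layers, 4096-token context,
    // n_embd_k_gqa = n_embd_v_gqa = 1024, f16 cache cells (2 bytes each).
    const uint64_t n_layer      = 32;
    const uint64_t n_ctx        = 4096;
    const uint64_t n_embd_k_gqa = 1024;
    const uint64_t n_embd_v_gqa = 1024;
    const uint64_t cell_bytes   = 2;

    const uint64_t total = n_layer * n_ctx * (n_embd_k_gqa + n_embd_v_gqa) * cell_bytes;
    printf("KV cache: %.1f MiB\n", total / (1024.0 * 1024.0)); // ~512 MiB
    return 0;
}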
@ -2672,6 +2681,12 @@ static void llm_load_hparams(
// gpt-j n_rot = rotary_dim // gpt-j n_rot = rotary_dim
} }
hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
// arch-specific KVs // arch-specific KVs
switch (model.arch) { switch (model.arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
@ -3082,8 +3097,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
@ -3173,10 +3192,11 @@ static bool llm_load_tensors(
// create tensors for the weights // create tensors for the weights
{ {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_layer = hparams.n_layer; const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const int64_t n_vocab = hparams.n_vocab; const int64_t n_layer = hparams.n_layer;
const int64_t n_vocab = hparams.n_vocab;
const auto tn = LLM_TN(model.arch); const auto tn = LLM_TN(model.arch);
switch (model.arch) { switch (model.arch) {
@ -3202,7 +3222,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3270,7 +3293,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3318,7 +3344,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3368,7 +3397,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3420,7 +3452,11 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
@ -3469,7 +3505,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3520,7 +3559,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3567,7 +3609,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3665,7 +3710,10 @@ static bool llm_load_tensors(
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3714,7 +3762,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3761,7 +3812,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -4000,8 +4054,8 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL; return inpL;
} }
// Persimmon: n_rot = n_embd_head/2 // Persimmon: n_rot = n_embd_head_k/2
// Other: n_rot = n_embd_head // Other: n_rot = n_embd_head_k
static void llm_build_k_shift( static void llm_build_k_shift(
struct ggml_context * ctx, struct ggml_context * ctx,
const llama_hparams & hparams, const llama_hparams & hparams,
@ -4014,17 +4068,17 @@ static void llm_build_k_shift(
float freq_base, float freq_base,
float freq_scale, float freq_scale,
const llm_build_cb & cb) { const llm_build_cb & cb) {
const int64_t n_layer = hparams.n_layer; const int64_t n_layer = hparams.n_layer;
const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
const float ext_factor = cparams.yarn_ext_factor; const float ext_factor = cparams.yarn_ext_factor;
const float attn_factor = cparams.yarn_attn_factor; const float attn_factor = cparams.yarn_attn_factor;
const float beta_fast = cparams.yarn_beta_fast; const float beta_fast = cparams.yarn_beta_fast;
const float beta_slow = cparams.yarn_beta_slow; const float beta_slow = cparams.yarn_beta_slow;
GGML_ASSERT(n_embd_head % n_rot == 0); GGML_ASSERT(n_embd_head_k % n_rot == 0);
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
cb(K_shift, "K_shift", -1); cb(K_shift, "K_shift", -1);
@ -4042,9 +4096,9 @@ static void llm_build_k_shift(
// we rotate only the first n_rot dimensions // we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx, ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k_l[il], ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_head_kv, n_ctx, n_embd_head_k, n_head_kv, n_ctx,
ggml_row_size(kv.k_l[il]->type, n_embd_head), ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
ggml_row_size(kv.k_l[il]->type, n_embd_gqa), ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
0), 0),
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
@ -4065,18 +4119,19 @@ static void llm_build_kv_store(
int32_t kv_head, int32_t kv_head,
const llm_build_cb & cb, const llm_build_cb & cb,
int64_t il) { int64_t il) {
const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
// compute the transposed [n_tokens, n_embd] V matrix // compute the transposed [n_tokens, n_embd] V matrix
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
cb(v_cur_t, "v_cur_t", il); cb(v_cur_t, "v_cur_t", il);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
(ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head); (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il); cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
( n_ctx)*ggml_element_size(kv.v_l[il]), ( n_ctx)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il])); (kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il); cb(v_cache_view, "v_cache_view", il);
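The two views above reflect the cache layouts: K keeps one contiguous row of n_embd_k_gqa values per cached position, while V is stored transposed so that each value dimension spans n_ctx positions contiguously (hence the n_ctx-element row stride). The hypothetical index helpers below (not part of llama.cpp) spell out the per-layer element offsets implied by those strides.

#include <stddef.h>

// Hypothetical index helpers (illustrative only) giving the element offset of cached
// position `pos` and head dimension `dim` within one layer's cache tensor:
//   K: row-major, one n_embd_k_gqa-wide row per cached position
//   V: transposed, one n_ctx-wide row per value dimension
static size_t k_cache_index(size_t pos, size_t k_dim, size_t n_embd_k_gqa) {
    return pos * n_embd_k_gqa + k_dim;
}

static size_t v_cache_index(size_t pos, size_t v_dim, size_t n_ctx) {
    return v_dim * n_ctx + pos;
}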
@ -4226,20 +4281,20 @@ static struct ggml_tensor * llm_build_kqv(
float kq_scale, float kq_scale,
const llm_build_cb & cb, const llm_build_cb & cb,
int il) { int il) {
const int64_t n_embd = hparams.n_embd; const int64_t n_head = hparams.n_head;
const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head_v = hparams.n_embd_head_v;
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
cb(q, "q", il); cb(q, "q", il);
struct ggml_tensor * k = struct ggml_tensor * k =
ggml_view_3d(ctx, kv.k_l[il], ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_kv, n_head_kv, n_embd_head_k, n_kv, n_head_kv,
ggml_row_size(kv.k_l[il]->type, n_embd_gqa), ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv.k_l[il]->type, n_embd_head), ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
0); 0);
cb(k, "k", il); cb(k, "k", il);
@ -4278,9 +4333,9 @@ static struct ggml_tensor * llm_build_kqv(
// split cached v into n_head heads // split cached v into n_head heads
struct ggml_tensor * v = struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il], ggml_view_3d(ctx, kv.v_l[il],
n_kv, n_embd_head, n_head_kv, n_kv, n_embd_head_v, n_head_kv,
ggml_element_size(kv.v_l[il])*n_ctx, ggml_element_size(kv.v_l[il])*n_ctx,
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head, ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
0); 0);
cb(v, "v", il); cb(v, "v", il);
@ -4290,7 +4345,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il); cb(kqv_merged, "kqv_merged", il);
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il); cb(cur, "kqv_merged_cont", il);
cur = ggml_mul_mat(ctx, wo, cur); cur = ggml_mul_mat(ctx, wo, cur);
@ -4317,8 +4372,10 @@ struct llm_build_context {
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head; const int64_t n_head;
const int64_t n_head_kv; const int64_t n_head_kv;
const int64_t n_embd_head; const int64_t n_embd_head_k;
const int64_t n_embd_gqa; const int64_t n_embd_k_gqa;
const int64_t n_embd_head_v;
const int64_t n_embd_v_gqa;
const int64_t n_expert; const int64_t n_expert;
const int64_t n_expert_used; const int64_t n_expert_used;
@ -4360,8 +4417,10 @@ struct llm_build_context {
n_ctx (cparams.n_ctx), n_ctx (cparams.n_ctx),
n_head (hparams.n_head), n_head (hparams.n_head),
n_head_kv (hparams.n_head_kv), n_head_kv (hparams.n_head_kv),
n_embd_head (hparams.n_embd_head()), n_embd_head_k (hparams.n_embd_head_k),
n_embd_gqa (hparams.n_embd_gqa()), n_embd_k_gqa (hparams.n_embd_k_gqa()),
n_embd_head_v (hparams.n_embd_head_v),
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert), n_expert (hparams.n_expert),
n_expert_used (hparams.n_expert_used), n_expert_used (hparams.n_expert_used),
freq_base (cparams.rope_freq_base), freq_base (cparams.rope_freq_base),
@ -4404,6 +4463,8 @@ struct llm_build_context {
struct ggml_cgraph * build_llama() { struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot); GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; struct ggml_tensor * cur;
@ -4588,6 +4649,9 @@ struct llm_build_context {
struct ggml_cgraph * build_baichuan() { struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -4705,6 +4769,11 @@ struct llm_build_context {
struct ggml_cgraph * build_falcon() { struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -4824,6 +4893,11 @@ struct llm_build_context {
struct ggml_cgraph * build_starcoder() { struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * pos; struct ggml_tensor * pos;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -4920,7 +4994,12 @@ struct llm_build_context {
struct ggml_cgraph * build_persimmon() { struct ggml_cgraph * build_persimmon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_rot = n_embd_head / 2; const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
const int64_t n_rot = n_embd_head_k / 2;
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5129,6 +5208,11 @@ struct llm_build_context {
struct ggml_cgraph * build_refact() { struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5217,6 +5301,11 @@ struct llm_build_context {
struct ggml_cgraph * build_bloom() { struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5308,6 +5397,11 @@ struct llm_build_context {
struct ggml_cgraph * build_mpt() { struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5403,6 +5497,9 @@ struct llm_build_context {
struct ggml_cgraph * build_stablelm() { struct ggml_cgraph * build_stablelm() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gf = ggml_new_graph(ctx0);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5513,6 +5610,9 @@ struct llm_build_context {
struct ggml_cgraph * build_qwen() { struct ggml_cgraph * build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5624,6 +5724,11 @@ struct llm_build_context {
struct ggml_cgraph * build_phi2() { struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * attn_norm_output; struct ggml_tensor * attn_norm_output;
struct ggml_tensor * ffn_output; struct ggml_tensor * ffn_output;
@ -5736,6 +5841,9 @@ struct llm_build_context {
struct ggml_cgraph * build_plamo() { struct ggml_cgraph * build_plamo() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gf = ggml_new_graph(ctx0);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5840,6 +5948,11 @@ struct llm_build_context {
struct ggml_cgraph * build_gpt2() { struct ggml_cgraph * build_gpt2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * pos; struct ggml_tensor * pos;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -9627,8 +9740,8 @@ struct llama_context * llama_new_context_with_model(
const ggml_type type_k = params.type_k; const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v; const ggml_type type_v = params.type_v;
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
// reserve memory for context buffers // reserve memory for context buffers
if (!hparams.vocab_only) { if (!hparams.vocab_only) {
@ -10173,9 +10286,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
const auto & hparams = ctx->model.hparams; const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams; const auto & cparams = ctx->cparams;
const auto n_layer = hparams.n_layer; const auto n_layer = hparams.n_layer;
const auto n_embd = hparams.n_embd_gqa(); const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
const auto n_ctx = cparams.n_ctx; const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
const auto n_ctx = cparams.n_ctx;
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
const uint32_t kv_head = kv_self.head; const uint32_t kv_head = kv_self.head;
@ -10197,15 +10311,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
std::vector<struct ggml_tensor *> vout2d(n_layer); std::vector<struct ggml_tensor *> vout2d(n_layer);
for (int il = 0; il < (int) n_layer; ++il) { for (int il = 0; il < (int) n_layer; ++il) {
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
n_embd, kv_head, n_embd_k_gqa, kv_head,
elt_size*n_embd, 0); elt_size*n_embd_k_gqa, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
kv_head, n_embd, kv_head, n_embd_v_gqa,
elt_size*n_ctx, 0); elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
@ -10312,9 +10426,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const auto & hparams = ctx->model.hparams; const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams; const auto & cparams = ctx->cparams;
const int n_layer = hparams.n_layer; const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd_gqa(); const int n_embd_k_gqa = hparams.n_embd_k_gqa();
const int n_ctx = cparams.n_ctx; const int n_embd_v_gqa = hparams.n_embd_v_gqa();
const int n_ctx = cparams.n_ctx;
size_t kv_buf_size; size_t kv_buf_size;
uint32_t kv_head; uint32_t kv_head;
@ -10338,15 +10453,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
std::vector<struct ggml_tensor *> vin2d(n_layer); std::vector<struct ggml_tensor *> vin2d(n_layer);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
n_embd, kv_head, n_embd_k_gqa, kv_head,
elt_size*n_embd, 0); elt_size*n_embd_k_gqa, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
kv_head, n_embd, kv_head, n_embd_v_gqa,
elt_size*n_ctx, 0); elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));