Merge remote-tracking branch 'origin/master' into test/server-add-ci-test
This commit is contained in:
commit
8b96bdaf08
12 changed files with 192 additions and 53 deletions
37
.devops/nix/docker.nix
Normal file
37
.devops/nix/docker.nix
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
{
|
||||||
|
lib,
|
||||||
|
dockerTools,
|
||||||
|
buildEnv,
|
||||||
|
llama-cpp,
|
||||||
|
interactive ? true,
|
||||||
|
coreutils,
|
||||||
|
}:
|
||||||
|
|
||||||
|
# A tar that can be fed into `docker load`:
|
||||||
|
#
|
||||||
|
# $ nix build .#llamaPackages.docker
|
||||||
|
# $ docker load < result
|
||||||
|
|
||||||
|
# For details and variations cf.
|
||||||
|
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
|
||||||
|
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
|
||||||
|
# - https://nixery.dev/
|
||||||
|
|
||||||
|
# Approximate (compressed) sizes, at the time of writing, are:
|
||||||
|
#
|
||||||
|
# .#llamaPackages.docker: 125M;
|
||||||
|
# .#llamaPackagesCuda.docker: 537M;
|
||||||
|
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
|
||||||
|
|
||||||
|
dockerTools.buildLayeredImage {
|
||||||
|
name = llama-cpp.pname;
|
||||||
|
tag = "latest";
|
||||||
|
|
||||||
|
contents =
|
||||||
|
[ llama-cpp ]
|
||||||
|
++ lib.optionals interactive [
|
||||||
|
coreutils
|
||||||
|
dockerTools.binSh
|
||||||
|
dockerTools.caCertificates
|
||||||
|
];
|
||||||
|
}
|
|
@ -12,5 +12,8 @@ lib.makeScope newScope (
|
||||||
self: {
|
self: {
|
||||||
inherit llamaVersion;
|
inherit llamaVersion;
|
||||||
llama-cpp = self.callPackage ./package.nix { };
|
llama-cpp = self.callPackage ./package.nix { };
|
||||||
|
docker = self.callPackage ./docker.nix { };
|
||||||
|
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
||||||
|
sif = self.callPackage ./sif.nix { };
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
27
.devops/nix/sif.nix
Normal file
27
.devops/nix/sif.nix
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
lib,
|
||||||
|
singularity-tools,
|
||||||
|
llama-cpp,
|
||||||
|
bashInteractive,
|
||||||
|
interactive ? false,
|
||||||
|
}:
|
||||||
|
|
||||||
|
let
|
||||||
|
optionalInt = cond: x: if cond then x else 0;
|
||||||
|
in
|
||||||
|
singularity-tools.buildImage rec {
|
||||||
|
inherit (llama-cpp) name;
|
||||||
|
contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
|
||||||
|
|
||||||
|
# These are excessive (but safe) for most variants. Building singularity
|
||||||
|
# images requires superuser privileges, so we build them inside a VM in a
|
||||||
|
# writable image of pre-determined size.
|
||||||
|
#
|
||||||
|
# ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
|
||||||
|
#
|
||||||
|
# Expected image sizes:
|
||||||
|
# - cpu/blas: 150M,
|
||||||
|
# - cuda, all gencodes: 560M,
|
||||||
|
diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
|
||||||
|
memSize = diskSize;
|
||||||
|
}
|
7
.github/workflows/nix-ci-aarch64.yml
vendored
7
.github/workflows/nix-ci-aarch64.yml
vendored
|
@ -19,7 +19,6 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
nix-build-aarch64:
|
nix-build-aarch64:
|
||||||
if: ${{ vars.CACHIX_NAME != '' }}
|
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
@ -37,8 +36,8 @@ jobs:
|
||||||
extra-conf: |
|
extra-conf: |
|
||||||
extra-platforms = aarch64-linux
|
extra-platforms = aarch64-linux
|
||||||
extra-system-features = nixos-test kvm
|
extra-system-features = nixos-test kvm
|
||||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||||
with:
|
with:
|
||||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||||
|
@ -46,7 +45,7 @@ jobs:
|
||||||
uses: cachix/cachix-action@v13
|
uses: cachix/cachix-action@v13
|
||||||
with:
|
with:
|
||||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||||
name: ${{ vars.CACHIX_NAME }}
|
name: llama-cpp
|
||||||
- name: Show all output paths
|
- name: Show all output paths
|
||||||
run: >
|
run: >
|
||||||
nix run github:nix-community/nix-eval-jobs
|
nix run github:nix-community/nix-eval-jobs
|
||||||
|
|
11
.github/workflows/nix-ci.yml
vendored
11
.github/workflows/nix-ci.yml
vendored
|
@ -23,8 +23,8 @@ jobs:
|
||||||
with:
|
with:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
extra-conf: |
|
extra-conf: |
|
||||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||||
with:
|
with:
|
||||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||||
|
@ -37,7 +37,6 @@ jobs:
|
||||||
--flake
|
--flake
|
||||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||||
nix-build:
|
nix-build:
|
||||||
if: ${{ vars.CACHIX_NAME != '' }}
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
|
@ -51,8 +50,8 @@ jobs:
|
||||||
with:
|
with:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
extra-conf: |
|
extra-conf: |
|
||||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||||
with:
|
with:
|
||||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||||
|
@ -60,7 +59,7 @@ jobs:
|
||||||
uses: cachix/cachix-action@v13
|
uses: cachix/cachix-action@v13
|
||||||
with:
|
with:
|
||||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||||
name: ${{ vars.CACHIX_NAME }}
|
name: llama-cpp
|
||||||
- name: Build
|
- name: Build
|
||||||
run: >
|
run: >
|
||||||
nix run github:Mic92/nix-fast-build
|
nix run github:Mic92/nix-fast-build
|
||||||
|
|
|
@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||||
|
|
||||||
### Hot topics
|
### Hot topics
|
||||||
|
|
||||||
|
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||||
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
|
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
|
||||||
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
|
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
|
||||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||||
|
|
|
@ -655,6 +655,8 @@ class OrionModel(Model):
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||||
self.gguf_writer.add_head_count(head_count)
|
self.gguf_writer.add_head_count(head_count)
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||||
|
# note: config provides rms norm but it is actually layer norm
|
||||||
|
# ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
|
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
|
||||||
|
|
||||||
def write_tensors(self):
|
def write_tensors(self):
|
||||||
|
@ -1031,7 +1033,6 @@ class PersimmonModel(Model):
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
|
|
|
@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
|
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
|
||||||
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||||
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
||||||
|
- `--chat-template JINJA_TEMPLATE`: Set a custom Jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template).
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
|
@ -150,7 +151,7 @@ node index.js
|
||||||
|
|
||||||
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
||||||
|
|
||||||
`dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
|
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).
|
||||||
|
|
||||||
`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
|
`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
|
||||||
|
|
||||||
|
@ -208,7 +209,7 @@ node index.js
|
||||||
|
|
||||||
`slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)
|
`slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)
|
||||||
|
|
||||||
`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
|
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)
|
||||||
|
|
||||||
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||||
|
|
||||||
|
@ -241,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||||
|
|
||||||
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||||
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
|
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
|
||||||
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
|
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
|
||||||
- `model`: The path to the model loaded with `-m`
|
- `model`: The path to the model loaded with `-m`
|
||||||
- `prompt`: The provided `prompt`
|
- `prompt`: The provided `prompt`
|
||||||
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
|
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
|
||||||
|
|
|
@ -400,6 +400,16 @@ struct llama_server_context
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void validate_model_chat_template(server_params & sparams) {
|
||||||
|
llama_chat_message chat[] = {{"user", "test"}};
|
||||||
|
std::vector<char> buf(1);
|
||||||
|
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||||
|
if (res < 0) {
|
||||||
|
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||||
|
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void initialize() {
|
void initialize() {
|
||||||
// create slots
|
// create slots
|
||||||
all_slots_are_idle = true;
|
all_slots_are_idle = true;
|
||||||
|
@ -2739,6 +2749,11 @@ int main(int argc, char **argv)
|
||||||
LOG_INFO("model loaded", {});
|
LOG_INFO("model loaded", {});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sparams.chat_template.empty()) { // custom chat template is not supplied
|
||||||
|
// check if the template that comes with the model is supported by us
|
||||||
|
llama.validate_model_chat_template(sparams);
|
||||||
|
}
|
||||||
|
|
||||||
// Middleware for API key validation
|
// Middleware for API key validation
|
||||||
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
|
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
|
||||||
// If API key is not set, skip validation
|
// If API key is not set, skip validation
|
||||||
|
|
95
llama.cpp
95
llama.cpp
|
@ -2791,13 +2791,7 @@ struct llama_model_loader {
|
||||||
|
|
||||||
std::vector<no_init<uint8_t>> read_buf;
|
std::vector<no_init<uint8_t>> read_buf;
|
||||||
|
|
||||||
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
|
||||||
if (!cur) {
|
|
||||||
// some tensors may be allocated in a different context
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (progress_callback) {
|
if (progress_callback) {
|
||||||
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -3722,7 +3716,7 @@ static bool llm_load_tensors(
|
||||||
}
|
}
|
||||||
|
|
||||||
// create one context per buffer type
|
// create one context per buffer type
|
||||||
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
||||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||||
for (auto & it : buft_layer_count) {
|
for (auto & it : buft_layer_count) {
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
|
@ -3860,6 +3854,7 @@ static bool llm_load_tensors(
|
||||||
} else {
|
} else {
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
||||||
ml.n_created--; // artificial tensor
|
ml.n_created--; // artificial tensor
|
||||||
|
ml.size_data += ggml_nbytes(model.output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4059,6 +4054,8 @@ static bool llm_load_tensors(
|
||||||
// output
|
// output
|
||||||
{
|
{
|
||||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||||
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
||||||
|
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4069,13 +4066,22 @@ static bool llm_load_tensors(
|
||||||
auto & layer = model.layers[i];
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
|
||||||
|
|
||||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||||
|
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
||||||
|
|
||||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
||||||
|
|
||||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||||
|
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
|
||||||
|
|
||||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
||||||
|
|
||||||
// AWQ ScaleActivation layer
|
// AWQ ScaleActivation layer
|
||||||
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
||||||
|
@ -4394,6 +4400,9 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
// output
|
// output
|
||||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||||
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
||||||
|
ml.n_created--; // artificial tensor
|
||||||
|
ml.size_data += ggml_nbytes(model.output);
|
||||||
|
|
||||||
const int64_t n_ff = hparams.n_ff;
|
const int64_t n_ff = hparams.n_ff;
|
||||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||||
|
@ -6173,7 +6182,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
attn_norm = llm_build_norm(ctx0, inpL, hparams,
|
attn_norm = llm_build_norm(ctx0, inpL, hparams,
|
||||||
model.layers[il].attn_norm,
|
model.layers[il].attn_norm,
|
||||||
NULL,
|
model.layers[il].attn_norm_b,
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
cb(attn_norm, "attn_norm", il);
|
cb(attn_norm, "attn_norm", il);
|
||||||
|
|
||||||
|
@ -6184,6 +6193,11 @@ struct llm_build_context {
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||||
cb(cur, "wqkv", il);
|
cb(cur, "wqkv", il);
|
||||||
|
|
||||||
|
if (model.layers[il].bqkv){
|
||||||
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||||
|
cb(cur, "bqkv", il);
|
||||||
|
}
|
||||||
|
|
||||||
if (hparams.f_clamp_kqv > 0.0f) {
|
if (hparams.f_clamp_kqv > 0.0f) {
|
||||||
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||||
cb(cur, "wqkv_clamped", il);
|
cb(cur, "wqkv_clamped", il);
|
||||||
|
@ -6200,7 +6214,7 @@ struct llm_build_context {
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
|
||||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
@ -6213,13 +6227,13 @@ struct llm_build_context {
|
||||||
{
|
{
|
||||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||||
model.layers[il].ffn_norm,
|
model.layers[il].ffn_norm,
|
||||||
NULL,
|
model.layers[il].ffn_norm_b,
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
cur = llm_build_ffn(ctx0, cur,
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
model.layers[il].ffn_act,
|
model.layers[il].ffn_act,
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
@ -6236,7 +6250,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
cur = llm_build_norm(ctx0, cur, hparams,
|
cur = llm_build_norm(ctx0, cur, hparams,
|
||||||
model.output_norm,
|
model.output_norm,
|
||||||
NULL,
|
model.output_norm_b,
|
||||||
LLM_NORM, cb, -1);
|
LLM_NORM, cb, -1);
|
||||||
cb(cur, "result_norm", -1);
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
@ -7525,7 +7539,7 @@ struct llm_build_context {
|
||||||
cb(cur, "result_norm", -1);
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
// lm_head
|
// lm_head
|
||||||
cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||||
cb(cur, "result_output", -1);
|
cb(cur, "result_output", -1);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
@ -12174,18 +12188,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
||||||
data_ctx->write(&kv_used, sizeof(kv_used));
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
||||||
|
|
||||||
if (kv_buf_size) {
|
if (kv_buf_size) {
|
||||||
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
|
||||||
|
|
||||||
std::vector<uint8_t> tmp_buf;
|
std::vector<uint8_t> tmp_buf;
|
||||||
for (int il = 0; il < (int) n_layer; ++il) {
|
for (int il = 0; il < (int) n_layer; ++il) {
|
||||||
tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
|
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
||||||
|
tmp_buf.resize(k_size);
|
||||||
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
||||||
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
||||||
|
|
||||||
// v is not contiguous, copy row by row
|
// v is not contiguous, copy row by row
|
||||||
tmp_buf.resize(elt_size*kv_head);
|
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
||||||
|
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
||||||
|
tmp_buf.resize(v_row_size);
|
||||||
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
||||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
|
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
|
||||||
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12287,17 +12302,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||||
if (kv_buf_size) {
|
if (kv_buf_size) {
|
||||||
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
||||||
|
|
||||||
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
|
||||||
|
|
||||||
for (int il = 0; il < (int) n_layer; ++il) {
|
for (int il = 0; il < (int) n_layer; ++il) {
|
||||||
size_t k_size = elt_size*n_embd_k_gqa*kv_head;
|
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
||||||
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
||||||
inp += k_size;
|
inp += k_size;
|
||||||
|
|
||||||
// v is not contiguous, copy row by row
|
// v is not contiguous, copy row by row
|
||||||
size_t v_row_size = elt_size*kv_head;
|
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
||||||
|
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
||||||
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
||||||
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
||||||
inp += v_row_size;
|
inp += v_row_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12759,6 +12773,37 @@ static int32_t llama_chat_apply_template_internal(
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<|assistant|>\n";
|
ss << "<|assistant|>\n";
|
||||||
}
|
}
|
||||||
|
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
|
||||||
|
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
||||||
|
for (auto message : chat) {
|
||||||
|
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
||||||
|
ss << bos << message->role << "\n" << message->content << "</s>\n";
|
||||||
|
}
|
||||||
|
if (add_ass) {
|
||||||
|
ss << "<s>assistant\n";
|
||||||
|
}
|
||||||
|
} else if (tmpl.find("<start_of_turn>") != std::string::npos) {
|
||||||
|
// google/gemma-7b-it
|
||||||
|
std::string system_prompt = "";
|
||||||
|
for (auto message : chat) {
|
||||||
|
std::string role(message->role);
|
||||||
|
if (role == "system") {
|
||||||
|
// there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
|
||||||
|
system_prompt = trim(message->content);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// in gemma, "assistant" is "model"
|
||||||
|
role = role == "assistant" ? "model" : message->role;
|
||||||
|
ss << "<start_of_turn>" << role << "\n";
|
||||||
|
if (!system_prompt.empty() && role != "model") {
|
||||||
|
ss << system_prompt << "\n\n";
|
||||||
|
system_prompt = "";
|
||||||
|
}
|
||||||
|
ss << trim(message->content) << "<end_of_turn>\n";
|
||||||
|
}
|
||||||
|
if (add_ass) {
|
||||||
|
ss << "<start_of_turn>model\n";
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// template not supported
|
// template not supported
|
||||||
return -1;
|
return -1;
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -708,7 +708,7 @@ extern "C" {
|
||||||
|
|
||||||
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||||
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
||||||
/// NOTE: This function only support some known jinja templates. It is not a jinja parser.
|
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||||
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
||||||
/// @param chat Pointer to a list of multiple llama_chat_message
|
/// @param chat Pointer to a list of multiple llama_chat_message
|
||||||
/// @param n_msg Number of llama_chat_message in this chat
|
/// @param n_msg Number of llama_chat_message in this chat
|
||||||
|
|
|
@ -27,12 +27,24 @@ int main(void) {
|
||||||
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
|
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
|
||||||
// bofenghuang/vigogne-2-70b-chat
|
// bofenghuang/vigogne-2-70b-chat
|
||||||
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
||||||
|
// mlabonne/AlphaMonarch-7B
|
||||||
|
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
|
||||||
|
// google/gemma-7b-it
|
||||||
|
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
|
||||||
};
|
};
|
||||||
std::vector<std::string> expected_substr = {
|
std::vector<std::string> expected_output = {
|
||||||
"<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
|
// teknium/OpenHermes-2.5-Mistral-7B
|
||||||
"[/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
|
||||||
"</s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
|
// mistralai/Mistral-7B-Instruct-v0.2
|
||||||
"[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
||||||
|
// TheBloke/FusionNet_34Bx2_MoE-AWQ
|
||||||
|
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
|
||||||
|
// bofenghuang/vigogne-2-70b-chat
|
||||||
|
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
||||||
|
// mlabonne/AlphaMonarch-7B
|
||||||
|
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
|
||||||
|
// google/gemma-7b-it
|
||||||
|
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
|
||||||
};
|
};
|
||||||
std::vector<char> formatted_chat(1024);
|
std::vector<char> formatted_chat(1024);
|
||||||
int32_t res;
|
int32_t res;
|
||||||
|
@ -43,7 +55,7 @@ int main(void) {
|
||||||
|
|
||||||
for (size_t i = 0; i < templates.size(); i++) {
|
for (size_t i = 0; i < templates.size(); i++) {
|
||||||
std::string custom_template = templates[i];
|
std::string custom_template = templates[i];
|
||||||
std::string substr = expected_substr[i];
|
std::string expected = expected_output[i];
|
||||||
formatted_chat.resize(1024);
|
formatted_chat.resize(1024);
|
||||||
res = llama_chat_apply_template(
|
res = llama_chat_apply_template(
|
||||||
nullptr,
|
nullptr,
|
||||||
|
@ -57,8 +69,7 @@ int main(void) {
|
||||||
formatted_chat.resize(res);
|
formatted_chat.resize(res);
|
||||||
std::string output(formatted_chat.data(), formatted_chat.size());
|
std::string output(formatted_chat.data(), formatted_chat.size());
|
||||||
std::cout << output << "\n-------------------------\n";
|
std::cout << output << "\n-------------------------\n";
|
||||||
// expect the "formatted_chat" to contain pre-defined strings
|
assert(output == expected);
|
||||||
assert(output.find(substr) != std::string::npos);
|
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue