Merge remote-tracking branch 'origin/master' into server_branch

This commit is contained in:
pudepiedj 2024-03-03 06:56:31 +00:00
commit 265741aa0f
27 changed files with 1121 additions and 591 deletions

View file

@ -10,6 +10,8 @@ on:
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*'] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
schedule:
- cron: '00 0 * * *'
jobs: jobs:
server: server:
@ -70,14 +72,15 @@ jobs:
run: | run: |
pip install -r examples/server/tests/requirements.txt pip install -r examples/server/tests/requirements.txt
- name: Download models
id: download_models
run: |
cd examples/server/tests
../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
- name: Tests - name: Tests
id: server_integration_test id: server_integration_tests
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh PORT=8888 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: github.event.schedule != ''
run: |
cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow

View file

@ -786,7 +786,7 @@ And after 4.45 hours, you will have the final perplexity.
### Interactive mode ### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command Here is an example of a few-shot interaction, invoked with the command
@ -850,7 +850,7 @@ Sample run:
``` ```
== Running in interactive mode. == == Running in interactive mode. ==
- Press Ctrl+C to interject at any time. - Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMa. - Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'. - If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request. Below is an instruction that describes a task. Write a response that appropriately completes the request.

View file

@ -8,9 +8,10 @@ import json
import os import os
import re import re
import sys import sys
from abc import ABC, abstractmethod
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
import numpy as np import numpy as np
import torch import torch
@ -35,8 +36,11 @@ class SentencePieceTokenTypes(IntEnum):
UNUSED = 5 UNUSED = 5
BYTE = 6 BYTE = 6
AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
class Model:
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
self.dir_model = dir_model self.dir_model = dir_model
self.ftype = ftype self.ftype = ftype
@ -47,10 +51,14 @@ class Model:
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names() self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@property
@abstractmethod
def model_arch(self) -> gguf.MODEL_ARCH:
pass
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in self.hparams), None) key = next((k for k in keys if k in self.hparams), None)
if key is not None: if key is not None:
@ -176,55 +184,21 @@ class Model:
with open(dir_model / "config.json", "r", encoding="utf-8") as f: with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f) return json.load(f)
@staticmethod @classmethod
def from_model_architecture(model_architecture): def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
if model_architecture == "GPTNeoXForCausalLM": assert names
return GPTNeoXModel def func(modelcls: type[Model]):
if model_architecture == "BloomForCausalLM": for name in names:
return BloomModel cls._model_classes[name] = modelcls
if model_architecture == "MPTForCausalLM": return modelcls
return MPTModel return func
if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return BaichuanModel @classmethod
if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): def from_model_architecture(cls, arch):
return FalconModel try:
if model_architecture == "GPTBigCodeForCausalLM": return cls._model_classes[arch]
return StarCoderModel except KeyError:
if model_architecture == "GPTRefactForCausalLM": raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
return RefactModel
if model_architecture == "PersimmonForCausalLM":
return PersimmonModel
if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel
if model_architecture == "QWenLMHeadModel":
return QwenModel
if model_architecture == "Qwen2ForCausalLM":
return Model
if model_architecture == "MixtralForCausalLM":
return MixtralModel
if model_architecture == "GPT2LMHeadModel":
return GPT2Model
if model_architecture == "PhiForCausalLM":
return Phi2Model
if model_architecture == "PlamoForCausalLM":
return PlamoModel
if model_architecture == "CodeShellForCausalLM":
return CodeShellModel
if model_architecture == "OrionForCausalLM":
return OrionModel
if model_architecture == "InternLM2ForCausalLM":
return InternLM2Model
if model_architecture == "MiniCPMForCausalLM":
return MiniCPMModel
if model_architecture == "BertModel":
return BertModel
if model_architecture == "NomicBertModel":
return NomicBertModel
if model_architecture == "GemmaForCausalLM":
return GemmaModel
if model_architecture == "Starcoder2ForCausalLM":
return Model
return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
return Model.count_model_parts(self.dir_model, ".safetensors") > 0 return Model.count_model_parts(self.dir_model, ".safetensors") > 0
@ -239,57 +213,6 @@ class Model:
return ("pytorch_model.bin",) return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
def _get_model_architecture(self) -> gguf.MODEL_ARCH:
arch = self.hparams["architectures"][0]
if arch == "GPTNeoXForCausalLM":
return gguf.MODEL_ARCH.GPTNEOX
if arch == "BloomForCausalLM":
return gguf.MODEL_ARCH.BLOOM
if arch == "MPTForCausalLM":
return gguf.MODEL_ARCH.MPT
if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return gguf.MODEL_ARCH.BAICHUAN
if arch in ("FalconForCausalLM", "RWForCausalLM"):
return gguf.MODEL_ARCH.FALCON
if arch == "GPTBigCodeForCausalLM":
return gguf.MODEL_ARCH.STARCODER
if arch == "GPTRefactForCausalLM":
return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON
if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN
if arch == "Qwen2ForCausalLM":
return gguf.MODEL_ARCH.QWEN2
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "GPT2LMHeadModel":
return gguf.MODEL_ARCH.GPT2
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2
if arch == "PlamoForCausalLM":
return gguf.MODEL_ARCH.PLAMO
if arch == "CodeShellForCausalLM":
return gguf.MODEL_ARCH.CODESHELL
if arch == "OrionForCausalLM":
return gguf.MODEL_ARCH.ORION
if arch == "InternLM2ForCausalLM":
return gguf.MODEL_ARCH.INTERNLM2
if arch == "MiniCPMForCausalLM":
return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel":
return gguf.MODEL_ARCH.BERT
if arch == "NomicBertModel":
return gguf.MODEL_ARCH.NOMIC_BERT
if arch == "GemmaForCausalLM":
return gguf.MODEL_ARCH.GEMMA
if arch == "Starcoder2ForCausalLM":
return gguf.MODEL_ARCH.STARCODER2
raise NotImplementedError(f'Architecture "{arch}" not supported!')
def _set_vocab_gpt2(self): def _set_vocab_gpt2(self):
dir_model = self.dir_model dir_model = self.dir_model
hparams = self.hparams hparams = self.hparams
@ -457,7 +380,10 @@ class Model:
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model): class GPTNeoXModel(Model):
model_arch = gguf.MODEL_ARCH.GPTNEOX
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] block_count = self.hparams["num_hidden_layers"]
@ -474,7 +400,10 @@ class GPTNeoXModel(Model):
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
@Model.register("BloomForCausalLM")
class BloomModel(Model): class BloomModel(Model):
model_arch = gguf.MODEL_ARCH.BLOOM
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name("Bloom") self.gguf_writer.add_name("Bloom")
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
@ -566,7 +495,10 @@ class BloomModel(Model):
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
@Model.register("MPTForCausalLM")
class MPTModel(Model): class MPTModel(Model):
model_arch = gguf.MODEL_ARCH.MPT
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["n_layers"] block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
@ -629,7 +561,10 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("OrionForCausalLM")
class OrionModel(Model): class OrionModel(Model):
model_arch = gguf.MODEL_ARCH.ORION
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@ -708,7 +643,10 @@ class OrionModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
class BaichuanModel(Model): class BaichuanModel(Model):
model_arch = gguf.MODEL_ARCH.BAICHUAN
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@ -823,7 +761,10 @@ class BaichuanModel(Model):
return weights[r * n_part:r * n_part + r, ...] return weights[r * n_part:r * n_part + r, ...]
@Model.register("FalconForCausalLM", "RWForCausalLM")
class FalconModel(Model): class FalconModel(Model):
model_arch = gguf.MODEL_ARCH.FALCON
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams.get("num_hidden_layers") block_count = self.hparams.get("num_hidden_layers")
if block_count is None: if block_count is None:
@ -916,7 +857,10 @@ class FalconModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("GPTBigCodeForCausalLM")
class StarCoderModel(Model): class StarCoderModel(Model):
model_arch = gguf.MODEL_ARCH.STARCODER
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["n_layer"] block_count = self.hparams["n_layer"]
@ -931,7 +875,10 @@ class StarCoderModel(Model):
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
@Model.register("GPTRefactForCausalLM")
class RefactModel(Model): class RefactModel(Model):
model_arch = gguf.MODEL_ARCH.REFACT
def set_gguf_parameters(self): def set_gguf_parameters(self):
hidden_dim = self.hparams["n_embd"] hidden_dim = self.hparams["n_embd"]
inner_dim = 4 * hidden_dim inner_dim = 4 * hidden_dim
@ -1015,7 +962,10 @@ class RefactModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("PersimmonForCausalLM")
class PersimmonModel(Model): class PersimmonModel(Model):
model_arch = gguf.MODEL_ARCH.PERSIMMON
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
head_count = self.hparams["num_attention_heads"] head_count = self.hparams["num_attention_heads"]
@ -1063,7 +1013,10 @@ class PersimmonModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model): class StableLMModel(Model):
model_arch = gguf.MODEL_ARCH.STABLELM
def set_vocab(self): def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file(): if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2() self._set_vocab_gpt2()
@ -1087,12 +1040,18 @@ class StableLMModel(Model):
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
@Model.register("MixtralForCausalLM")
class MixtralModel(Model): class MixtralModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model): class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_name("MiniCPM")
@ -1169,7 +1128,10 @@ class MiniCPMModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("QWenLMHeadModel")
class QwenModel(Model): class QwenModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN
@staticmethod @staticmethod
def token_bytes_to_string(b): def token_bytes_to_string(b):
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
@ -1249,7 +1211,15 @@ class QwenModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("Qwen2ForCausalLM")
class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2
@Model.register("GPT2LMHeadModel")
class GPT2Model(Model): class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_block_count(self.hparams["n_layer"])
@ -1311,7 +1281,10 @@ class GPT2Model(Model):
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
@Model.register("PhiForCausalLM")
class Phi2Model(Model): class Phi2Model(Model):
model_arch = gguf.MODEL_ARCH.PHI2
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
@ -1333,7 +1306,10 @@ class Phi2Model(Model):
self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_add_bos_token(False)
@Model.register("PlamoForCausalLM")
class PlamoModel(Model): class PlamoModel(Model):
model_arch = gguf.MODEL_ARCH.PLAMO
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@ -1412,7 +1388,10 @@ class PlamoModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("CodeShellForCausalLM")
class CodeShellModel(Model): class CodeShellModel(Model):
model_arch = gguf.MODEL_ARCH.CODESHELL
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["n_layer"] block_count = self.hparams["n_layer"]
@ -1477,7 +1456,10 @@ class CodeShellModel(Model):
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
@Model.register("InternLM2ForCausalLM")
class InternLM2Model(Model): class InternLM2Model(Model):
model_arch = gguf.MODEL_ARCH.INTERNLM2
def set_vocab(self): def set_vocab(self):
# (TODO): Is there a better way? # (TODO): Is there a better way?
# Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
@ -1649,7 +1631,10 @@ in chat mode so that the conversation can end normally.")
self.post_write_tensors(tensor_map, name, data_torch) self.post_write_tensors(tensor_map, name, data_torch)
@Model.register("BertModel")
class BertModel(Model): class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.vocab_size = None self.vocab_size = None
@ -1679,7 +1664,7 @@ class BertModel(Model):
else: else:
raise NotImplementedError("Only MEAN and CLS pooling types supported") raise NotImplementedError("Only MEAN and CLS pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type.value) self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self): def set_vocab(self):
path = self.dir_model path = self.dir_model
@ -1755,7 +1740,10 @@ class BertModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("NomicBertModel")
class NomicBertModel(BertModel): class NomicBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.NOMIC_BERT
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -1792,7 +1780,10 @@ class NomicBertModel(BertModel):
yield name, data yield name, data
@Model.register("GemmaForCausalLM")
class GemmaModel(Model): class GemmaModel(Model):
model_arch = gguf.MODEL_ARCH.GEMMA
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@ -1848,6 +1839,11 @@ class GemmaModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######

View file

@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
vocab_factory = convert.VocabFactory(vocab_path) vocab_factory = convert.VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir) vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return params, vocab, special_vocab return params, vocab, special_vocab
@ -398,8 +398,8 @@ def handle_args():
help ='Load HuggingFace/.pth vocab and metadata from the specified directory') help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
return parser.parse_args() return parser.parse_args()

View file

@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory: class VocabFactory:
_FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
def __init__(self, path: Path): def __init__(self, path: Path):
self.path = path self.path = path
self.files: dict[str, Path | None] = { self.file_paths = self._detect_files()
"tokenizer.model": None, print(f"Found vocab files: {self.file_paths}")
"vocab.json": None,
"tokenizer.json": None,
}
self._detect_files()
def _detect_files(self): def _detect_files(self) -> dict[str, Path | None]:
for file in self.files.keys(): def locate(file: str) -> Path | None:
file_path = self.path / file if (path := self.path / file).exists():
parent_file_path = self.path.parent / file return path
if file_path.exists(): if (path := self.path.parent / file).exists():
self.files[file] = file_path return path
elif parent_file_path.exists(): return None
self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: str | None) -> Path: return {vt: locate(f) for vt, f in self._FILES.items()}
if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys(): def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
if (file := self.files[file_key]) is not None: for vtype in vocab_types:
return file try:
raise FileNotFoundError(f"{vocabtype} vocab not found.") path = self.file_paths[vtype]
if vocabtype == "hfft": except KeyError:
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file raise ValueError(f"Unsupported vocabulary type {vtype}") from None
return self.path if path is not None:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") return vtype, path
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe" load_merges = vocabtype == "bpe"
@ -1322,30 +1319,30 @@ class VocabFactory:
n_vocab=n_vocab, n_vocab=n_vocab,
) )
def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype) vocab_type, path = self._select_file(vocab_types)
print(f"Loading vocab file '{path}', type '{vocabtype}'") print(f"Loading vocab file {path!r}, type {vocab_type!r}")
added_tokens_path = path.parent / "added_tokens.json" added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab vocab: Vocab
if vocabtype == "bpe": if vocab_type == "bpe":
vocab = BpeVocab( vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
) )
elif vocabtype == "spm": elif vocab_type == "spm":
vocab = SentencePieceVocab( vocab = SentencePieceVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
) )
elif vocabtype == "hfft": elif vocab_type == "hfft":
vocab = HfVocab( vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None path.parent, added_tokens_path if added_tokens_path.exists() else None
) )
else: else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(vocab_type)
# FIXME: Respect --vocab-dir? # FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab( special_vocab = self._create_special_vocab(
vocab, vocab,
vocabtype, vocab_type,
model_parent_path, model_parent_path,
) )
return vocab, special_vocab return vocab, special_vocab
@ -1379,15 +1376,14 @@ def main(args_in: list[str] | None = None) -> None:
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems. # We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0") output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"] parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@ -1448,7 +1444,7 @@ def main(args_in: list[str] | None = None) -> None:
model_parent_path = model_plus.paths[0].parent model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_factory = VocabFactory(vocab_path) vocab_factory = VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
if args.vocab_only: if args.vocab_only:
if not args.outfile: if not args.outfile:

View file

@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
if (params.interactive) { if (params.interactive) {
const char *control_message; const char *control_message;
if (params.multiline_input) { if (params.multiline_input) {
control_message = " - To return control to LLaMa, end your input with '\\'.\n" control_message = " - To return control to LLaMA, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n"; " - To return control without starting a new line, end your input with '/'.\n";
} else { } else {
control_message = " - Press Return to return control to LLaMa.\n" control_message = " - Press Return to return control to LLaMA.\n"
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }

View file

@ -1811,8 +1811,8 @@ struct llama_server_context
} }
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
// if input prompt is too big, truncate it // if input prompt is too big, truncate it, if group attention self-extend is disabled
if (slot.n_prompt_tokens >= slot.n_ctx) if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
{ {
const int n_left = slot.n_ctx - slot.params.n_keep; const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2; const int n_block_size = n_left / 2;
@ -1887,9 +1887,11 @@ struct llama_server_context
} }
LOG_INFO("slot progression", { LOG_INFO("slot progression", {
{ "slot_id", slot.id }, { "slot_id", slot.id },
{ "task_id", slot.task_id }, { "task_id", slot.task_id },
{ "n_past", slot.n_past }, { "n_past", slot.n_past },
{ "n_past_se", slot.n_past_se },
{ "ga_i", slot.ga_i },
{ "n_prompt_tokens_processed", slot.n_prompt_tokens_processed } { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
}); });
} }
@ -2113,6 +2115,17 @@ struct llama_server_context
return true; return true;
} }
json model_meta() {
return json{
{"vocab_type", llama_vocab_type(model)},
{"n_vocab", llama_n_vocab(model)},
{"n_ctx_train", llama_n_ctx_train(model)},
{"n_embd", llama_n_embd(model)},
{"n_params", llama_model_n_params(model)},
{"size", llama_model_size(model)},
};
}
}; };
static void server_print_usage(const char *argv0, const gpt_params &params, static void server_print_usage(const char *argv0, const gpt_params &params,
@ -3075,9 +3088,10 @@ int main(int argc, char **argv)
for (const auto& metric_def : metrics_def) { for (const auto& metric_def : metrics_def) {
std::string name = metric_def["name"]; std::string name = metric_def["name"];
std::string help = metric_def["help"]; std::string help = metric_def["help"];
prometheus << "# HELP llamacpp:" << name << " " << help << "\n" auto value = json_value(metric_def, "value", 0);
<< "# TYPE llamacpp:" << name << " " << type << "\n" prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
<< "llamacpp:" << name << " " << metric_def["value"] << "\n"; << "# TYPE llamacpp:" << name << " " << type << "\n"
<< "llamacpp:" << name << " " << value << "\n";
} }
} }
@ -3165,6 +3179,7 @@ int main(int argc, char **argv)
state.store(SERVER_STATE_READY); state.store(SERVER_STATE_READY);
LOG_INFO("model loaded", {}); LOG_INFO("model loaded", {});
} }
const auto model_meta = llama.model_meta();
if (sparams.chat_template.empty()) { // custom chat template is not supplied if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us // check if the template comes with the model is supported by us
@ -3329,7 +3344,7 @@ int main(int argc, char **argv)
} }
}); });
svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res) svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
{ {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
std::time_t t = std::time(0); std::time_t t = std::time(0);
@ -3338,10 +3353,11 @@ int main(int argc, char **argv)
{"object", "list"}, {"object", "list"},
{"data", { {"data", {
{ {
{"id", params.model_alias}, {"id", params.model_alias},
{"object", "model"}, {"object", "model"},
{"created", t}, {"created", t},
{"owned_by", "llamacpp"} {"owned_by", "llamacpp"},
{"meta", model_meta}
}, },
}} }}
}; };

View file

@ -1,22 +1,30 @@
# Server tests # Server tests
Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/): Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
* [issues.feature](./features/issues.feature) Pending issues scenario and [behave](https://behave.readthedocs.io/en/latest/):
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key * [issues.feature](./features/issues.feature) Pending issues scenario
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc... * [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
Tests target GitHub workflows job runners with 4 vCPU. Tests target GitHub workflows job runners with 4 vCPU.
Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client. Requests are
using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
based http client.
Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`. Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
To mitigate it, you can increase values in `n_predict`, `kv_size`.
### Install dependencies ### Install dependencies
`pip install -r requirements.txt` `pip install -r requirements.txt`
### Run tests ### Run tests
1. Build the server 1. Build the server
```shell ```shell
cd ../../.. cd ../../..
mkdir build mkdir build
@ -24,24 +32,36 @@ cd build
cmake ../ cmake ../
cmake --build . --target server cmake --build . --target server
``` ```
2. download required models:
1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf` 2. Start the test: `./tests.sh`
3. Start the test: `./tests.sh`
It's possible to override some scenario steps values with environment variables: It's possible to override some scenario steps values with environment variables:
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` | variable | description |
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` |--------------------------|------------------------------------------------------------------------------------------------|
- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
### Run @bug, @wip or @wrong_usage annotated scenario ### Run @bug, @wip or @wrong_usage annotated scenario
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope. Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
- `@bug` annotation aims to link a scenario with a GitHub issue. - `@bug` annotation aims to link a scenario with a GitHub issue.
- `@wrong_usage` are meant to show user issue that are actually an expected behavior - `@wrong_usage` are meant to show user issue that are actually an expected behavior
- `@wip` to focus on a scenario working in progress - `@wip` to focus on a scenario working in progress
- `@slow` heavy test, disabled by default
To run a scenario annotated with `@bug`, start: To run a scenario annotated with `@bug`, start:
`DEBUG=ON ./tests.sh --no-skipped --tags bug`
```shell
DEBUG=ON ./tests.sh --no-skipped --tags bug
```
After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
```shell
./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
```

View file

@ -7,7 +7,10 @@ from signal import SIGKILL
def before_scenario(context, scenario): def before_scenario(context, scenario):
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
if context.debug:
print("DEBUG=ON\n")
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
port = 8080 port = 8080
if 'PORT' in os.environ: if 'PORT' in os.environ:
port = int(os.environ['PORT']) port = int(os.environ['PORT'])

View file

@ -1,4 +1,5 @@
# List of ongoing issues # List of ongoing issues
# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
@bug @bug
Feature: Issues Feature: Issues
# No confirmed issue at the moment # No confirmed issue at the moment

View file

@ -1,11 +1,12 @@
@llama.cpp @llama.cpp
@parallel
Feature: Parallel Feature: Parallel
Background: Server startup Background: Server startup
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model file stories260K.gguf And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model alias tinyllama-2
And 42 as server seed And 42 as server seed
And 512 as batch size
And 64 KV cache size And 64 KV cache size
And 2 slots And 2 slots
And embeddings extraction And embeddings extraction

View file

@ -0,0 +1,55 @@
# run with: ./tests.sh --no-skipped --tags passkey
@passkey
@slow
Feature: Passkey / Self-extend with context shift
Background: Server startup
Given a server listening on localhost:8080
# Generates a long text of junk and inserts a secret passkey number inside it.
# Then we query the LLM for the secret passkey.
# see #3856 and #4810
Scenario Outline: Passkey
Given a model file <hf_file> from HF repo <hf_repo>
And <n_batch> as batch size
And <n_junk> as number of junk
And <n_predicted> server max tokens to predict
And 42 as seed
And <n_ctx> KV cache size
And 1 slots
And <n_ga> group attention factor to extend context size through self-extend
And <n_ga_w> group attention width to extend context size through self-extend
# Can be override with N_GPU_LAYERS
And <ngl> GPU offloaded layers
Then the server is starting
Then the server is healthy
Given available models
Then model 0 is trained on <n_ctx_train> tokens context
Given a prefix prompt:
"""
here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
"""
And a passkey prompt template:
"""
The pass key is <passkey> Remember it. <passkey> is the pass key.
"""
And a junk suffix prompt:
"""
The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
"""
And a suffix prompt:
"""
What is the pass key? The pass key is
"""
Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
Examples:
| hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 4 | 512 | 250 | 50 | 42 | 1 | 42 |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 2 | 512 | 250 | 50 | 42 | 1 | \b((?!42)\w)+\b |
#| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
#| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
# 987 |

View file

@ -1,9 +1,10 @@
@llama.cpp @llama.cpp
@security
Feature: Security Feature: Security
Background: Server startup with an api key defined Background: Server startup with an api key defined
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model file stories260K.gguf And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a server api key llama.cpp And a server api key llama.cpp
Then the server is starting Then the server is starting
Then the server is healthy Then the server is healthy

View file

@ -1,15 +1,17 @@
@llama.cpp @llama.cpp
@server
Feature: llama.cpp server Feature: llama.cpp server
Background: Server startup Background: Server startup
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model file stories260K.gguf And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model alias tinyllama-2 And a model alias tinyllama-2
And 42 as server seed And 42 as server seed
# KV Cache corresponds to the total amount of tokens # KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130 # that can be stored across all independent sequences: #4130
# see --ctx-size and #5568 # see --ctx-size and #5568
And 32 KV cache size And 32 KV cache size
And 512 as batch size
And 1 slots And 1 slots
And embeddings extraction And embeddings extraction
And 32 server max tokens to predict And 32 server max tokens to predict
@ -29,9 +31,9 @@ Feature: llama.cpp server
And prometheus metrics are exposed And prometheus metrics are exposed
Examples: Prompts Examples: Prompts
| prompt | n_predict | re_content | n_predicted | | prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 | | I believe the meaning of life is | 8 | (read\|going)+ | 8 |
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 | | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
Scenario Outline: OAI Compatibility Scenario Outline: OAI Compatibility
Given a model <model> Given a model <model>
@ -43,9 +45,9 @@ Feature: llama.cpp server
Then <n_predicted> tokens are predicted matching <re_content> Then <n_predicted> tokens are predicted matching <re_content>
Examples: Prompts Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming | | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled | | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled | | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
Scenario: Embedding Scenario: Embedding
When embeddings are computed for: When embeddings are computed for:
@ -75,10 +77,15 @@ Feature: llama.cpp server
When an OAI compatible embeddings computation request for multiple inputs When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated Then embeddings are generated
Scenario: Tokenize / Detokenize Scenario: Tokenize / Detokenize
When tokenizing: When tokenizing:
""" """
What is the capital of France ? What is the capital of France ?
""" """
Then tokens can be detokenize Then tokens can be detokenize
Scenario: Models available
Given available models
Then 1 models are supported
Then model 0 is identified by tinyllama-2
Then model 0 is trained on 128 tokens context

View file

@ -13,6 +13,7 @@ import aiohttp
import openai import openai
from behave import step from behave import step
from behave.api.async_step import async_run_until_complete from behave.api.async_step import async_run_until_complete
from huggingface_hub import hf_hub_download
from prometheus_client import parser from prometheus_client import parser
@ -26,17 +27,23 @@ def step_server_config(context, server_fqdn, server_port):
context.base_url = f'http://{context.server_fqdn}:{context.server_port}' context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
context.model_alias = None context.model_alias = None
context.n_batch = None
context.n_ctx = None context.n_ctx = None
context.n_ga = None
context.n_ga_w = None
context.n_gpu_layer = None
context.n_predict = None context.n_predict = None
context.n_server_predict = None context.n_server_predict = None
context.n_slots = None context.n_slots = None
context.prompt_prefix = None
context.prompt_suffix = None
context.server_api_key = None context.server_api_key = None
context.server_continuous_batching = False context.server_continuous_batching = False
context.server_embeddings = False context.server_embeddings = False
context.server_metrics = False context.server_metrics = False
context.server_process = None context.server_process = None
context.seed = None
context.server_seed = None context.server_seed = None
context.user_api_key = None context.user_api_key = None
@ -45,9 +52,11 @@ def step_server_config(context, server_fqdn, server_port):
context.prompts = [] context.prompts = []
@step(u'a model file {model_file}') @step(u'a model file {hf_file} from HF repo {hf_repo}')
def step_model_file(context, model_file): def step_download_hf_model(context, hf_file, hf_repo):
context.model_file = model_file context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
if context.debug:
print(f"model file: {context.model_file}\n")
@step(u'a model alias {model_alias}') @step(u'a model alias {model_alias}')
@ -55,24 +64,34 @@ def step_model_alias(context, model_alias):
context.model_alias = model_alias context.model_alias = model_alias
@step(u'{seed} as server seed') @step(u'{seed:d} as server seed')
def step_seed(context, seed): def step_seed(context, seed):
context.server_seed = int(seed) context.server_seed = seed
@step(u'{n_ctx} KV cache size') @step(u'{ngl:d} GPU offloaded layers')
def step_n_gpu_layer(context, ngl):
if 'N_GPU_LAYERS' in os.environ:
new_ngl = int(os.environ['N_GPU_LAYERS'])
if context.debug:
print(f"-ngl upgraded from {ngl} to {new_ngl}")
ngl = new_ngl
context.n_gpu_layer = ngl
@step(u'{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx): def step_n_ctx(context, n_ctx):
context.n_ctx = int(n_ctx) context.n_ctx = n_ctx
@step(u'{n_slots} slots') @step(u'{n_slots:d} slots')
def step_n_slots(context, n_slots): def step_n_slots(context, n_slots):
context.n_slots = int(n_slots) context.n_slots = n_slots
@step(u'{n_predict} server max tokens to predict') @step(u'{n_predict:d} server max tokens to predict')
def step_server_n_predict(context, n_predict): def step_server_n_predict(context, n_predict):
context.n_server_predict = int(n_predict) context.n_server_predict = n_predict
@step(u'continuous batching') @step(u'continuous batching')
@ -116,11 +135,13 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
case 'ready' | 'idle': case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok', await wait_for_health_status(context, context.base_url, 200, 'ok',
timeout=10,
params={'fail_on_no_slot': 0, 'include_slots': 0}, params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=context.n_slots, slots_idle=context.n_slots,
slots_processing=0, slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0} expected_slots=[{'id': slot_id, 'state': 0}
for slot_id in range(context.n_slots)]) for slot_id in
range(context.n_slots if context.n_slots else 1)])
case 'busy': case 'busy':
await wait_for_health_status(context, context.base_url, 503, await wait_for_health_status(context, context.base_url, 503,
'no slot available', 'no slot available',
@ -128,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
slots_idle=0, slots_idle=0,
slots_processing=context.n_slots, slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1} expected_slots=[{'id': slot_id, 'state': 1}
for slot_id in range(context.n_slots)]) for slot_id in
range(context.n_slots if context.n_slots else 1)])
case _: case _:
assert False, "unknown status" assert False, "unknown status"
@ -157,24 +179,24 @@ async def step_request_completion(context, api_error):
context.base_url, context.base_url,
debug=context.debug, debug=context.debug,
n_predict=context.n_predict, n_predict=context.n_predict,
server_seed=context.server_seed, seed=await completions_seed(context),
expect_api_error=expect_api_error, expect_api_error=expect_api_error,
user_api_key=context.user_api_key) user_api_key=context.user_api_key)
context.tasks_result.append(completion) context.tasks_result.append(completion)
if context.debug: if context.debug:
print(f"Completion response: {completion}") print(f"Completion response: {completion}\n")
if expect_api_error: if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}" assert completion == 401, f"completion must be an 401 status code: {completion}"
@step(u'{predicted_n} tokens are predicted matching {re_content}') @step(u'{predicted_n:d} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content): def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content) assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
@step(u'{predicted_n} tokens are predicted') @step(u'{predicted_n:d} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n): def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n)) assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
@step(u'a user prompt {user_prompt}') @step(u'a user prompt {user_prompt}')
@ -192,9 +214,9 @@ def step_model(context, model):
context.model = model context.model = model
@step(u'{max_tokens} max tokens to predict') @step(u'{max_tokens:d} max tokens to predict')
def step_max_tokens(context, max_tokens): def step_max_tokens(context, max_tokens):
context.n_predict = int(max_tokens) context.n_predict = max_tokens
@step(u'streaming is {enable_streaming}') @step(u'streaming is {enable_streaming}')
@ -222,11 +244,70 @@ def step_server_api_key(context, server_api_key):
context.server_api_key = server_api_key context.server_api_key = server_api_key
@step(u'{n_junk:d} as number of junk')
def step_n_junk(context, n_junk):
context.n_junk = n_junk
@step(u'{n_batch:d} as batch size')
def step_n_batch(context, n_batch):
context.n_batch = n_batch
@step(u'{seed:d} as seed')
def step_seed(context, seed):
context.seed = seed
@step(u'a prefix prompt')
def step_prompt_prefix(context):
context.prompt_prefix = context.text
@step(u'a junk suffix prompt')
def step_prompt_junk_suffix(context):
context.prompt_junk_suffix = context.text
@step(u'a suffix prompt')
def step_prompt_suffix(context):
context.prompt_suffix = context.text
@step(u'{n_ga:d} group attention factor'
u' to extend context size through self-extend')
def step_impl(context, n_ga):
context.n_ga = n_ga
@step(u'{n_ga_w:d} group attention width to extend context size through self-extend')
def step_impl(context, n_ga_w):
context.n_ga_w = n_ga_w
@step(u'a passkey prompt template')
def step_prompt_passkey(context):
context.prompt_passkey = context.text
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
def step_prompt_passkey(context, passkey, i_pos):
prompt = ""
for i in range(context.n_junk):
if i % context.n_junk == i_pos:
prompt += context.prompt_passkey # the passkey is already substituted
prompt += context.prompt_junk_suffix
if context.debug:
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
@step(u'an OAI compatible chat completions request with {api_error} api error') @step(u'an OAI compatible chat completions request with {api_error} api error')
@async_run_until_complete @async_run_until_complete
async def step_oai_chat_completions(context, api_error): async def step_oai_chat_completions(context, api_error):
if context.debug: if context.debug:
print(f"Submitting OAI compatible completions request...") print(f"Submitting OAI compatible completions request...\n")
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
completion = await oai_chat_completions(context.prompts.pop(), completion = await oai_chat_completions(context.prompts.pop(),
context.system_prompt, context.system_prompt,
@ -241,8 +322,7 @@ async def step_oai_chat_completions(context, api_error):
enable_streaming=context.enable_streaming enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed seed=await completions_seed(context),
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None, if hasattr(context, 'user_api_key') else None,
@ -276,8 +356,10 @@ async def step_concurrent_completion_requests(context):
# prompt is inserted automatically # prompt is inserted automatically
context.base_url, context.base_url,
debug=context.debug, debug=context.debug,
prompt_prefix=context.prompt_prefix,
prompt_suffix=context.prompt_suffix,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None, n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
server_seed=context.server_seed if hasattr(context, 'server_seed') else None, seed=await completions_seed(context),
user_api_key=context.user_api_key if hasattr(context, user_api_key=context.user_api_key if hasattr(context,
'user_api_key') else None) 'user_api_key') else None)
@ -297,8 +379,7 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'n_predict') else None, if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed seed=await completions_seed(context),
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None) if hasattr(context, 'user_api_key') else None)
@ -318,7 +399,9 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'n_predict') else None, if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed seed=context.seed
if hasattr(context, 'seed') else
context.server_seed
if hasattr(context, 'server_seed') else None, if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None) if hasattr(context, 'user_api_key') else None)
@ -330,11 +413,10 @@ async def step_all_prompts_are_predicted(context):
await all_prompts_are_predicted(context) await all_prompts_are_predicted(context)
@step(u'all prompts are predicted with {n_predict} tokens') @step(u'all prompts are predicted with {n_expected_predicted:d} tokens')
@async_run_until_complete @async_run_until_complete
async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict): async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
expected_predicted_n = int(n_predict) await all_prompts_are_predicted(context, n_expected_predicted)
await all_prompts_are_predicted(context, expected_predicted_n)
async def all_prompts_are_predicted(context, expected_predicted_n=None): async def all_prompts_are_predicted(context, expected_predicted_n=None):
@ -464,6 +546,8 @@ async def step_prometheus_metrics_exported(context):
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
metrics_raw = await metrics_response.text() metrics_raw = await metrics_response.text()
metric_exported = False metric_exported = False
if context.debug:
print(f"/metrics answer:\n{metrics_raw}\n")
for metric in parser.text_string_to_metric_families(metrics_raw): for metric in parser.text_string_to_metric_families(metrics_raw):
match metric.name: match metric.name:
case "llamacpp:kv_cache_usage_ratio": case "llamacpp:kv_cache_usage_ratio":
@ -472,6 +556,37 @@ async def step_prometheus_metrics_exported(context):
assert metric_exported, "No metrics exported" assert metric_exported, "No metrics exported"
@step(u'available models')
def step_available_models(context):
# openai client always expects an api_key
openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
openai.api_base = f'{context.base_url}/v1'
context.models = openai.Model.list().data
@step(u'{n_model:d} models are supported')
def step_supported_models(context, n_model):
if context.debug:
print("server models available:", context.models)
assert len(context.models) == n_model
@step(u'model {i_model:d} is {param} {preposition} {param_value}')
def step_supported_models(context, i_model, param, preposition, param_value):
assert i_model < len(context.models)
model = context.models[i_model]
param_value = param_value.split(' ', 1)[0]
match param:
case 'identified':
value = model.id
case 'trained':
value = str(model.meta.n_ctx_train)
case _:
assert False, "param {param} not supported"
assert param_value == value, f"model param {param} {value} != {param_value}"
async def concurrent_requests(context, f_completion, *args, **kwargs): async def concurrent_requests(context, f_completion, *args, **kwargs):
n_prompts = len(context.prompts) n_prompts = len(context.prompts)
if context.debug: if context.debug:
@ -486,8 +601,10 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
async def request_completion(prompt, async def request_completion(prompt,
base_url, base_url,
debug=False, debug=False,
prompt_prefix=None,
prompt_suffix=None,
n_predict=None, n_predict=None,
server_seed=None, seed=None,
expect_api_error=None, expect_api_error=None,
user_api_key=None): user_api_key=None):
if debug: if debug:
@ -504,11 +621,14 @@ async def request_completion(prompt,
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/completion', async with session.post(f'{base_url}/completion',
json={ json={
"input_prefix": prompt_prefix,
"prompt": prompt, "prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else -1, "input_suffix": prompt_suffix,
"seed": server_seed if server_seed is not None else 42 "n_predict": n_predict if n_predict is not None else -1,
"seed": seed if seed is not None else 42
}, },
headers=headers) as response: headers=headers,
timeout=3600) as response:
if expect_api_error is None or not expect_api_error: if expect_api_error is None or not expect_api_error:
assert response.status == 200 assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin assert response.headers['Access-Control-Allow-Origin'] == origin
@ -526,14 +646,14 @@ async def oai_chat_completions(user_prompt,
model=None, model=None,
n_predict=None, n_predict=None,
enable_streaming=None, enable_streaming=None,
server_seed=None, seed=None,
user_api_key=None, user_api_key=None,
expect_api_error=None): expect_api_error=None):
if debug: if debug:
print(f"Sending OAI Chat completions request: {user_prompt}") print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key # openai client always expects an api key
user_api_key = user_api_key if user_api_key is not None else 'nope' user_api_key = user_api_key if user_api_key is not None else 'nope'
seed = server_seed if server_seed is not None else 42 seed = seed if seed is not None else 42
enable_streaming = enable_streaming if enable_streaming is not None else False enable_streaming = enable_streaming if enable_streaming is not None else False
payload = { payload = {
"messages": [ "messages": [
@ -692,20 +812,32 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
content = completion_response['content'] content = completion_response['content']
n_predicted = completion_response['timings']['predicted_n'] n_predicted = completion_response['timings']['predicted_n']
assert len(content) > 0, "no token predicted" assert len(content) > 0, "no token predicted"
if expected_predicted_n is not None: if re_content is not None:
p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
matches = p.finditer(content)
last_match = 0
highlighted = ''
for match in matches:
start, end = match.span()
highlighted += content[last_match: start]
highlighted += '\x1b[33m'
highlighted += content[start: end]
highlighted += '\x1b[0m'
last_match = end
highlighted += content[last_match:]
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"Checking completion response: {highlighted}\n")
assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
if expected_predicted_n and expected_predicted_n > 0:
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}') f' {n_predicted} <> {expected_predicted_n}')
if re_content is not None:
re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
f'invalid tokens predicted:'
f' ```\n{content}\n``` do not match /{re_content}/')
async def gather_tasks_results(context): async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks) n_tasks = len(context.concurrent_tasks)
if context.debug: if context.debug:
print(f"Waiting for all {n_tasks} tasks results...") print(f"Waiting for all {n_tasks} tasks results...\n")
for task_no in range(n_tasks): for task_no in range(n_tasks):
context.tasks_result.append(await context.concurrent_tasks.pop()) context.tasks_result.append(await context.concurrent_tasks.pop())
n_completions = len(context.tasks_result) n_completions = len(context.tasks_result)
@ -716,15 +848,13 @@ async def wait_for_health_status(context,
base_url, base_url,
expected_http_status_code, expected_http_status_code,
expected_health_status, expected_health_status,
timeout=3,
params=None, params=None,
slots_idle=None, slots_idle=None,
slots_processing=None, slots_processing=None,
expected_slots=None): expected_slots=None):
if context.debug: if context.debug:
print(f"Starting checking for health for expected_health_status={expected_health_status}") print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
timeout = 3 # seconds
if expected_health_status == 'ok':
timeout = 10 # CI slow inference
interval = 0.5 interval = 0.5
counter = 0 counter = 0
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
@ -734,7 +864,7 @@ async def wait_for_health_status(context,
health = await health_response.json() health = await health_response.json()
if context.debug: if context.debug:
print(f"HEALTH - response for expected health status='{expected_health_status}' on " print(f"HEALTH - response for expected health status='{expected_health_status}' on "
f"'{base_url}/health'?{params} is {health}") f"'{base_url}/health'?{params} is {health}\n")
if (status_code == expected_http_status_code if (status_code == expected_http_status_code
and health['status'] == expected_health_status and health['status'] == expected_health_status
and (slots_idle is None or health['slots_idle'] == slots_idle) and (slots_idle is None or health['slots_idle'] == slots_idle)
@ -757,7 +887,7 @@ async def wait_for_health_status(context,
if expected_http_status_code == 503: if expected_http_status_code == 503:
if len(context.tasks_result) == 0: if len(context.tasks_result) == 0:
print("\x1b[5;37;43mWARNING: forcing concurrent tasks," print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
" busy health check missed, probably too fast inference\x1b[0m") " busy health check missed, probably too fast inference\x1b[0m\n")
n_completions = await gather_tasks_results(context) n_completions = await gather_tasks_results(context)
if n_completions > 0: if n_completions > 0:
return return
@ -791,6 +921,11 @@ def assert_slots_status(slots, expected_slots):
f" = {expected[key]} != {slot[key]}") f" = {expected[key]} != {slot[key]}")
async def completions_seed(context):
return context.seed if hasattr(context, 'seed') and context.seed is not None \
else context.server_seed if hasattr(context, 'server_seed') else None
def start_server_background(context): def start_server_background(context):
context.server_path = '../../../build/bin/server' context.server_path = '../../../build/bin/server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ: if 'LLAMA_SERVER_BIN_PATH' in os.environ:
@ -800,27 +935,35 @@ def start_server_background(context):
'--port', context.server_port, '--port', context.server_port,
'--model', context.model_file '--model', context.model_file
] ]
if context.n_batch:
server_args.extend(['--batch-size', context.n_batch])
if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.server_continuous_batching: if context.server_continuous_batching:
server_args.append('--cont-batching') server_args.append('--cont-batching')
if context.server_embeddings: if context.server_embeddings:
server_args.append('--embedding') server_args.append('--embedding')
if context.server_metrics: if context.server_metrics:
server_args.append('--metrics') server_args.append('--metrics')
if context.model_alias is not None: if context.model_alias:
server_args.extend(['--alias', context.model_alias]) server_args.extend(['--alias', context.model_alias])
if context.n_ctx is not None: if context.n_ctx:
server_args.extend(['--ctx-size', context.n_ctx]) server_args.extend(['--ctx-size', context.n_ctx])
if context.n_slots is not None: if context.n_slots:
server_args.extend(['--parallel', context.n_slots]) server_args.extend(['--parallel', context.n_slots])
if context.n_server_predict is not None: if context.n_server_predict:
server_args.extend(['--n-predict', context.n_server_predict]) server_args.extend(['--n-predict', context.n_server_predict])
if context.server_api_key is not None: if context.server_api_key:
server_args.extend(['--api-key', context.server_api_key]) server_args.extend(['--api-key', context.server_api_key])
if context.n_ga:
server_args.extend(['--grp-attn-n', context.n_ga])
if context.n_ga_w:
server_args.extend(['--grp-attn-w', context.n_ga_w])
if context.debug: if context.debug:
server_args.append('--verbose') server_args.append('--verbose')
if 'SERVER_LOG_FORMAT_JSON' not in os.environ: if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"]) server_args.extend(['--log-format', "text"])
print(f"starting server with: {context.server_path}", *server_args) print(f"starting server with: {context.server_path} {server_args}\n")
context.server_process = subprocess.Popen( context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]], [str(arg) for arg in [context.server_path, *server_args]],
close_fds=True) close_fds=True)

View file

@ -1,4 +1,4 @@
# run with ./test.sh --tags wrong_usage # run with: ./tests.sh --no-skipped --tags wrong_usage
@wrong_usage @wrong_usage
Feature: Wrong usage of llama.cpp server Feature: Wrong usage of llama.cpp server
@ -7,7 +7,7 @@ Feature: Wrong usage of llama.cpp server
# or pass n_predict/max_tokens in the request. # or pass n_predict/max_tokens in the request.
Scenario: Infinite loop Scenario: Infinite loop
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model file stories260K.gguf And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
# Uncomment below to fix the issue # Uncomment below to fix the issue
#And 64 server max tokens to predict #And 64 server max tokens to predict
Then the server is starting Then the server is starting
@ -18,4 +18,5 @@ Feature: Wrong usage of llama.cpp server
# Uncomment below to fix the issue # Uncomment below to fix the issue
#And 128 max tokens to predict #And 128 max tokens to predict
Given concurrent completion requests Given concurrent completion requests
Then the server is idle
Then all prompts are predicted Then all prompts are predicted

View file

@ -1,4 +1,5 @@
aiohttp~=3.9.3 aiohttp~=3.9.3
behave~=1.2.6 behave~=1.2.6
huggingface_hub~=0.20.3
openai~=0.25.0 openai~=0.25.0
prometheus-client~=0.20.0 prometheus-client~=0.20.0

View file

@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]
then then
# Start @llama.cpp scenario # Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else else
behave "$@" behave "$@"
fi fi

View file

@ -219,8 +219,7 @@ static inline void server_log(
for (const auto& el : log.items()) for (const auto& el : log.items())
{ {
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
snprintf(buf, 1024, "\033[85;0H %s=%s", el.key().c_str(), value.c_str()); ss << " " << el.key() << "=" << value;
ss << buf;
} }
const std::string str = ss.str(); const std::string str = ss.str();

18
flake.lock generated
View file

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib" "nixpkgs-lib": "nixpkgs-lib"
}, },
"locked": { "locked": {
"lastModified": 1706830856, "lastModified": 1709336216,
"narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=", "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
"owner": "hercules-ci", "owner": "hercules-ci",
"repo": "flake-parts", "repo": "flake-parts",
"rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f", "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1708655239, "lastModified": 1709237383,
"narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=", "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a", "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -37,11 +37,11 @@
"nixpkgs-lib": { "nixpkgs-lib": {
"locked": { "locked": {
"dir": "lib", "dir": "lib",
"lastModified": 1706550542, "lastModified": 1709237383,
"narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=", "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "97b17f32362e475016f942bbdfda4a4a72a8a652", "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -2018,74 +2018,73 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
}; };
static const __device__ uint32_t iq3xs_grid[512] = { static const __device__ uint32_t iq3s_grid[512] = {
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
}; };
static const __device__ uint64_t iq1s_grid[512] = { static const __device__ uint64_t iq1s_grid[512] = {
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
@ -2392,9 +2391,9 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
const int ib = tid%8; // 0...7 const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il; dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * qs = x[i].qs + 8*ib; const uint8_t * qs = x[i].qs + 8*ib;
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
const uint8_t signs = x[i].signs[4*ib + il]; const uint8_t signs = x[i].signs[4*ib + il];
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
@ -5211,8 +5210,8 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
const int8_t * q8 = bq8_1[ib32].qs; const int8_t * q8 = bq8_1[ib32].qs;
int sumi = 0; int sumi = 0;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
@ -5221,7 +5220,7 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
sumi = __dp4a(grid_h, *((int *)q8+1), sumi); sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
q8 += 8; q8 += 8;
} }
const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
return d * sumi; return d * sumi;
#else #else
assert(false); assert(false);

View file

@ -4087,71 +4087,71 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
}; };
constexpr constant static uint32_t iq3xs_grid[512] = { constexpr constant static uint32_t iq3s_grid[512] = {
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
}; };
#define NGRID_IQ1S 512 #define NGRID_IQ1S 512
@ -4742,7 +4742,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
{ {
int nval = 8; int nval = 8;
int pos = (32*sgitg + tiisg)*nval; int pos = (32*sgitg + tiisg)*nval;
for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i]; for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i];
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
} }
@ -4769,12 +4769,14 @@ void kernel_mul_mv_iq3_s_f32_impl(
for (int row = 0; row < N_DST; row++) { for (int row = 0; row < N_DST; row++) {
const float db = dh[0]; const float db = dh[0];
const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf)); const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf));
float2 sum = {0}; float2 sum = {0};
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values;
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values;
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]); sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]); sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
@ -4795,7 +4797,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f; dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
} }
} }
} }
@ -5685,15 +5687,15 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
device const uint8_t * qs = xb->qs + 8*ib32; device const uint8_t * qs = xb->qs + 8*ib32;
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
const uint8_t qh = xb->qh[ib32] >> 4*il; const uint8_t qh = xb->qh[ib32] >> 4*il;
const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f; const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256))); constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256))); constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
} }
grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256))); grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256))); grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);

View file

@ -3818,71 +3818,71 @@ static const uint32_t iq3xxs_grid[256] = {
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
}; };
static const uint32_t iq3xs_grid[512] = { static const uint32_t iq3s_grid[512] = {
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
}; };
#define NGRID_IQ2XXS 512 #define NGRID_IQ2XXS 512
@ -4162,11 +4162,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
const uint8_t * signs = x[i].signs; const uint8_t * signs = x[i].signs;
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f; const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f; const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
@ -4176,8 +4176,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
qs += 8; qs += 8;
signs += 4; signs += 4;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
@ -10089,18 +10089,34 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
typedef union {
uint16x8_t vec_index;
uint16_t index[8];
} vec_index_t;
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
}; };
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
const uint8x16_t mask2 = vld1q_u8(k_mask2);
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
const uint8x16_t mask2 = vld1q_u8(k_mask2);
const int16x8_t hshift = vld1q_s16(k_shift);
const uint16x8_t m256 = vdupq_n_u16(256);
const uint8x16_t m1 = vdupq_n_u8(1);
uint8x16x2_t vs; uint8x16x2_t vs;
ggml_int8x16x4_t q3s; ggml_int8x16x4_t q3s;
ggml_int8x16x4_t q8b; ggml_int8x16x4_t q8b;
vec_index_t idx;
#if QK_K == 256
uint32_t scales32[2];
const uint8_t * scales8 = (const uint8_t *)scales32;
#endif
float sumf = 0; float sumf = 0;
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
@ -10109,47 +10125,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
const uint8_t * restrict qh = x[i].qh; const uint8_t * restrict qh = x[i].qh;
const uint16_t * restrict signs = (const uint16_t *)x[i].signs; const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
#if QK_K == 256
memcpy(scales32, x[i].scales, 4);
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
#endif
int sumi1 = 0, sumi2 = 0; int sumi1 = 0, sumi2 = 0;
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
q8b = ggml_vld1q_s8_x4(q8); q8 += 64; q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]}; const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)], idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]}; const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]}; const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]}; idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
qs += 16; const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
vs.val[1] = vceqq_u8(vs.val[1], mask2); vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0])); q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
vs.val[1] = vceqq_u8(vs.val[1], mask2); vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
signs += 4; signs += 4;
q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0])); q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1])); q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
#if QK_K == 256
sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
#else
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf)); sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4)); sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
#endif
} }
sumf += d*(sumi1 + sumi2); sumf += d*(sumi1 + sumi2);
} }
*s = 0.25f * sumf; *s = sumf;
#elif defined(__AVX2__) #elif defined(__AVX2__)
@ -10164,6 +10196,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
const __m256i idx_mask = _mm256_set1_epi32(256);
typedef union {
__m256i vec[2];
uint32_t index[16];
} index_t;
index_t idx;
__m256 accumf = _mm256_setzero_ps(); __m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
@ -10176,24 +10218,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)], const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)], idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)], idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)], idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)], idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)], idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)], idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
qs += 8; // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)], //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)], //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)], const __m256i q2_1 = _mm256_set_epi32(
iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)], iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)], iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)], );
iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)], const __m256i q2_2 = _mm256_set_epi32(
iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]); iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
qs += 8; iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
);
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
@ -10221,7 +10264,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
} }
*s = 0.25f * hsum_float_8(accumf); *s = hsum_float_8(accumf);
#else #else
@ -10238,8 +10281,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
int32_t sumi = 0; int32_t sumi = 0;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
@ -10251,8 +10294,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
bsum += sumi * ls1; bsum += sumi * ls1;
sumi = 0; sumi = 0;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
for (int j = 0; j < 4; ++j) { for (int j = 0; j < 4; ++j) {
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
@ -10265,7 +10308,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
} }
sumf += d * bsum; sumf += d * bsum;
} }
*s = 0.25f * sumf; *s = sumf;
#endif #endif
} }
@ -11912,7 +11955,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
} }
float best = 0; float best = 0;
float scale = max/(2*kMaxQ-1); float scale = max/(2*kMaxQ-1);
for (int is = -15; is <= 15; ++is) { for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
for (int is = -9; is <= 9; ++is) {
float id = (2*kMaxQ-1+is*0.2f)/max; float id = (2*kMaxQ-1+is*0.2f)/max;
float this_scale = 1/id; float this_scale = 1/id;
for (int k = 0; k < bs4; ++k) { for (int k = 0; k < bs4; ++k) {
@ -11948,7 +11992,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
if (n_not_ongrid > 0 && scale > 0) { if (n_not_ongrid > 0 && scale > 0) {
float id = 1/scale; float id = 1/scale;
for (int k = 0; k < bs4; ++k) { for (int k = 0; k < bs4; ++k) {
if (is_on_grid[k]) continue; //if (is_on_grid[k]) continue;
uint16_t u = 0; uint16_t u = 0;
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
@ -12004,7 +12048,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
} }
float d = max_scale/31; float d = max_scale/31;
y[ibl].d = GGML_FP32_TO_FP16(d); y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
float id = 1/d; float id = 1/d;
for (int ib = 0; ib < QK_K/block_size; ib += 2) { for (int ib = 0; ib < QK_K/block_size; ib += 2) {
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1)); int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));

View file

@ -362,7 +362,7 @@ class GGUFWriter:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None: def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value) self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
def add_rope_dimension_count(self, count: int) -> None: def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

130
llama.cpp
View file

@ -216,7 +216,7 @@ enum llm_arch {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPT2, "gpt2" },
@ -241,6 +241,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
enum llm_kv { enum llm_kv {
@ -301,7 +302,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_RWKV,
}; };
static std::map<llm_kv, const char *> LLM_KV_NAMES = { static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@ -365,7 +366,7 @@ struct LLM_KV {
llm_arch arch; llm_arch arch;
std::string operator()(llm_kv kv) const { std::string operator()(llm_kv kv) const {
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]); return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
} }
}; };
@ -400,7 +401,7 @@ enum llm_tensor {
LLM_TENSOR_LAYER_OUT_NORM, LLM_TENSOR_LAYER_OUT_NORM,
}; };
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = { static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
{ {
LLM_ARCH_LLAMA, LLM_ARCH_LLAMA,
{ {
@ -833,38 +834,38 @@ struct LLM_TN {
llm_arch arch; llm_arch arch;
std::string operator()(llm_tensor tensor) const { std::string operator()(llm_tensor tensor) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__"; return "__missing__";
} }
return LLM_TENSOR_NAMES[arch].at(tensor); return LLM_TENSOR_NAMES.at(arch).at(tensor);
} }
std::string operator()(llm_tensor tensor, const std::string & suffix) const { std::string operator()(llm_tensor tensor, const std::string & suffix) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__"; return "__missing__";
} }
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
} }
std::string operator()(llm_tensor tensor, int bid) const { std::string operator()(llm_tensor tensor, int bid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__"; return "__missing__";
} }
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
} }
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__"; return "__missing__";
} }
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
} }
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__"; return "__missing__";
} }
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix; return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
} }
}; };
@ -872,7 +873,7 @@ struct LLM_TN {
// gguf helpers // gguf helpers
// //
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = { static const std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
@ -1986,6 +1987,9 @@ struct llama_context {
std::vector<uint8_t> buf_compute_meta; std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_t sched = nullptr; ggml_backend_sched_t sched = nullptr;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
// input tensors // input tensors
ggml_backend_buffer_t buf_input = nullptr; ggml_backend_buffer_t buf_input = nullptr;
ggml_context * ctx_input = nullptr; ggml_context * ctx_input = nullptr;
@ -8070,6 +8074,7 @@ static void llama_graph_compute(
if (lctx.backend_cpu != nullptr) { if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
} }
ggml_backend_sched_graph_compute(lctx.sched, gf); ggml_backend_sched_graph_compute(lctx.sched, gf);
@ -10835,7 +10840,7 @@ struct quantize_state_internal {
{} {}
}; };
static void llama_convert_tensor_internal( static void llama_tensor_dequantize_internal(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers, struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread const size_t nelements, const int nthread
) { ) {
@ -11176,6 +11181,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
return new_type; return new_type;
} }
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
std::mutex mutex;
int counter = 0;
size_t new_size = 0;
if (nthread < 2) {
// single-thread
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
}
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
}
};
for (int it = 0; it < nthread - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
return new_size;
}
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type quantized_type; ggml_type quantized_type;
llama_ftype ftype = params->ftype; llama_ftype ftype = params->ftype;
@ -11288,7 +11333,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<std::thread> workers; std::vector<std::thread> workers;
workers.reserve(nthread); workers.reserve(nthread);
std::mutex mutex;
int idx = 0; int idx = 0;
@ -11402,7 +11446,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else { } else {
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data(); f32_data = (float *) f32_conv_buf.data();
} }
@ -11423,41 +11467,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nchunk = (nelements + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) { new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
} else {
int counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
}
};
for (int it = 0; it < nthread_use - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
int64_t tot_count = 0; int64_t tot_count = 0;
@ -11850,6 +11860,8 @@ struct llama_context_params llama_context_default_params() {
/*.embedding =*/ false, /*.embedding =*/ false,
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.do_pooling =*/ true, /*.do_pooling =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
}; };
return result; return result;
@ -12032,8 +12044,11 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
ctx->rng = std::mt19937(params.seed); ctx->abort_callback = params.abort_callback;
ctx->logits_all = params.logits_all; ctx->abort_callback_data = params.abort_callback_data;
ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
const ggml_type type_k = params.type_k; const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v; const ggml_type type_v = params.type_v;
@ -12983,6 +12998,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
ctx->cparams.n_threads_batch = n_threads_batch; ctx->cparams.n_threads_batch = n_threads_batch;
} }
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}
struct llama_batch llama_batch_get_one( struct llama_batch llama_batch_get_one(
llama_token * tokens, llama_token * tokens,
int32_t n_tokens, int32_t n_tokens,

13
llama.h
View file

@ -255,10 +255,16 @@ extern "C" {
enum ggml_type type_v; // data type for V cache enum ggml_type type_v; // data type for V cache
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
// currently works only with CPU execution
ggml_abort_callback abort_callback;
void * abort_callback_data;
}; };
// model quantization parameters // model quantization parameters
@ -632,7 +638,10 @@ extern "C" {
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
// Token logits obtained from the last call to llama_eval() // Set abort callback
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
// Token logits obtained from the last call to llama_decode()
// The logits for the last token are stored in the last row // The logits for the last token are stored in the last row
// Logits for which llama_batch.logits[i] == 0 are undefined // Logits for which llama_batch.logits[i] == 0 are undefined
// Rows: n_tokens provided with llama_batch // Rows: n_tokens provided with llama_batch

213
scripts/pod-llama.sh Normal file
View file

@ -0,0 +1,213 @@
#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#
if [ -z "$1" ]; then
echo "Usage: $0 <data>"
echo " 0: no models"
echo " 1: tinyllama-1b"
echo " 2: codellama-7b"
echo " 3: codellama-13b"
echo " 4: codellama-34b"
echo " 5: codellama-7b-instruct"
echo " 6: codellama-13b-instruct"
echo " 7: codellama-34b-instruct"
exit 1
fi
set -x
# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install
if [ ! -d "/workspace" ]; then
ln -sfn $(pwd) /workspace
fi
# download data
cd /workspace
# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
LLAMA_CUBLAS=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
pip install -r requirements.txt
# cmake
cd /workspace/llama.cpp
mkdir build-cublas
cd build-cublas
cmake -DLLAMA_CUBLAS=1 ../
make -j
if [ "$1" -eq "0" ]; then
exit 0
fi
# more models
if [ "$1" -eq "1" ]; then
cd /workspace
git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
cd /workspace/llama.cpp
python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "2" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
rm -v ./CodeLlama-7b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "3" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
rm -v ./CodeLlama-13b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "4" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
rm -v ./CodeLlama-34b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "5" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "6" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "7" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "1" ]; then
# perf + perplexity
cd /workspace/llama.cpp/build-cublas
make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
../scripts/get-wikitext-2.sh
unzip wikitext-2-raw-v1.zip
make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
# batched
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
# speculative
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1