Merge branch 'master' into xsn/chat_template_prefix_postfix

commit ada54292c6

29 changed files with 1188 additions and 936 deletions
.github/workflows/bench.yml (vendored, 2 changes)

@@ -32,7 +32,7 @@ on:
     - cron: '04 2 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
   cancel-in-progress: true
 
 jobs:
.github/workflows/server.yml (vendored, 2 changes)

@@ -23,7 +23,7 @@ on:
     - cron: '2 4 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
CMakeLists.txt

@@ -43,17 +43,11 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
-# TODO: fix this for Android CI
-#       https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
-#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
-#    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-#else()
-#    set(LLAMA_LLAMAFILE_DEFAULT ON)
-#endif()
-
-# TODO: temporary disable until MoE is fixed
-#       https://github.com/ggerganov/llama.cpp/pull/6716
-set(LLAMA_LLAMAFILE_DEFAULT OFF)
+if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
+    set(LLAMA_LLAMAFILE_DEFAULT OFF)
+else()
+    set(LLAMA_LLAMAFILE_DEFAULT ON)
+endif()
 
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
Makefile (4 changes)

@@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS
     MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
-# TODO: temporary disable until MoE is fixed
-#       https://github.com/ggerganov/llama.cpp/pull/6716
-LLAMA_NO_LLAMAFILE := 1
-
 ifndef LLAMA_NO_LLAMAFILE
     MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
     OBJS        += sgemm.o
README-sycl.md

@@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 mkdir -p build && cd build
 
-# Option 1: Use FP16 for better performance in long-prompt inference
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
 #build all binary
 cmake --build . --config Release -j -v
 ```

@@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 mkdir -p build && cd build
 
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
 #build all binary
 cmake --build . --config Release -j -v
 

@@ -416,6 +416,10 @@ mkdir -p build
 cd build
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+
+# Option 2: Or FP16
 cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 
 make -j
common/common.cpp

@@ -242,7 +242,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
+        // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
         params.seed = std::stoul(argv[i]);
+        sparams.seed = std::stoul(argv[i]);
         return true;
     }
     if (arg == "-t" || arg == "--threads") {

@@ -2326,12 +2328,12 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
common/common.h

@@ -86,8 +86,8 @@ struct gpt_params {
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
 
     // // sampling parameters
     struct llama_sampling_params sparams;

@@ -237,11 +237,12 @@ std::vector<llama_token> llama_tokenize(
                         bool   add_special,
                         bool   parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
-                       llama_token   token);
+                       llama_token   token,
+                       bool          special = true);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space
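For orientation, a minimal sketch of how a caller can use the widened helper declared above. This is illustrative only: `render_token` is a hypothetical wrapper, and a valid `llama_context` is assumed to exist elsewhere.

```cpp
// Usage sketch (assumes common/common.h from the diff above is available).
#include "common.h"

#include <string>

std::string render_token(const llama_context * ctx, llama_token id, bool show_special) {
    // special = true renders control tokens (BOS/EOS, chat-template markers) as text;
    // special = false leaves them out, which is what user-facing output usually wants.
    return llama_token_to_piece(ctx, id, show_special);
}
```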
common/sampling.cpp

@@ -1,4 +1,6 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>
 
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();

@@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
+    llama_sampling_set_rng_seed(result, params.seed);
+
     return result;
 }
 

@@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
     ctx->cur.clear();
 }
 
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
     if (dst->grammar) {
         llama_grammar_free(dst->grammar);

@@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(
 
         sampler_queue(ctx_main, params, cur_p, min_keep);
 
-        id = llama_sample_token(ctx_main, &cur_p);
+        id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
 
         //{
         //    const int n_top = 10;
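The hunks above move seeding from the global `llama_set_rng_seed` call to an RNG owned by each sampling context. A self-contained sketch of the same idea, using hypothetical names rather than the llama.cpp types:

```cpp
// Standalone sketch of a per-context RNG (hypothetical struct, not the llama.cpp API).
#include <cstdint>
#include <ctime>
#include <random>

constexpr uint32_t DEFAULT_SEED = 0xFFFFFFFF; // "pick a seed for me"

struct sampling_ctx {
    std::mt19937 rng; // each context owns its own generator
};

void set_rng_seed(sampling_ctx & ctx, uint32_t seed) {
    if (seed == DEFAULT_SEED) {
        seed = (uint32_t) time(nullptr); // same fallback as the diff
    }
    ctx.rng.seed(seed);
}

int sample_index(sampling_ctx & ctx, int n_candidates) {
    // drawing from the context-local rng keeps parallel slots independent and reproducible
    std::uniform_int_distribution<int> dist(0, n_candidates - 1);
    return dist(ctx.rng);
}
```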
common/sampling.h

@@ -4,9 +4,10 @@
 
 #include "grammar-parser.h"
 
+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 // sampler types
 enum class llama_sampler_type : char {

@@ -39,6 +40,7 @@ typedef struct llama_sampling_params {
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,

@@ -79,6 +81,8 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token>      prev;
     std::vector<llama_token_data> cur;
 
+    std::mt19937 rng;
+
 };
 
 #include "common.h"

@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);
 
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
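Given the new `seed` field and setter declared above, per-slot reproducibility would be wired up roughly like this; a sketch under the assumption that the rest of the common sampling API is unchanged.

```cpp
// Sketch: seed one sampling context without affecting any other context.
#include "sampling.h"

llama_sampling_context * make_seeded_sampler(uint32_t seed) {
    llama_sampling_params sparams;
    sparams.seed = seed; // consumed by llama_sampling_init via llama_sampling_set_rng_seed

    llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

    // the seed can also be changed later, again only for this context
    llama_sampling_set_rng_seed(ctx_sampling, seed);

    return ctx_sampling;
}
```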
convert-hf-to-gguf.py

@@ -363,6 +363,16 @@ class Model(ABC):
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            print(
+                f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
+            )
+            for i in range(1, pad_count + 1):
+                tokens.append(f"[PAD{i}]")
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
         assert len(tokens) == vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")

@@ -1789,6 +1799,12 @@ class QwenModel(Model):
 class Qwen2Model(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2
 
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
 
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):

@@ -1979,6 +1995,91 @@ class Phi2Model(Model):
         self.gguf_writer.add_add_bos_token(False)
 
 
+@Model.register("Phi3ForCausalLM")
+class Phi3MiniModel(Model):
+    model_arch = gguf.MODEL_ARCH.PHI3
+
+    def set_vocab(self):
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)
+
+        tokenizer = SentencePieceProcessor(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
+        rot_pct = 1.0
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        rms_eps = self.find_hparam(["rms_norm_eps"])
+
+        self.gguf_writer.add_name("Phi3")
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(8192)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_file_type(self.ftype)
+
+
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
     model_arch = gguf.MODEL_ARCH.PLAMO
@ -30,7 +30,6 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
llama_set_rng_seed(ctx, params.seed);
|
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
|
|
|
@ -38,7 +38,6 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
llama_set_rng_seed(ctx, params.seed);
|
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
|
|
|
@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
session_tokens.resize(n_token_count_out);
|
session_tokens.resize(n_token_count_out);
|
||||||
llama_set_rng_seed(ctx, params.seed);
|
|
||||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -881,11 +881,11 @@
|
||||||
.replace(/&/g, '&')
|
.replace(/&/g, '&')
|
||||||
.replace(/</g, '<')
|
.replace(/</g, '<')
|
||||||
.replace(/>/g, '>')
|
.replace(/>/g, '>')
|
||||||
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
|
.replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
|
||||||
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
.replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||||
.replace(/__(.*?)__/g, '<strong>$1</strong>')
|
.replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||||
.replace(/\*(.*?)\*/g, '<em>$1</em>')
|
.replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||||
.replace(/_(.*?)_/g, '<em>$1</em>')
|
.replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||||
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
||||||
.replace(/`(.*?)`/g, '<code>$1</code>')
|
.replace(/`(.*?)`/g, '<code>$1</code>')
|
||||||
.replace(/\n/gim, '<br />');
|
.replace(/\n/gim, '<br />');
|
||||||
|
|
|
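The rewritten patterns all append the same lookahead, `(?=([^`]*`[^`]*`)*[^`]*$)`, so a marker only matches when an even number of backticks follows it, i.e. when it is not inside an inline code span. A small C++ check of that guard (std::regex uses ECMAScript grammar, which supports this lookahead); the strings are made up for illustration:

```cpp
// Sketch: bold markers inside `inline code` should survive untouched.
#include <iostream>
#include <regex>
#include <string>

int main() {
    // same guard as the diff: match only if an even number of backticks follows
    const std::regex bold(R"(\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$))");

    const std::string in = "**bold** and `**not bold**`";
    std::cout << std::regex_replace(in, bold, "<strong>$1</strong>") << "\n";
    // expected: <strong>bold</strong> and `**not bold**`
    return 0;
}
```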
examples/server/server.cpp

@@ -854,7 +854,7 @@ struct server_context {
         slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
         slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
-        slot.params.seed = json_value(data, "seed", default_params.seed);
+        slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
         slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
         slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
 

@@ -1028,7 +1028,6 @@ struct server_context {
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
-            llama_set_rng_seed(ctx, slot.params.seed);
         }
 
         slot.command = SLOT_COMMAND_LOAD_PROMPT;

@@ -1118,7 +1117,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it
examples/server/tests/features/results.feature (new file, 57 lines)

@@ -0,0 +1,57 @@
+@llama.cpp
+@results
+Feature: Results
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+    And   a model file test-model-00001-of-00003.gguf
+    And   128 as batch size
+    And   256 KV cache size
+    And   128 max tokens to predict
+
+  Scenario Outline: Multi users completion
+    Given <n_slots> slots
+    And   continuous batching
+    Then  the server is starting
+    Then  the server is healthy
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given concurrent completion requests
+    Then the server is busy
+    Then the server is idle
+    And all slots are idle
+    Then all predictions are equal
+    Examples:
+      | n_slots |
+      | 1 |
+      | 2 |
examples/server/tests/features/steps/steps.py

@@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
     context.server_metrics = False
     context.server_process = None
     context.seed = None
+    context.draft = None
     context.server_seed = None
     context.user_api_key = None
     context.response_format = None

@@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl
 
 
+@step('{draft:d} as draft')
+def step_draft(context, draft):
+    context.draft = draft
+
+
 @step('{n_ctx:d} KV cache size')
 def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx

@@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
     assert_n_tokens_predicted(context.completion, predicted_n)
 
 
+@step('all predictions are equal')
+@async_run_until_complete
+async def step_predictions_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_predictions_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')

@@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
         assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
                                                      f' {n_predicted} <> {expected_predicted_n}')
 
+
+def assert_all_predictions_equal(completion_responses):
+    content_0 = completion_responses[0]['content']
+
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        print(f"content 0: {content_0}")
+
+    i = 1
+    for response in completion_responses[1:]:
+        content = response['content']
+
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            print(f"content {i}: {content}")
+
+        assert content == content_0, "contents not equal"
+
+        i += 1
+
 
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)

@@ -1148,6 +1180,8 @@ def start_server_background(context):
         server_args.extend(['--ubatch-size', context.n_ubatch])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
+    if context.draft is not None:
+        server_args.extend(['--draft', context.draft])
     if context.server_continuous_batching:
         server_args.append('--cont-batching')
     if context.server_embeddings:
flake.lock (generated, 6 changes)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1712791164,
-        "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=",
+        "lastModified": 1713537308,
+        "narHash": "sha256-XtTSSIB2DA6tOv+l0FhvfDMiyCmhoRbNB+0SeInZkbk=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5",
+        "rev": "5c24cf2f0a12ad855f444c30b2421d044120c66f",
         "type": "github"
       },
       "original": {
ggml-alloc.c (16 changes)

@@ -371,16 +371,16 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
     GGML_ASSERT(galloc->buffers != NULL);
 
-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
     for (int i = 0; i < n_bufs; i++) {

@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         free(galloc->hash_set.keys);
         free(galloc->hash_values);
         galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
-        galloc->hash_values   = calloc(sizeof(struct hash_node), hash_size);
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
         GGML_ASSERT(galloc->hash_set.keys != NULL);
         GGML_ASSERT(galloc->hash_values != NULL);
     } else {

@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
         free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
         GGML_ASSERT(galloc->node_allocs != NULL);
     }
     galloc->n_nodes = graph->n_nodes;

@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
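All of these hunks only reorder the arguments: `calloc(nmemb, size)` takes the element count first. Either order allocates the same zero-initialized block, but the conventional order states the intent and keeps newer compilers' calloc argument checks quiet. A toy illustration, with made-up names:

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    const size_t n_bufs = 8;

    // count first, element size second - the order the hunks above switch to
    int ** bufs = (int **) calloc(n_bufs, sizeof(int *));

    // legal, but reads as "N*size bytes, 1 element"; some compilers warn about it
    int ** bufs_old_style = (int **) calloc(sizeof(int *) * n_bufs, 1);

    printf("%p %p\n", (void *) bufs, (void *) bufs_old_style);

    free(bufs);
    free(bufs_old_style);
    return 0;
}
```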
ggml-backend.c

@@ -1725,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
     // initialize hash table
     sched->hash_set          = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
-    sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {

@@ -1972,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
-    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
ggml-impl.h (260 changes)

@@ -45,7 +45,7 @@ extern "C" {
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
+#if defined(__ARM_NEON)
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //

@@ -53,8 +53,262 @@ extern "C" {
 //
 #include <arm_neon.h>
 
+#ifdef _MSC_VER
+
+typedef uint16_t ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
 typedef __fp16 ggml_fp16_internal_t;
 
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+    return vcombine_s32(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8
+#define ggml_vqtbl1q_u8   vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#if defined(__ARM_NEON) && !defined(__MSC_VER)
+
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

@@ -75,8 +329,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 
 #else
 
-typedef uint16_t ggml_fp16_internal_t;
-
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else

@@ -221,7 +473,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 
 #endif // __F16C__
 
-#endif // __ARM_NEON
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
 
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
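For reference, the quantity the `ggml_vdotq_s32` path accumulates is the plain int8 dot product of two 16-byte vectors; the native SDOT instruction and the fallback above spread the partial sums across the four int32 lanes differently, but the horizontal total is the same. A portable scalar reference (illustrative only, not part of the header):

```cpp
#include <cstdint>

// Scalar reference for a 16-element int8 dot product.
int32_t dot_s8_16(const int8_t a[16], const int8_t b[16]) {
    int32_t sum = 0;
    for (int i = 0; i < 16; ++i) {
        sum += (int32_t) a[i] * (int32_t) b[i];
    }
    return sum;
}
```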
287
ggml-quants.c
287
ggml-quants.c
|
@ -14,41 +14,6 @@
|
||||||
#include <stdlib.h> // for qsort
|
#include <stdlib.h> // for qsort
|
||||||
#include <stdio.h> // for GGML_ASSERT
|
#include <stdio.h> // for GGML_ASSERT
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
|
||||||
|
|
||||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
||||||
//
|
|
||||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
||||||
//
|
|
||||||
#include <arm_neon.h>
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
|
||||||
#include <wasm_simd128.h>
|
|
||||||
#else
|
|
||||||
#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
|
|
||||||
#include <altivec.h>
|
|
||||||
#undef bool
|
|
||||||
#define bool _Bool
|
|
||||||
#else
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
||||||
#include <intrin.h>
|
|
||||||
#else
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
|
||||||
#if !defined(__riscv)
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __riscv_v_intrinsic
|
|
||||||
#include <riscv_vector.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef MIN
|
#undef MIN
|
||||||
#undef MAX
|
#undef MAX
|
||||||
|
|
||||||
|
@ -276,258 +241,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||||
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
||||||
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||||
|
|
||||||
#if defined(__ARM_NEON)
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
|
|
||||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined(__aarch64__)
|
|
||||||
|
|
||||||
// 64-bit compatibility
|
|
||||||
|
|
||||||
// vaddvq_s16
|
|
||||||
// vpaddq_s16
|
|
||||||
// vpaddq_s32
|
|
||||||
// vaddvq_s32
|
|
||||||
// vaddvq_f32
|
|
||||||
// vmaxvq_f32
|
|
||||||
// vcvtnq_s32_f32
|
|
||||||
// vzip1_u8
|
|
||||||
// vzip2_u8
|
|
||||||
|
|
||||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
|
||||||
return
|
|
||||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
|
||||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
|
||||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
|
||||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
|
||||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
|
||||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
|
||||||
return vcombine_s16(a0, b0);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
|
||||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
|
||||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
|
||||||
return vcombine_s32(a0, b0);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
||||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static float vaddvq_f32(float32x4_t v) {
|
|
||||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static float vmaxvq_f32(float32x4_t v) {
|
|
||||||
return
|
|
||||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
|
||||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
||||||
int32x4_t res;
|
|
||||||
|
|
||||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
|
||||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
|
||||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
|
||||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
|
||||||
uint8x8_t res;
|
|
||||||
|
|
||||||
res[0] = a[0]; res[1] = b[0];
|
|
||||||
res[2] = a[1]; res[3] = b[1];
|
|
||||||
res[4] = a[2]; res[5] = b[2];
|
|
||||||
res[6] = a[3]; res[7] = b[3];
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
|
||||||
uint8x8_t res;
|
|
||||||
|
|
||||||
res[0] = a[4]; res[1] = b[4];
|
|
||||||
res[2] = a[5]; res[3] = b[5];
|
|
||||||
res[4] = a[6]; res[5] = b[6];
|
|
||||||
res[6] = a[7]; res[7] = b[7];
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
// vld1q_s16_x2
|
|
||||||
// vld1q_u8_x2
|
|
||||||
// vld1q_u8_x4
|
|
||||||
// vld1q_s8_x2
|
|
||||||
// vld1q_s8_x4
|
|
||||||
// TODO: double-check these work correctly
|
|
||||||
|
|
||||||
typedef struct ggml_int16x8x2_t {
|
|
||||||
int16x8_t val[2];
|
|
||||||
} ggml_int16x8x2_t;
|
|
||||||
|
|
||||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
    ggml_int16x8x2_t res;

    res.val[0] = vld1q_s16(ptr + 0);
    res.val[1] = vld1q_s16(ptr + 8);

    return res;
}

typedef struct ggml_uint8x16x2_t {
    uint8x16_t val[2];
} ggml_uint8x16x2_t;

inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
    ggml_uint8x16x2_t res;

    res.val[0] = vld1q_u8(ptr + 0);
    res.val[1] = vld1q_u8(ptr + 16);

    return res;
}

typedef struct ggml_uint8x16x4_t {
    uint8x16_t val[4];
} ggml_uint8x16x4_t;

inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
    ggml_uint8x16x4_t res;

    res.val[0] = vld1q_u8(ptr + 0);
    res.val[1] = vld1q_u8(ptr + 16);
    res.val[2] = vld1q_u8(ptr + 32);
    res.val[3] = vld1q_u8(ptr + 48);

    return res;
}

typedef struct ggml_int8x16x2_t {
    int8x16_t val[2];
} ggml_int8x16x2_t;

inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
    ggml_int8x16x2_t res;

    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);

    return res;
}

typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
} ggml_int8x16x4_t;

inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    ggml_int8x16x4_t res;

    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);
    res.val[2] = vld1q_s8(ptr + 32);
    res.val[3] = vld1q_s8(ptr + 48);

    return res;
}

// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

#else

#define ggml_int16x8x2_t  int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t  int8x16x2_t
#define ggml_int8x16x4_t  int8x16x4_t

#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2  vld1q_u8_x2
#define ggml_vld1q_u8_x4  vld1q_u8_x4
#define ggml_vld1q_s8_x2  vld1q_s8_x2
#define ggml_vld1q_s8_x4  vld1q_s8_x4
#define ggml_vqtbl1q_s8   vqtbl1q_s8
#define ggml_vqtbl1q_u8   vqtbl1q_u8

#endif

#if !defined(__ARM_FEATURE_DOTPROD)

inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}

#else

#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)

#endif

#endif
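For orientation, here is a minimal sketch (not part of the diff; the helper name dot32 is hypothetical, and AArch64 NEON plus <arm_neon.h>/<stdint.h> are assumed) of how the compat loads and ggml_vdotq_s32 are typically combined: on CPUs with the dotprod extension the call maps directly to vdotq_s32, otherwise the widening-multiply emulation above is used.

// editor's sketch (hypothetical helper, not in the diff): 32-element int8 dot product
static inline int32_t dot32(const int8_t * x, const int8_t * y) {
    const ggml_int8x16x2_t vx = ggml_vld1q_s8_x2(x);
    const ggml_int8x16x2_t vy = ggml_vld1q_s8_x2(y);

    int32x4_t acc = vdupq_n_s32(0);
    acc = ggml_vdotq_s32(acc, vx.val[0], vy.val[0]); // first 16 products -> 4 partial sums
    acc = ggml_vdotq_s32(acc, vx.val[1], vy.val[1]); // next 16 products
    return vaddvq_s32(acc);                          // horizontal sum (AArch64)
}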
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
#define B1(c,s,n)  0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
8 ggml.c

@@ -10825,7 +10825,7 @@ static void ggml_compute_forward_mul_mat(
 #endif

 #if GGML_USE_LLAMAFILE
-    if (nb10 == ggml_type_size(src1->type)) {
+    if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10878,15 +10878,13 @@ UseGgmlGemm1:;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);

 #if GGML_USE_LLAMAFILE
-    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+    if (src1->type != vec_dot_type) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
-                                     (const char *)wdata + ggml_row_size(vec_dot_type,
-                                         nb12/ggml_type_size(src1->type)*i12 +
-                                         nb13/ggml_type_size(src1->type)*i13),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
@@ -124,6 +124,7 @@ class MODEL_ARCH(IntEnum):
     QWEN2 = auto()
     QWEN2MOE = auto()
     PHI2 = auto()
+    PHI3 = auto()
     PLAMO = auto()
     CODESHELL = auto()
     ORION = auto()
@@ -200,6 +201,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.QWEN2: "qwen2",
     MODEL_ARCH.QWEN2MOE: "qwen2moe",
     MODEL_ARCH.PHI2: "phi2",
+    MODEL_ARCH.PHI3: "phi3",
     MODEL_ARCH.PLAMO: "plamo",
     MODEL_ARCH.CODESHELL: "codeshell",
     MODEL_ARCH.ORION: "orion",
@@ -550,6 +552,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.PHI3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.CODESHELL: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.POS_EMBD,
@@ -117,6 +117,7 @@ class TensorNameMap:
             "h.{bid}.attn.c_attn", # gpt2
             "transformer.h.{bid}.mixer.Wqkv", # phi2
             "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
+            "model.layers.{bid}.self_attn.qkv_proj" # phi3
         ),

         # Attention query
@@ -234,6 +235,7 @@ class TensorNameMap:
             "h.{bid}.mlp.c_fc", # gpt2
             "transformer.h.{bid}.mlp.fc1", # phi2
             "model.layers.{bid}.mlp.fc1", # phi2
+            "model.layers.{bid}.mlp.gate_up_proj", # phi3
             "model.layers.layers.{bid}.mlp.up_proj", # plamo
             "model.layers.{bid}.feed_forward.w3", # internlm2
             "encoder.layers.{bid}.mlp.fc11", # nomic-bert
216 llama.cpp

@@ -211,6 +211,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -793,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -3955,6 +3974,16 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -4340,7 +4369,7 @@ static void llm_load_vocab(
             }
         }

-    // find EOT token: "<|eot_id|>", "<|im_emd|>", "<end_of_turn>", etc.
+    // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
     //
     // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
     // for now, we apply this workaround to find the EOT token based on its text
@@ -4351,7 +4380,8 @@ static void llm_load_vocab(
                     // need to fix convert script
                     //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                     (t.first == "<|eot_id|>" ||
-                     t.first == "<|im_emd|>" ||
+                     t.first == "<|im_end|>" ||
+                     t.first == "<|end|>" ||
                      t.first == "<end_of_turn>"
                     )
                 ) {
@@ -5375,6 +5405,33 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_PHI3:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context* ctx_layer = ctx_for_layer(i);
+                        ggml_context* ctx_split = ctx_for_layer_split(i);
+
+                        auto& layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6326,7 +6383,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

-        if (model.arch == LLM_ARCH_PHI2) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -8967,12 +9024,140 @@ struct llm_build_context {

         cur = ggml_add(ctx0, cur, model.output_b);
         cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
+    struct ggml_cgraph * build_phi3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            auto residual = inpL;
+
+            // self-attention
+            {
+                struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(attn_norm_output, "attn_norm", il);
+
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv) {
+                    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+                }
+                else {
+                    Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                    model.layers[il].wo, NULL,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, residual);
+            residual = cur;
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // FF
+            // special-case: the up and gate tensors are merged into a single tensor
+            // TOOD: support into llm_build_ffn
+            {
+                struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+                cb(up, "ffn_up", il);
+
+                auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+                auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
+
+                y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+                cb(y, "ffn_gate", il);
+
+                auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+                cb(down, "ffn_down", il);
+
+                cur = down;
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, residual, cur);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+            model.output_norm,
+            NULL,
+            LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

         return gf;
     }

     struct ggml_cgraph * build_plamo() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);
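To make the merged gate/up handling in build_phi3() above easier to follow, here is an editor's scalar sketch (not part of the diff; swiglu_row is a hypothetical name): each row produced by the ffn_up matmul holds the gate activations in its first half and the up projection in its second half, and the output fed to ffn_down is up * silu(gate), exactly what the two ggml_view_2d halves compute.

// editor's sketch, hypothetical helper -- scalar equivalent of the merged gate/up split above
#include <math.h>
#include <stddef.h>

static void swiglu_row(const float * gate_up, float * out, size_t n_ff) {
    const float * g = gate_up;        // first half of the row: gate
    const float * u = gate_up + n_ff; // second half of the row: up
    for (size_t i = 0; i < n_ff; ++i) {
        out[i] = u[i] * (g[i] / (1.0f + expf(-g[i]))); // up * silu(gate)
    }
}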
@@ -10474,6 +10659,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PHI3:
+            {
+                result = llm.build_phi3();
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 result = llm.build_plamo();
@@ -13478,7 +13667,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     return result;
 }

-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(ctx);

     const int64_t t_start_sample_us = ggml_time_us();
@@ -13491,7 +13680,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     }

     std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
     int idx = dist(rng);

     llama_token result = candidates->data[idx].id;
@@ -13501,6 +13689,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }

+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();

@@ -15393,6 +15585,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -15406,6 +15599,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }

+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->hparams.n_vocab;
 }
@@ -17098,6 +17295,15 @@ LLAMA_API int32_t llama_chat_get_model_template(
         if (model_template.empty()) {
             model_template = get_meta(default_meta);
         }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // default template
         model_template = get_meta(default_meta);
13 llama.h

@@ -408,7 +408,9 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);

-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);

     LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
@@ -1057,7 +1059,7 @@ extern "C" {
             struct llama_context * ctx,
             llama_token_data_array * candidates);

-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
@@ -1144,8 +1146,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL

-#include <vector>
+#include <random>
 #include <string>
+#include <vector>

 struct ggml_tensor;

@@ -1182,6 +1185,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8 partial_start);

+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
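The new internal entry point lets a caller keep sampling reproducible per sequence instead of funneling everything through the shared context RNG. As a hedged illustration (not from the diff; sample_per_sequence and the surrounding names are hypothetical), a multi-sequence caller might do something like this:

// editor's sketch -- requires LLAMA_API_INTERNAL, as the declaration above shows
#define LLAMA_API_INTERNAL
#include "llama.h"

#include <random>
#include <vector>

// hypothetical helper: one candidates array and one std::mt19937 per decoded sequence
static void sample_per_sequence(llama_context * ctx,
                                std::vector<llama_token_data_array> & cands,
                                std::vector<std::mt19937> & rngs) {
    for (size_t s = 0; s < cands.size(); ++s) {
        const llama_token tok = llama_sample_token_with_rng(ctx, &cands[s], rngs[s]);
        (void) tok; // feed the token back into decoding for sequence s
    }
}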
@@ -49,6 +49,8 @@ int main(void) {
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
         // Llama-3
         "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+        // Phi-3
+        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -77,6 +79,8 @@ int main(void) {
         "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
         // Llama 3
         "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+        // Phi 3
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
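For readers who want to try the new template outside the test, here is an editor's sketch (not part of the diff; it assumes the public llama_chat_apply_template API and that, as in the test above, a null model pointer is accepted when a template name is given explicitly):

// editor's sketch: format a chat with the built-in "phi3" template, no model loaded
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    std::vector<llama_chat_message> chat = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };

    std::vector<char> buf(1024);
    const int32_t n = llama_chat_apply_template(nullptr, "phi3", chat.data(), chat.size(),
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        printf("%.*s", n, buf.data()); // expected to match the Phi 3 string in expected_output
    }
    return 0;
}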