Merge branch 'master' into support-mamba-ssm

commit 5544f5211b

36 changed files with 46612 additions and 47293 deletions
@@ -1,5 +1,6 @@
 {
 lib,
+glibc,
 config,
 stdenv,
 mkShell,
@@ -30,6 +31,11 @@
 useRocm ? config.rocmSupport,
 useVulkan ? false,
 llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+
+# It's necessary to consistently use backendStdenv when building with CUDA support,
+# otherwise we get libstdc++ errors downstream.
+effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+enableStatic ? effectiveStdenv.hostPlatform.isStatic
 }@inputs:
 
 let
@@ -41,10 +47,7 @@ let
 versionOlder
 ;
 
-# It's necessary to consistently use backendStdenv when building with CUDA support,
-# otherwise we get libstdc++ errors downstream.
 stdenv = throw "Use effectiveStdenv instead";
-effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
 
 suffices =
 lib.optionals useBlas [ "BLAS" ]
@@ -167,6 +170,9 @@ effectiveStdenv.mkDerivation (
 # TODO: Replace with autoAddDriverRunpath
 # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
 cudaPackages.autoAddOpenGLRunpathHook
+]
+++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
+glibc.static
 ];
 
 buildInputs =
@@ -181,7 +187,7 @@ effectiveStdenv.mkDerivation (
 [
 (cmakeBool "LLAMA_NATIVE" false)
 (cmakeBool "LLAMA_BUILD_SERVER" true)
-(cmakeBool "BUILD_SHARED_LIBS" true)
+(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
 (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
 (cmakeBool "LLAMA_BLAS" useBlas)
 (cmakeBool "LLAMA_CLBLAST" useOpenCL)
@@ -190,6 +196,7 @@ effectiveStdenv.mkDerivation (
 (cmakeBool "LLAMA_METAL" useMetalKit)
 (cmakeBool "LLAMA_MPI" useMpi)
 (cmakeBool "LLAMA_VULKAN" useVulkan)
+(cmakeBool "LLAMA_STATIC" enableStatic)
 ]
 ++ optionals useCuda [
 (
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Recent API changes
 
+- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
 
 ### Hot topics
@@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
 endif()
 endif()
 
-set(GIT_INDEX "${GIT_DIR}/index")
+if(EXISTS "${GIT_DIR}/index")
+set(GIT_INDEX "${GIT_DIR}/index")
+else()
+message(WARNING "Git index not found in git repository.")
+set(GIT_INDEX "")
+endif()
 else()
 message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
 set(GIT_INDEX "")
@@ -513,12 +513,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_sequences = std::stoi(argv[i]);
-} else if (arg == "--p-accept" || arg == "-pa") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-params.p_accept = std::stof(argv[i]);
 } else if (arg == "--p-split" || arg == "-ps") {
 if (++i >= argc) {
 invalid_param = true;
@@ -1044,7 +1038,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
 printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
 printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
-printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
 printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
 printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
 printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
@@ -1300,7 +1293,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
 cparams.seed = params.seed;
 cparams.logits_all = params.logits_all;
-cparams.embedding = params.embedding;
+cparams.embeddings = params.embedding;
 cparams.rope_scaling_type = params.rope_scaling_type;
 cparams.rope_freq_base = params.rope_freq_base;
 cparams.rope_freq_scale = params.rope_freq_scale;
@@ -43,7 +43,7 @@ extern char const *LLAMA_BUILD_TARGET;
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-uint32_t seed = -1; // RNG seed
+uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
 int32_t n_threads = get_num_physical_cores();
 int32_t n_threads_draft = -1;
@@ -53,11 +53,10 @@ struct gpt_params {
 int32_t n_ctx = 512; // context size
 int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
-int32_t n_draft = 8; // number of tokens to draft during speculative decoding
+int32_t n_draft = 5; // number of tokens to draft during speculative decoding
 int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
 int32_t n_parallel = 1; // number of parallel sequences to decode
 int32_t n_sequences = 1; // number of sequences to decode
-float p_accept = 0.5f; // speculative decoding accept probability
 float p_split = 0.1f; // speculative decoding split probability
 int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
@@ -295,6 +295,77 @@ static llama_token llama_sampling_sample_impl(
 return id;
 }
 
+static llama_token_data_array llama_sample_probability_distribution_impl(
+struct llama_sampling_context * ctx_sampling,
+struct llama_context * ctx_main,
+struct llama_context * ctx_cfg,
+const int idx) {
+const llama_sampling_params & params = ctx_sampling->params;
+
+const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+const float penalty_repeat = params.penalty_repeat;
+const float penalty_freq = params.penalty_freq;
+const float penalty_present = params.penalty_present;
+const bool penalize_nl = params.penalize_nl;
+
+auto & prev = ctx_sampling->prev;
+auto & cur = ctx_sampling->cur;
+
+// Get a pointer to the logits
+float * logits = llama_get_logits_ith(ctx_main, idx);
+
+// Declare original_logits at the beginning of the function scope
+std::vector<float> original_logits;
+
+// apply params.logit_bias map
+for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+logits[it->first] += it->second;
+}
+
+if (ctx_cfg) {
+float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+}
+
+cur.clear();
+
+for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+}
+
+llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+
+// apply penalties
+const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+if (penalty_tokens_used_size) {
+const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+
+llama_sample_repetition_penalties(ctx_main, &cur_p,
+penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+
+if (!penalize_nl) {
+for (size_t idx = 0; idx < cur_p.size; idx++) {
+if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+cur_p.data[idx].logit = nl_logit;
+break;
+}
+}
+}
+}
+
+// apply grammar checks
+if (ctx_sampling->grammar != NULL) {
+llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+}
+
+llama_sample_softmax(ctx_main, &cur_p);
+return cur_p;
+}
+
 llama_token llama_sampling_sample(
 struct llama_sampling_context * ctx_sampling,
 struct llama_context * ctx_main,
@@ -304,6 +375,14 @@ llama_token llama_sampling_sample(
 return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
 }
 
+llama_token_data_array llama_sampling_probability_distribution(
+struct llama_sampling_context * ctx_sampling,
+struct llama_context * ctx_main,
+struct llama_context * ctx_cfg,
+const int idx) {
+return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
+}
+
 void llama_sampling_accept(
 struct llama_sampling_context * ctx_sampling,
 struct llama_context * ctx_main,
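Note on the hunk above: the new llama_sample_probability_distribution_impl builds a full, normalized distribution over the vocabulary (logit bias, repetition penalties, optional grammar constraints, then softmax). As a rough, self-contained sketch of just the final softmax step — using stand-in types rather than llama.cpp's API — the computation looks like this:

    // Sketch of softmax over token logits; TokenData is a stand-in, not a llama.cpp type.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct TokenData { int id; float logit; float p; };

    static void softmax(std::vector<TokenData> & cand) {
        // subtract the max logit for numerical stability before exponentiating
        float max_logit = cand[0].logit;
        for (const auto & c : cand) max_logit = std::max(max_logit, c.logit);
        double sum = 0.0;
        for (auto & c : cand) { c.p = std::exp(c.logit - max_logit); sum += c.p; }
        for (auto & c : cand) c.p = (float)(c.p / sum);
    }

    int main() {
        std::vector<TokenData> cand = { {0, 1.0f, 0.0f}, {1, 2.0f, 0.0f}, {2, 0.5f, 0.0f} };
        softmax(cand);
        for (const auto & c : cand) std::printf("token %d: p = %.3f\n", c.id, c.p);
        return 0;
    }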
@@ -131,6 +131,13 @@ llama_token llama_sampling_sample(
 struct llama_context * ctx_cfg,
 int idx = 0);
 
+// returns the probability that token of given id will be sampled
+llama_token_data_array llama_sampling_probability_distribution(
+struct llama_sampling_context * ctx_sampling,
+struct llama_context * ctx_main,
+struct llama_context * ctx_cfg,
+int idx = 0);
+
 void llama_sampling_accept(
 struct llama_sampling_context * ctx_sampling,
 struct llama_context * ctx_main,
@@ -36,8 +36,10 @@ class SentencePieceTokenTypes(IntEnum):
 UNUSED = 5
 BYTE = 6
 
+
 AnyModel = TypeVar("AnyModel", bound="type[Model]")
 
+
 class Model(ABC):
 _model_classes: dict[str, type[Model]] = {}
 
@@ -187,6 +189,7 @@ class Model(ABC):
 @classmethod
 def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
 assert names
+
 def func(modelcls: type[Model]):
 for name in names:
 cls._model_classes[name] = modelcls
@@ -19,11 +19,11 @@ static std::vector<std::string> split_lines(const std::string & s) {
 
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
 for (size_t i = 0; i < tokens.size(); i++) {
-llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
 }
 }
 
-static void normalize(float * vec, float * out, int n) {
+static void normalize(const float * vec, float * out, int n) {
 float norm = 0;
 for (int i = 0; i < n; i++) {
 norm += vec[i] * vec[i];
@@ -45,10 +45,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 }
 
 // normalize on copy
-for (int k = 0; k < n_seq; k++) {
-float * emb = llama_get_embeddings_ith(ctx, k);
-float * out = output + k * n_embd;
-normalize(emb, out, n_embd);
+for (int i = 0; i < batch.n_tokens; i++) {
+if (!batch.logits[i]) {
+continue;
+}
+
+// try to get sequence embeddings - supported only when pooling_type is not NONE
+const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+if (embd == NULL) {
+embd = llama_get_embeddings_ith(ctx, i);
+if (embd == NULL) {
+fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+continue;
+}
+}
+
+float * out = output + batch.seq_id[i][0] * n_embd;
+normalize(embd, out, n_embd);
 }
 }
 
@@ -132,7 +145,7 @@ int main(int argc, char ** argv) {
 
 // initialize batch
 const int n_prompts = prompts.size();
-struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
+struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
 // allocate output
 const int n_embd = llama_n_embd(model);
@@ -145,6 +158,7 @@ int main(int argc, char ** argv) {
 for (int k = 0; k < n_prompts; k++) {
 // clamp to n_batch tokens
 auto & inp = inputs[k];
+
 const uint64_t n_toks = inp.size();
 
 // encode if at capacity
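The embedding example above now fetches sequence-level embeddings and L2-normalizes them into the output buffer, so a plain dot product of two outputs is their cosine similarity. A minimal standalone sketch of that relationship (illustrative helper names, not the example's own functions):

    // L2 normalization followed by a dot product equals cosine similarity.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void l2_normalize(const float * src, float * dst, int n) {
        double norm = 0.0;
        for (int i = 0; i < n; i++) norm += (double)src[i] * src[i];
        const float inv = norm > 0.0 ? (float)(1.0 / std::sqrt(norm)) : 0.0f;
        for (int i = 0; i < n; i++) dst[i] = src[i] * inv;
    }

    static float dot(const std::vector<float> & a, const std::vector<float> & b) {
        float s = 0.0f;
        for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
        return s; // cosine similarity once both inputs are L2-normalized
    }

    int main() {
        std::vector<float> a = {1.0f, 2.0f, 3.0f}, b = {2.0f, 4.0f, 6.0f};
        std::vector<float> na(3), nb(3);
        l2_normalize(a.data(), na.data(), 3);
        l2_normalize(b.data(), nb.data(), 3);
        std::printf("cosine similarity: %.3f\n", dot(na, nb)); // ~1.000
        return 0;
    }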
@@ -511,6 +511,14 @@ int main(int argc, char ** argv) {
 std::vector<llama_token> embd;
 std::vector<llama_token> embd_guidance;
 
+// tokenized antiprompts
+std::vector<std::vector<llama_token>> antiprompt_ids;
+
+antiprompt_ids.reserve(params.antiprompt.size());
+for (const std::string & antiprompt : params.antiprompt) {
+antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+}
+
 struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
 while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
@@ -769,6 +777,18 @@ int main(int argc, char ** argv) {
 }
 }
 
+// check for reverse prompt using special tokens
+llama_token last_token = llama_sampling_last(ctx_sampling);
+for (std::vector<llama_token> ids : antiprompt_ids) {
+if (ids.size() == 1 && last_token == ids[0]) {
+if (params.interactive) {
+is_interacting = true;
+}
+is_antiprompt = true;
+break;
+}
+}
+
 if (is_antiprompt) {
 LOG("found antiprompt: %s\n", last_output.c_str());
 }
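For context on the main-example hunk above: antiprompts are tokenized once up front, and after each sampled token the last token id is compared against every single-token antiprompt. A small sketch of that check, with plain integers standing in for llama.cpp token ids and the tokenizer:

    // Single-token antiprompt check; ids are faked, no llama.cpp calls involved.
    #include <cstdio>
    #include <vector>

    int main() {
        // pretend these are the results of tokenizing each antiprompt string
        std::vector<std::vector<int>> antiprompt_ids = { {42}, {7, 9} };
        int last_token = 42; // last sampled token id

        bool is_antiprompt = false;
        for (const auto & ids : antiprompt_ids) {
            // only single-token antiprompts can match a single sampled token
            if (ids.size() == 1 && last_token == ids[0]) {
                is_antiprompt = true;
                break;
            }
        }
        std::printf("antiprompt hit: %s\n", is_antiprompt ? "yes" : "no");
        return 0;
    }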
examples/server-embd.py  (new file, 34 lines)
@@ -0,0 +1,34 @@
+import asyncio
+import requests
+import numpy as np
+
+n = 8
+
+result = []
+
+async def requests_post_async(*args, **kwargs):
+    return await asyncio.to_thread(requests.post, *args, **kwargs)
+
+async def main():
+    model_url = "http://127.0.0.1:6900"
+    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
+        url= f"{model_url}/embedding",
+        json= {"content": str(i)*1024}
+    ) for i in range(n)])
+
+    for response in responses:
+        embedding = response.json()["embedding"]
+        print(embedding[-8:])
+        result.append(embedding)
+
+asyncio.run(main())
+
+# compute cosine similarity
+
+for i in range(n-1):
+    for j in range(i+1, n):
+        embedding1 = np.array(result[i])
+        embedding2 = np.array(result[j])
+        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+        print(f"Similarity between {i} and {j}: {similarity:.2f}")
@@ -417,7 +417,7 @@ struct llama_server_context
 int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
 if (res < 0) {
 LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
+sparams.chat_template = "chatml";
 }
 }
 
@@ -1214,7 +1214,7 @@ struct llama_server_context
 queue_results.send(res);
 }
 
-void send_embedding(server_slot &slot)
+void send_embedding(server_slot & slot, const llama_batch & batch)
 {
 task_result res;
 res.id = slot.task_id;
@@ -1223,6 +1223,7 @@ struct llama_server_context
 res.stop = true;
 
 const int n_embd = llama_n_embd(model);
+
 if (!params.embedding)
 {
 LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
@@ -1233,12 +1234,29 @@ struct llama_server_context
 }
 else
 {
-const float *data = llama_get_embeddings(ctx);
-std::vector<float> embedding(data, data + n_embd);
-res.result_json = json
-{
-{"embedding", embedding},
-};
+for (int i = 0; i < batch.n_tokens; ++i) {
+if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+continue;
+}
+
+const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+if (embd == NULL) {
+embd = llama_get_embeddings_ith(ctx, i);
+if (embd == NULL) {
+LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
+res.result_json = json
+{
+{"embedding", std::vector<float>(n_embd, 0.0f)},
+};
+continue;
+}
+}
+
+res.result_json = json
+{
+{"embedding", std::vector<float>(embd, embd + n_embd)},
+};
+}
 }
 queue_results.send(res);
 }
@@ -1900,7 +1918,7 @@ struct llama_server_context
 
 for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
 {
-const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
 for (auto & slot : slots)
 {
@@ -1973,7 +1991,7 @@ struct llama_server_context
 // prompt evaluated for embedding
 if (slot.embedding)
 {
-send_embedding(slot);
+send_embedding(slot, batch_view);
 slot.release();
 slot.i_batch = -1;
 continue;
@@ -2055,6 +2073,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
 printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
 printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
+printf(" --pooling {none,mean,cls}\n");
+printf(" pooling type for embeddings, use model default if unspecified\n");
 printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
 printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
 printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
@@ -2295,6 +2315,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 params.yarn_beta_slow = std::stof(argv[i]);
 }
+else if (arg == "--pooling")
+{
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+std::string value(argv[i]);
+/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+else { invalid_param = true; break; }
+}
 else if (arg == "--threads" || arg == "-t")
 {
 if (++i >= argc)
@@ -2349,7 +2381,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 break;
 }
 params.n_batch = std::stoi(argv[i]);
-params.n_batch = std::min(512, params.n_batch);
 }
 else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
 {
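The new --pooling {none,mean,cls} server option selects how per-token embeddings are reduced to one vector per sequence: mean pooling averages all token embeddings, while cls keeps only the first token's. A rough sketch of the two strategies over a row-major buffer (an illustration of the idea, not the server's code path):

    // Mean vs. CLS pooling over per-token embeddings stored row-major.
    #include <cstdio>
    #include <vector>

    static std::vector<float> pool_mean(const std::vector<float> & tok_embd, int n_tokens, int n_embd) {
        std::vector<float> out(n_embd, 0.0f);
        for (int t = 0; t < n_tokens; t++)
            for (int i = 0; i < n_embd; i++)
                out[i] += tok_embd[t * n_embd + i] / n_tokens;
        return out;
    }

    static std::vector<float> pool_cls(const std::vector<float> & tok_embd, int n_embd) {
        // CLS pooling keeps only the first token's embedding
        return std::vector<float>(tok_embd.begin(), tok_embd.begin() + n_embd);
    }

    int main() {
        const int n_tokens = 2, n_embd = 3;
        std::vector<float> tok_embd = {1, 2, 3,  3, 4, 5}; // 2 tokens x 3 dims
        auto m = pool_mean(tok_embd, n_tokens, n_embd);
        auto c = pool_cls(tok_embd, n_embd);
        std::printf("mean: %.1f %.1f %.1f\n", m[0], m[1], m[2]); // 2.0 3.0 4.0
        std::printf("cls:  %.1f %.1f %.1f\n", c[0], c[1], c[2]); // 1.0 2.0 3.0
        return 0;
    }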
@@ -6,3 +6,4 @@ More info:
 
 - https://github.com/ggerganov/llama.cpp/pull/2926
 - https://github.com/ggerganov/llama.cpp/pull/3624
+- https://github.com/ggerganov/llama.cpp/pull/5625
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <set>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -18,6 +19,7 @@ struct seq_draft {
 std::vector<int> i_batch_tgt;
 
 std::vector<llama_token> tokens;
+std::vector<std::vector<llama_token_data>> dists;
 
 struct llama_sampling_context * ctx_sampling;
 };
@@ -37,12 +39,15 @@ int main(int argc, char ** argv) {
 // max number of parallel drafting sequences (i.e. tree branches)
 const int n_seq_dft = params.n_parallel;
 
-// probability threshold for accepting a token from the draft model
-const float p_accept = params.p_accept;
-
 // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
 const float p_split = params.p_split;
 
+if (params.seed == LLAMA_DEFAULT_SEED) {
+params.seed = time(NULL);
+}
+std::default_random_engine rng(params.seed);
+std::uniform_real_distribution<> u_dist;
+
 #ifndef LOG_DISABLE_LOGS
 log_set_target(log_filename_generator("speculative", "log"));
 LOG_TEE("Log start\n");
@@ -166,7 +171,9 @@ int main(int argc, char ** argv) {
 std::vector<seq_draft> drafts(n_seq_dft);
 
 params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
+if (params.sparams.temp == 0) {
+params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
+}
 
 for (int s = 0; s < n_seq_dft; ++s) {
 drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@@ -182,12 +189,15 @@ int main(int argc, char ** argv) {
 drafts[0].i_batch_tgt[0] = 0;
 
 while (true) {
+std::set<int> active_seqs = {};
+
 // print current draft sequences
 for (int s = 0; s < n_seq_dft; ++s) {
 if (!drafts[s].active) {
 continue;
 }
 
+active_seqs.insert(s);
 const auto & tokens = drafts[s].tokens;
 
 LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
@@ -196,48 +206,156 @@ int main(int argc, char ** argv) {
 int i_dft = 0;
 int s_keep = 0;
 
+llama_token token_id;
+std::string token_str;
+
+// loop until we fail to accept a drafted token or we run out of drafted tokens
 while (true) {
-LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-
-// sample from the target model
-llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
-
-llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
-
-//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
-
-const std::string token_str = llama_token_to_piece(ctx_tgt, id);
-
-if (!params.use_color) {
-printf("%s", token_str.c_str());
-}
-
-if (id == llama_token_eos(model_tgt)) {
-has_eos = true;
-}
-
-++n_predict;
 
 // check if the target token matches any of the drafts
+// for stochastic sampling, attempt to match the token with the drafted tokens
 {
-bool matches = false;
-
-for (int s = 0; s < n_seq_dft; ++s) {
-if (!drafts[s].active) {
-continue;
+bool accept = false;
+if (params.sparams.temp > 0) {
+// stochastic verification
+
+llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
+float p_tgt = 0, p_dft = 0;
+
+// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
+
+while (active_seqs.size() > 0) {
+// randomly select a sequence to verify from active sequences
+std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
+int s = *std::next(active_seqs.begin(), u_int_dist(rng));
+if (i_dft >= (int) drafts[s].tokens.size()) {
+drafts[s].active = false;
+active_seqs.erase(s);
+continue;
+}
+if (accept) {
+// if we already accepted a token, we can skip the rest
+if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
+drafts[s].active = false;
+active_seqs.erase(s);
+}
+continue;
+}
+LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+float r = u_dist(rng);
+llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
+// acquire the token probabilities assigned by the draft and target models
+for (size_t i = 0; i < dist_tgt.size; i++) {
+if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
+p_tgt = dist_tgt.data[i].p;
+}
+if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
+p_dft = dist_dft.data[i].p;
+}
+if (p_tgt && p_dft) {
+break;
+}
+}
+LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+if (r <= p_tgt / p_dft) {
+s_keep = s;
+accept = true;
+token_id = drafts[s].tokens[i_dft];
+token_str = llama_token_to_piece(ctx_tgt, token_id);
+llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+
+LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+break;
+} else {
+LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+drafts[s].active = false;
+
+// calculate residual probability
+GGML_ASSERT(dist_tgt.sorted);
+GGML_ASSERT(dist_dft.sorted);
+float sum_probs = 0.0f;
+
+// sort dist by id
+std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
+return a.id < b.id;
+});
+std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
+return a.id < b.id;
+});
+
+for (size_t i = 0; i < dist_tgt.size; i++) {
+dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
+sum_probs += dist_tgt.data[i].p;
+}
+for (size_t i = 0; i < dist_tgt.size; i++) {
+dist_tgt.data[i].p /= sum_probs;
+}
+
+// sort dist_tgt by p desc
+std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
+return a.p > b.p;
+});
+}
+
+active_seqs.erase(s);
+for(int i = 0; i < n_seq_dft; i++) {
+if (i == s) {
+continue;
+}
+if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+// synchronize active status for sequences with the same drafted token
+drafts[i].active = drafts[i].active && accept;
+if (!drafts[i].active) {
+active_seqs.erase(s);
+}
+}
+}
 }
 
-if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
-LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
-
-s_keep = s;
-matches = true;
-} else {
-drafts[s].active = false;
+if (!accept) {
+// all drafted tokens were rejected
+// sample from the target model
+LOG("all drafted tokens were rejected, sampling from residual distribution\n");
+token_id = llama_sample_token(ctx_tgt, &dist_tgt);
+llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+token_str = llama_token_to_piece(ctx_tgt, token_id);
+}
+
+} else {
+// greedy verification
+
+// sample from the target model
+LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
+
+llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+
+//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
+
+token_str = llama_token_to_piece(ctx_tgt, token_id);
+
+for (int s = 0; s < n_seq_dft; ++s) {
+if (!drafts[s].active) {
+continue;
+}
+
+if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
+LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+
+s_keep = s;
+accept = true;
+} else {
+drafts[s].active = false;
+}
 }
 }
 
-if (matches) {
+if (token_id == llama_token_eos(model_tgt)) {
+has_eos = true;
+}
+++n_predict;
+
+if (accept) {
 ++n_accept;
 ++n_past_tgt;
 ++n_past_dft;
@@ -245,17 +363,21 @@ int main(int argc, char ** argv) {
 if (params.use_color) {
 // Color token according to its origin sequence
 printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
-fflush(stdout);
+} else {
+printf("%s", token_str.c_str());
 }
+fflush(stdout);
 continue;
+} else {
+printf("%s", token_str.c_str());
+fflush(stdout);
+break;
 }
 }
-if (params.use_color) {
-printf("%s", token_str.c_str());
-}
-fflush(stdout);
+}
 
-LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+{
+LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
 
 // TODO: simplify
 {
@@ -275,21 +397,21 @@ int main(int argc, char ** argv) {
 drafts[s].active = false;
 drafts[s].tokens.clear();
 drafts[s].i_batch_tgt.clear();
+drafts[s].dists.clear();
 }
 // note: will be erased after the speculation phase
-drafts[0].tokens.push_back(id);
+drafts[0].tokens.push_back(token_id);
+drafts[0].dists.push_back(std::vector<llama_token_data>());
 drafts[0].i_batch_tgt.push_back(0);
 
 llama_batch_clear(batch_dft);
-llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
+llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
 
 llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
 // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
-llama_decode (ctx_dft, batch_dft);
+llama_decode(ctx_dft, batch_dft);
 
 ++n_past_dft;
-
-break;
 }
 
 if (n_predict > params.n_predict || has_eos) {
@@ -334,12 +456,6 @@ int main(int argc, char ** argv) {
 k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
 }
 
-if (cur_p[0].p < p_accept) {
-LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
-drafts[s].drafting = false;
-continue;
-}
-
 std::vector<int> sa(1, s);
 
 // attempt to split the branch if the probability is high enough
@@ -367,6 +483,7 @@ int main(int argc, char ** argv) {
 drafts[n_seq_cur].skip = true;
 
 drafts[n_seq_cur].tokens = drafts[s].tokens;
+drafts[n_seq_cur].dists = drafts[s].dists;
 drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
 drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
 
@@ -389,6 +506,8 @@ int main(int argc, char ** argv) {
 llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
 
 drafts[s].tokens.push_back(id);
+// save cur_p.data into drafts[s].dists
+drafts[s].dists.push_back(cur_p);
 
 // add unique drafted tokens to the target batch
 drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@@ -440,6 +559,7 @@ int main(int argc, char ** argv) {
 }
 
 drafts[s].tokens.erase(drafts[s].tokens.begin());
+drafts[s].dists.erase(drafts[s].dists.begin());
 }
 }
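The stochastic verification added to the speculative example follows the standard speculative-sampling rule: a drafted token is accepted with probability min(1, p_tgt/p_dft); if it is rejected, a replacement token is drawn from the residual distribution max(0, p_tgt - p_dft), renormalized. A self-contained sketch of that accept/resample rule on toy distributions (plain arrays, not llama.cpp structures):

    // Speculative-sampling accept/residual rule on toy target/draft distributions.
    #include <algorithm>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        std::vector<float> p_tgt = {0.5f, 0.3f, 0.2f}; // target model probabilities
        std::vector<float> p_dft = {0.2f, 0.6f, 0.2f}; // draft model probabilities
        int drafted = 1;                               // token proposed by the draft model

        std::mt19937 rng(1234);
        std::uniform_real_distribution<float> u(0.0f, 1.0f);

        if (u(rng) <= p_tgt[drafted] / p_dft[drafted]) {
            std::printf("accepted drafted token %d\n", drafted);
            return 0;
        }

        // rejected: sample from the residual distribution max(0, p_tgt - p_dft), renormalized
        std::vector<float> res(p_tgt.size());
        float sum = 0.0f;
        for (size_t i = 0; i < res.size(); i++) {
            res[i] = std::max(0.0f, p_tgt[i] - p_dft[i]);
            sum += res[i];
        }
        float r = u(rng) * sum, acc = 0.0f;
        int picked = 0;
        for (size_t i = 0; i < res.size(); i++) {
            acc += res[i];
            if (r <= acc) { picked = (int)i; break; }
        }
        std::printf("rejected, resampled token %d from residual\n", picked);
        return 0;
    }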
@@ -91,13 +91,14 @@ extern "C" {
 // (optional) complete all pending operations
 void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-// compute graph with a plan
+// create a plan for ggml_cgraph and free it
 ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
 void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
+// compute graph with a plan
+enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 // compute graph without a plan (async)
-bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
 // check if the backend supports an operation
 bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
 backend->iface.graph_plan_free(backend, plan);
 }
 
-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-backend->iface.graph_plan_compute(backend, plan);
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+return backend->iface.graph_plan_compute(backend, plan);
 }
 
-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_UNUSED(backend);
 }
 
-GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
 struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
 
 GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
 struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 cplan.abort_callback = cpu_ctx->abort_callback;
 cplan.abort_callback_data = cpu_ctx->abort_callback_data;
 
-ggml_graph_compute(cgraph, &cplan);
-return true;
+return ggml_graph_compute(cgraph, &cplan);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 return true;
 }
 
-static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
 uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
 
@@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
 uint64_t compute_start_us = ggml_time_us();
 if (!sched->callback_eval) {
-if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
-return false;
+enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+if (ec != GGML_STATUS_SUCCESS) {
+return ec;
 }
 //ggml_backend_synchronize(split_backend); // necessary to measure compute time
 } else {
@@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-if (!ggml_backend_graph_compute(split_backend, &gv)) {
-return false;
+enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+if (ec != GGML_STATUS_SUCCESS) {
+return ec;
 }
 
 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 }
 #endif
 
-return true;
+return GGML_STATUS_SUCCESS;
 }
 
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 return true;
 }
 
-bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
 if (!sched->is_reset) {
@@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
 
 ggml_backend_sched_split_graph(sched, graph);
 if (!ggml_backend_sched_alloc_splits(sched)) {
-return false;
+return GGML_STATUS_ALLOC_FAILED;
 }
 
-if (!ggml_backend_sched_compute_splits(sched)) {
-return false;
-}
-
-return true;
+return ggml_backend_sched_compute_splits(sched);
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
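With the change above, graph computation reports an enum ggml_status instead of a bool, so callers compare against GGML_STATUS_SUCCESS and can propagate values such as GGML_STATUS_ALLOC_FAILED. A self-contained sketch of that calling pattern, using a local stand-in enum rather than ggml's real header:

    // Status-code calling pattern; the enum here is a stand-in, not ggml's definition.
    #include <cstdio>

    enum status { STATUS_SUCCESS = 0, STATUS_ALLOC_FAILED = -1 };

    static status compute_graph(bool can_alloc) {
        // placeholder for a backend graph-compute call
        return can_alloc ? STATUS_SUCCESS : STATUS_ALLOC_FAILED;
    }

    int main() {
        const status ec = compute_graph(false);
        if (ec != STATUS_SUCCESS) {
            std::fprintf(stderr, "graph compute failed with status %d\n", (int)ec);
            return 1;
        }
        std::puts("graph compute succeeded");
        return 0;
    }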
@@ -66,12 +66,13 @@ extern "C" {
 
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    GGML_API void             ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool             ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

@@ -157,26 +158,26 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
     GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
     // Get the number of splits of the last graph
     GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
 
     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void                 ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t       ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
     // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 
     // Reset all assignments and allocators - must be called before changing the node backends
     GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
     //
     // Utils
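The graph-compute entry points above now report an `enum ggml_status` instead of a plain `bool`. A minimal caller-side sketch under that assumption (the function and variable names here are illustrative, not part of the API):

    #include <stdio.h>
    #include "ggml-backend.h"

    // hedged sketch: a scheduler and graph are assumed to have been built elsewhere
    static void compute_or_report(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
        enum ggml_status st = ggml_backend_sched_graph_compute(sched, graph);
        if (st != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "graph compute did not succeed (status %d)\n", (int) st);
        }
    }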
231  ggml-cuda.cu

@@ -616,6 +616,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ARANGE_BLOCK_SIZE 256
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256

@@ -990,17 +992,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
             nidx +
             blockIdx.y * ne0 +
             blockIdx.z * ne0 * gridDim.y;
         dst[offset_dst] = x[offset_src];
     } else {
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
             (blockIdx.z - ne02) * ne0 * gridDim.y;
         dst[offset_dst] = y[offset_src];
     }
 }
 
-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
+static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
+    // blockIdx.z: idx of ne02*ne03
+    // blockIdx.y: idx of ne01*scale_factor, aka ne1
+    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
+    // ne00xne01: ne00 * ne01
     int ne0 = ne00 * scale_factor;
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {

@@ -1012,7 +1018,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     int offset_src =
         i00 +
         i01 * ne00 +
-        blockIdx.z * nb02;
+        blockIdx.z * ne00xne01;
     int offset_dst =
         nidx +
         blockIdx.y * ne0 +

@@ -1020,7 +1026,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     dst[offset_dst] = x[offset_src];
 }
 
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;
@@ -1031,19 +1040,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
         int offset_src =
             nidx +
             blockIdx.y * ne00 +
             blockIdx.z * ne00 * ne01;
         dst[offset_dst] = x[offset_src];
     } else {
         dst[offset_dst] = 0.0f;
     }
 }
 
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
+    // blockIDx.y: idx of timesteps->ne[0]
+    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = blockIdx.y;
+    int j = threadIdx.x + blockIdx.x * blockDim.x;
+    float * embed_data = (float *)((char *)dst + i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)expf(-logf(max_period) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = cosf(arg);
+    embed_data[j + half] = sinf(arg);
+}
+
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    // blockIdx.x: num_groups idx
+    // threadIdx.x: block_size idx
     int start = blockIdx.x * group_size;
     int end = start + group_size;
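For reference, the new timestep_embedding_f32 kernel above computes the standard sinusoidal timestep embedding. Written out as a sketch of the math, with half = dim/2 and timestep t_i:

e_{i,j} = \cos\!\left(t_i \, e^{-\ln(\mathrm{max\_period})\, j / \mathrm{half}}\right),\qquad
e_{i,\,j+\mathrm{half}} = \sin\!\left(t_i \, e^{-\ln(\mathrm{max\_period})\, j / \mathrm{half}}\right),\qquad 0 \le j < \mathrm{half}

If dim is odd, the trailing element e_{i,dim} is zero-filled, matching the padding branch in the kernel.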
@@ -6448,7 +6491,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                    const int nb12, const int nb13) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
         return;

@@ -6456,17 +6499,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 
     // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
     // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i03 = i/(ne00 * ne01 * ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-    const int i13 = i/(ne10 * ne11 * ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
 
     cpy_1(cx + x_offset, cdst + dst_offset);
 }

@@ -6956,23 +6999,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
 
 template <typename T>
 static __global__ void im2col_kernel(
-        const float * x, T * dst, int batch_offset,
-        int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= pelements) {
         return;
     }
 
-    const int ksize = OW * (KH > 1 ? KW : 1);
-    const int kx = i / ksize;
-    const int kd = kx * ksize;
-    const int ky = (i - kd) / OW;
-    const int ix = i % OW;
+    const int64_t ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t kx = i / ksize;
+    const int64_t kd = kx * ksize;
+    const int64_t ky = (i - kd) / OW;
+    const int64_t ix = i % OW;
 
-    const int oh = blockIdx.y;
-    const int batch = blockIdx.z / IC;
-    const int ic = blockIdx.z % IC;
+    const int64_t oh = blockIdx.y;
+    const int64_t batch = blockIdx.z / IC;
+    const int64_t ic = blockIdx.z % IC;
 
     const int64_t iiw = ix * s0 + kx * d0 - p0;
     const int64_t iih = oh * s1 + ky * d1 - p1;
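The int to int64_t switch in these kernels is about index range: a flattened index such as blockDim.x*blockIdx.x + threadIdx.x and products such as i03*ne00*ne01*ne02 are evaluated in 32-bit arithmetic when every operand is int, so they overflow once a tensor (or an intermediate product) crosses 2^31. Widening the index variables makes the whole computation 64-bit. A minimal host-side sketch of the failure mode, with hypothetical sizes chosen only to illustrate the arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // hypothetical tensor shape: 8192 x 8192 x 64 = 2^32 elements
        const int64_t ne00 = 8192, ne01 = 8192, ne02 = 64;
        const int64_t n = ne00 * ne01 * ne02;
        // a plain `int` flat index (or the product i03*ne00*ne01*ne02) cannot represent this
        printf("elements = %lld, fits in int32? %s\n", (long long) n, n <= INT32_MAX ? "yes" : "no");
        return 0;
    }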
@@ -7298,19 +7341,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
     concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 }
 
-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
+                             const int scale_factor, cudaStream_t stream) {
     int ne0 = (ne00 * scale_factor);
     int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
     upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
 }
 
 static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02,
-    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
+                                        const int dim, const int max_period, cudaStream_t stream) {
+    int half_ceil = (dim + 1) / 2;
+    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne00, 1);
+    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
@@ -8443,8 +8500,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
 
 template <typename T>
 static void im2col_cuda(const float* x, T* dst,
-    int IW, int IH, int OW, int OH, int KW, int KH, int IC,
-    int batch, int batch_offset, int offset_delta,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;

@@ -9123,7 +9180,7 @@ static void ggml_cuda_op_group_norm(
 
     int num_groups = dst->op_params[0];
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;

@@ -9156,7 +9213,7 @@ static void ggml_cuda_op_upscale(
 
     const int scale_factor = dst->op_params[0];
 
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
 
     (void) src1;
     (void) dst;

@@ -9172,8 +9229,49 @@ static void ggml_cuda_op_pad(
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
     pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_arange(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
+
+    (void) src0;
+    (void) src1;
+    (void) src0_dd;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_timestep_embedding(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int dim        = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
 
     (void) src1;
     (void) dst;
@@ -10458,6 +10556,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }
 
+static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float * dst_ddf  = nullptr;
+
+    cuda_pool_alloc<float> dst_f;
+
+    ggml_cuda_set_device(g_main_device);
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+    }
+
+    // do the computation
+    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
+static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -11358,6 +11495,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_PAD:
             func = ggml_cuda_pad;
             break;
+        case GGML_OP_ARANGE:
+            func = ggml_cuda_arange;
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            func = ggml_cuda_timestep_embedding;
+            break;
         case GGML_OP_LEAKY_RELU:
             func = ggml_cuda_leaky_relu;
             break;

@@ -12098,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);

@@ -12134,7 +12277,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         GGML_ASSERT(ok);
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {

@@ -12253,6 +12396,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
-static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
70  ggml-metal.m

@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
+    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
     GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,

@@ -569,6 +571,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);

@@ -697,6 +701,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             return false;
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
             return true;

@@ -742,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
     }
 }
 
-static bool ggml_metal_graph_compute(
+static enum ggml_status ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {

@@ -1091,7 +1097,8 @@ static bool ggml_metal_graph_compute(
                     {
                         GGML_ASSERT(ggml_is_contiguous(src0));
 
-                        const float scale = *(const float *) dst->op_params;
+                        float scale;
+                        memcpy(&scale, dst->op_params, sizeof(scale));
 
                         int64_t n = ggml_nelements(dst);

@@ -1250,11 +1257,15 @@ static bool ggml_metal_graph_compute(
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                         }
 
-                        const float scale    = ((float *) dst->op_params)[0];
-                        const float max_bias = ((float *) dst->op_params)[1];
+                        float scale;
+                        float max_bias;
+
+                        memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                        memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
 
                         const int64_t nrows_x = ggml_nrows(src0);
                         const int64_t nrows_y = src0->ne[1];
 
                         const uint32_t n_head_kv   = nrows_x/nrows_y;
                         const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));

@@ -2086,6 +2097,7 @@ static bool ggml_metal_graph_compute(
 
                         //const int n_past = ((int32_t *) dst->op_params)[0];
                         const int n_head = ((int32_t *) dst->op_params)[1];
+
                         float max_bias;
                         memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
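A short aside on the memcpy-based parameter reads above: `dst->op_params` is declared as an int32_t array, so pulling a float out of it through a casted float pointer type-puns the storage, which strict-aliasing rules leave undefined. Copying the bytes with memcpy is the well-defined equivalent and compiles to the same load. A standalone sketch of the pattern (not the Metal code itself):

    #include <stdint.h>
    #include <string.h>

    // well-defined way to read a float that was stored into an int32_t-typed parameter buffer
    static float read_f32_param(const int32_t * op_params, int i) {
        float v;
        memcpy(&v, op_params + i, sizeof(v)); // byte copy instead of *(const float *)(op_params + i)
        return v;
    }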
@@ -2300,6 +2312,50 @@ static bool ggml_metal_graph_compute(
 
                         [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
+                case GGML_OP_ARANGE:
+                    {
+                        GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+                        float start;
+                        float step;
+
+                        memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
+                        memcpy(&step,  ((int32_t *) dst->op_params) + 2, sizeof(float));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:0];
+                        [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
+                        [encoder setBytes:&start length:sizeof(start) atIndex:2];
+                        [encoder setBytes:&step  length:sizeof(step)  atIndex:3];
+
+                        const int nth = MIN(1024, ne0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_TIMESTEP_EMBEDDING:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                        const int dim        = dst->op_params[0];
+                        const int max_period = dst->op_params[1];
+
+                        const int half = dim / 2;
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
+                        [encoder setBytes:&dim length:sizeof(dim) atIndex:3];
+                        [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
+
+                        const int nth = MIN(1024, half);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
                 case GGML_OP_ARGSORT:
                     {
                         GGML_ASSERT(src0->type == GGML_TYPE_F32);

@@ -2428,7 +2484,7 @@ static bool ggml_metal_graph_compute(
         MTLCommandBufferStatus status = [command_buffer status];
         if (status != MTLCommandBufferStatusCompleted) {
             GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
+            return GGML_STATUS_FAILED;
         }
     }

@@ -2437,7 +2493,7 @@ static bool ggml_metal_graph_compute(
         }
 
     }
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 ////////////////////////////////////////////////////////////////////////////////

@@ -2739,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
     UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
@@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
     }
 }
 
+kernel void kernel_arange_f32(
+    device        char * dst,
+    constant   int64_t & ne0,
+    constant   float   & start,
+    constant   float   & step,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    device float * dst_ptr = (device float *) dst;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        dst_ptr[i0] = start + step * i0;
+    }
+}
+
+kernel void kernel_timestep_embedding_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant  uint64_t & nb1,
+    constant  int      & dim,
+    constant  int      & max_period,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    int i = tgpig.x;
+    device float * embed_data = (device float *)(dst + i*nb1);
+
+    int half_ = dim / 2;
+    for (int j = tpitg.x; j < half_; j += ntg.x) {
+        float timestep = ((device float *)src0)[i];
+        float freq = (float)exp(-log((float)max_period) * j / half_);
+        float arg = timestep * freq;
+        embed_data[j        ] = cos(arg);
+        embed_data[j + half_] = sin(arg);
+    }
+
+    if (dim % 2 != 0 && tpitg.x == 0) {
+        embed_data[dim] = 0.f;
+    }
+}
+
 // bitonic sort implementation following the CUDA kernels as reference
 typedef void (argsort_t)(
         device const float * x,
@@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * node = graph->nodes[i];
         switch (node->op) {

@@ -2246,7 +2246,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap
         }
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 
     GGML_UNUSED(backend);
 }
@@ -51,6 +51,7 @@
 
 #define UNUSED GGML_UNUSED
 
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

@@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
             const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
             const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
-            const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
+            const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
 
             const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
             const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));

@@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
             const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
             const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
 
-            const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
-            const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
+            const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
+            const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
 
             const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));

@@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
             const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
             const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
-            const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
-            const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
+            const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+            const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
 
             __m256i signs;
             signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);

@@ -10551,10 +10552,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
     const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
     const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
-    const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                           _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-    const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                           _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+    const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                          _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+    const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                          _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
     const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
     const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
     const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);

@@ -10661,10 +10662,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
             const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-            const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
             const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
             const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
             const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
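For readers unfamiliar with the macro: MM256_SET_M128I(a, b) reproduces _mm256_set_m128i(a, b) (high lane a, low lane b) using intrinsics that older compilers such as gcc 7 still provide, which is why the call sites above were switched over. A standalone sketch of the equivalence:

    #include <immintrin.h>

    // builds a 256-bit vector with `lo` in bits 0..127 and `hi` in bits 128..255,
    // the same result as _mm256_set_m128i(hi, lo) on compilers that have it
    static inline __m256i mm256_set_m128i_compat(__m128i hi, __m128i lo) {
        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    }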
1970   ggml-sycl.cpp            (file diff suppressed because it is too large)
86931  ggml-vulkan-shaders.hpp  (file diff suppressed because it is too large)
2132   ggml-vulkan.cpp          (file diff suppressed because it is too large)
@@ -10,6 +10,7 @@ extern "C" {
 #define GGML_VK_NAME "Vulkan"
 #define GGML_VK_MAX_DEVICES 16
 
+GGML_API void ggml_vk_instance_init(void);
 GGML_API void ggml_vk_init_cpu_assist(void);
 
 GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
237  ggml.c

@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+const char * ggml_status_to_string(enum ggml_status status) {
+    switch (status) {
+        case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
+        case GGML_STATUS_FAILED:       return "GGML status: error (operation failed)";
+        case GGML_STATUS_SUCCESS:      return "GGML status: success";
+        case GGML_STATUS_ABORTED:      return "GGML status: warning (operation aborted)";
+    }
+
+    return "GGML status: unknown";
+}
+
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
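A small usage sketch for the helper above, turning a backend status into a log line (the wrapper function is illustrative, not part of the API):

    #include <stdio.h>
    #include "ggml.h"

    static void log_status(enum ggml_status status) {
        // e.g. GGML_STATUS_ABORTED -> "GGML status: warning (operation aborted)"
        printf("%s\n", ggml_status_to_string(status));
    }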
@@ -1822,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D",
     "UPSCALE",
     "PAD",
+    "ARANGE",
+    "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",

@@ -1852,7 +1865,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1910,6 +1923,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d(x)",
     "upscale(x)",
     "pad(x)",
+    "arange(start, stop, step)",
+    "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",

@@ -1940,7 +1955,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -2899,11 +2914,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
     return ((const int32_t *)(tensor->op_params))[i];
 }
 
+static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    return ((const float *)(tensor->op_params))[i];
+}
+
 static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
     assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
     ((int32_t *)(tensor->op_params))[i] = value;
 }
 
+static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    ((float *)(tensor->op_params))[i] = value;
+}
+
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
     memset(tensor->data, 0, ggml_nbytes(tensor));
     return tensor;
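These getters/setters are static to ggml.c, so the sketch below only mirrors the idea in standalone C: a small int32_t-typed parameter buffer addressed as float slots (the buffer size mirrors GGML_MAX_OP_PARAMS, assumed here to be 64 bytes):

    #include <assert.h>
    #include <stdint.h>

    #define MAX_OP_PARAMS 64  // mirrors GGML_MAX_OP_PARAMS (assumption)

    static int32_t op_params[MAX_OP_PARAMS / sizeof(int32_t)];

    static void set_op_param_f32(uint32_t i, float value) {
        assert(i < MAX_OP_PARAMS / sizeof(float));
        ((float *) op_params)[i] = value;      // slot i reused as a float
    }

    static float get_op_param_f32(uint32_t i) {
        assert(i < MAX_OP_PARAMS / sizeof(float));
        return ((const float *) op_params)[i]; // read it back the same way
    }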
@@ -5902,6 +5927,55 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
+struct ggml_tensor * ggml_arange(
+    struct ggml_context * ctx,
+    float start,
+    float stop,
+    float step) {
+
+    GGML_ASSERT(stop > start);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
+
+    result->op = GGML_OP_ARANGE;
+    ggml_set_op_params_f32(result, 0, start);
+    ggml_set_op_params_f32(result, 1, stop);
+    ggml_set_op_params_f32(result, 2, step);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_timestep_embedding(
+    struct ggml_context * ctx,
+    struct ggml_tensor * timesteps,
+    int dim,
+    int max_period) {
+    bool is_node = false;
+
+    if (timesteps->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    int actual_dim = dim;
+    if (dim % 2 != 0) {
+        actual_dim = dim + 1;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+
+    result->op = GGML_OP_TIMESTEP_EMBEDDING;
+    ggml_set_op_params_i32(result, 0, dim);
+    ggml_set_op_params_i32(result, 1, max_period);
+
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = timesteps;
+
+    return result;
+}
+
 // ggml_argsort
 
 struct ggml_tensor * ggml_argsort(
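A hedged usage sketch of the two new public ops. The context size and graph plumbing are illustrative, and the embedding dim of 320 with max_period 10000 are the values typically used for stable diffusion, not requirements:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            .mem_size   = 16*1024*1024,
            .mem_buffer = NULL,
            .no_alloc   = false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // 1D tensor [0, 1, ..., 9]
        struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 10.0f, 1.0f);
        // sinusoidal embedding of each timestep: shape [320, 10]
        struct ggml_tensor * e = ggml_timestep_embedding(ctx, t, 320, 10000);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, e);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

        ggml_free(ctx);
        return 0;
    }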
@@ -10337,7 +10411,7 @@ static void ggml_compute_forward_group_norm_f32(
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
-    for (int i = ith; i < n_groups; i+=nth) {
+    for (int i = ith; i < n_groups; i += nth) {
         int start = i * n_channels_per_group;
         int end = start + n_channels_per_group;
         if (end > n_channels) {

@@ -10351,28 +10425,32 @@ static void ggml_compute_forward_group_norm_f32(
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
+                ggml_float sumr = 0.0;
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00];
+                    sumr += (ggml_float)x[i00];
                 }
+                sum += sumr;
             }
         }
-        float mean = sum / (ne00 * ne01 * step);
-        ggml_float sum2 = 0.0;
+        const float mean = sum / (ne00 * ne01 * step);
 
+        ggml_float sum2 = 0.0;
         for (int64_t i02 = start; i02 < end; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
                 float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
 
+                ggml_float sumr = 0.0;
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
-                    sum2 += (ggml_float)(v * v);
+                    sumr += (ggml_float)(v * v);
                 }
+                sum2 += sumr;
             }
         }
-        float variance = sum2 / (ne00 * ne01 * step);
+        const float variance = sum2 / (ne00 * ne01 * step);
         const float scale = 1.0f / sqrtf(variance + eps);
 
         for (int64_t i02 = start; i02 < end; i02++) {
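The change above accumulates each row into a local sumr and only then folds it into the group-wide sum/sum2. Keeping the inner-loop accumulator local to the row likely lets the compiler hold it in a register and vectorize the loop, and it shortens the dependency chain on the long-lived accumulator without hurting accuracy. A standalone sketch of the pattern:

    // two-level accumulation: a fresh per-row partial sum folded into the running total
    static double sum_2d(const float * x, int n_rows, int n_cols) {
        double total = 0.0;
        for (int row = 0; row < n_rows; ++row) {
            double partial = 0.0;               // short-lived accumulator for this row
            for (int i = 0; i < n_cols; ++i) {
                partial += (double) x[row * n_cols + i];
            }
            total += partial;                   // one update of the long-lived total per row
        }
        return total;
    }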
@ -13653,6 +13731,106 @@ static void ggml_compute_forward_pad(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// ggml_compute_forward_arange
|
||||||
|
|
||||||
|
static void ggml_compute_forward_arange_f32(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
|
||||||
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(dst->nb[0] == sizeof(float));
|
||||||
|
|
||||||
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
const float start = ggml_get_op_params_f32(dst, 0);
|
||||||
|
const float stop = ggml_get_op_params_f32(dst, 1);
|
||||||
|
const float step = ggml_get_op_params_f32(dst, 2);
|
||||||
|
|
||||||
|
const int64_t steps = (int64_t) ceilf((stop - start) / step);
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_nelements(dst) == steps);
|
||||||
|
|
||||||
|
for (int64_t i = ith; i < steps; i+= nth) {
|
||||||
|
float value = start + step * i;
|
||||||
|
((float *)dst->data)[i] = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_compute_forward_arange(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
switch (dst->type) {
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_arange_f32(params, dst);
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_compute_forward_timestep_embedding_f32(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
|
||||||
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||||
|
|
||||||
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
|
||||||
|
const int dim = ggml_get_op_params_i32(dst, 0);
|
||||||
|
const int max_period = ggml_get_op_params_i32(dst, 1);
|
||||||
|
|
||||||
|
int half = dim / 2;
|
||||||
|
|
||||||
|
for (int64_t i = 0; i < ne00; i++) {
|
||||||
|
float * embed_data = (float *)((char *) dst->data + i*nb1);
|
||||||
|
for (int64_t j = ith; j < half; j += nth) {
|
||||||
|
float timestep = ((float *)src0->data)[i];
|
||||||
|
float freq = (float)expf(-logf(max_period) * j / half);
|
||||||
|
float arg = timestep * freq;
|
||||||
|
embed_data[j] = cosf(arg);
|
||||||
|
embed_data[j + half] = sinf(arg);
|
||||||
|
}
|
||||||
|
if (dim % 2 != 0 && ith == 0) {
|
||||||
|
embed_data[dim] = 0.f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_compute_forward_timestep_embedding(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
struct ggml_tensor * dst) {
|
||||||
|
|
||||||
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_timestep_embedding_f32(params, dst);
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_compute_forward_argsort
|
// ggml_compute_forward_argsort
|
||||||
|
|
||||||
static void ggml_compute_forward_argsort_f32(
|
static void ggml_compute_forward_argsort_f32(
|
||||||
@@ -15972,6 +16150,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_pad(params, tensor);
            } break;
+       case GGML_OP_ARANGE:
+           {
+               ggml_compute_forward_arange(params, tensor);
+           } break;
+       case GGML_OP_TIMESTEP_EMBEDDING:
+           {
+               ggml_compute_forward_timestep_embedding(params, tensor);
+           } break;
        case GGML_OP_ARGSORT:
            {
                ggml_compute_forward_argsort(params, tensor);
@@ -16982,6 +17168,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
+       case GGML_OP_ARANGE:
+           {
+               GGML_ASSERT(false); // TODO: not implemented
+           } break;
+       case GGML_OP_TIMESTEP_EMBEDDING:
+           {
+               GGML_ASSERT(false); // TODO: not implemented
+           } break;
        case GGML_OP_ARGSORT:
            {
                GGML_ASSERT(false); // TODO: not implemented
@@ -17587,6 +17781,7 @@ struct ggml_compute_state {
    ggml_thread_t thrd;
    int ith;
    struct ggml_compute_state_shared * shared;
+   enum ggml_status ec;
};

static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17738,6 +17933,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = n_threads;
            } break;
+       case GGML_OP_ARANGE:
+           {
+               n_tasks = n_threads;
+           } break;
+       case GGML_OP_TIMESTEP_EMBEDDING:
+           {
+               n_tasks = n_threads;
+           } break;
        case GGML_OP_ARGSORT:
            {
                n_tasks = n_threads;
@@ -17877,7 +18080,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
            state->shared->node_n += 1;
-           return (thread_ret_t) GGML_EXIT_ABORTED;
+           state->ec = GGML_STATUS_ABORTED;
+           return 0;
        }

        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -17999,7 +18203,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        }
    }

-   return GGML_EXIT_SUCCESS;
+   return 0;
}

struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -18195,7 +18399,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
    return cplan;
}

-int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    {
        GGML_ASSERT(cplan);
        GGML_ASSERT(cplan->n_threads > 0);
@@ -18239,6 +18443,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
                .thrd   = 0,
                .ith    = j,
                .shared = &state_shared,
+               .ec     = GGML_STATUS_SUCCESS,
            };

            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -18249,12 +18454,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {

    workers[0].ith    = 0;
    workers[0].shared = &state_shared;
+   workers[0].ec     = GGML_STATUS_SUCCESS;

    const int64_t perf_start_cycles  = ggml_perf_cycles();
    const int64_t perf_start_time_us = ggml_perf_time_us();

    // this is a work thread too
-   int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
+   ggml_graph_compute_thread(&workers[0]);
+   enum ggml_status compute_status = workers[0].ec;

    // don't leave affinity set on the main thread
    clear_numa_thread_affinity();
@@ -18264,6 +18471,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
        for (int j = 1; j < n_threads; j++) {
            const int rc = ggml_thread_join(workers[j].thrd, NULL);
            GGML_ASSERT(rc == 0);
+           if (workers[j].ec != GGML_STATUS_SUCCESS)
+               compute_status = workers[j].ec;
        }
    }

@@ -18291,14 +18500,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    return compute_status;
}

-void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);

    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);

    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;

-   ggml_graph_compute(cgraph, &cplan);
+   return ggml_graph_compute(cgraph, &cplan);
}

struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
ggml.h (34 changed lines)

@@ -315,6 +315,16 @@
 extern "C" {
#endif

+   enum ggml_status {
+       GGML_STATUS_ALLOC_FAILED = -2,
+       GGML_STATUS_FAILED = -1,
+       GGML_STATUS_SUCCESS = 0,
+       GGML_STATUS_ABORTED = 1,
+   };
+
+   // get ggml_status name string
+   GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+
    typedef uint16_t ggml_fp16_t;

    // convert FP16 <-> FP32
@@ -454,6 +464,8 @@ extern "C" {
        GGML_OP_POOL_2D,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
+       GGML_OP_ARANGE,
+       GGML_OP_TIMESTEP_EMBEDDING,
        GGML_OP_ARGSORT,
        GGML_OP_LEAKY_RELU,

@@ -1663,6 +1675,15 @@ extern "C" {
            int p2,
            int p3);

+   // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+   // timesteps: [N,]
+   // return: [N, dim]
+   GGML_API struct ggml_tensor * ggml_timestep_embedding(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * timesteps,
+           int                   dim,
+           int                   max_period);
+
    // sort rows
    enum ggml_sort_order {
        GGML_SORT_ORDER_ASC,
@@ -1674,6 +1695,12 @@ extern "C" {
            struct ggml_tensor  * a,
            enum ggml_sort_order  order);

+   GGML_API struct ggml_tensor * ggml_arange(
+           struct ggml_context * ctx,
+           float                 start,
+           float                 stop,
+           float                 step);
+
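For reference, a minimal usage sketch of the two new operators declared above. This is not part of the commit; it assumes a `ggml_context * ctx` that has enough memory for the tensors and the graph, and the shapes follow the header comments ([N,] timesteps in, [N, dim] embedding out):

    // 8 timesteps 0..7, embedded into 320-dimensional sinusoidal vectors
    struct ggml_tensor * t   = ggml_arange(ctx, 0.0f, 8.0f, 1.0f);
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, /*dim=*/320, /*max_period=*/10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, emb);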
    // top k elements per row
    GGML_API struct ggml_tensor * ggml_top_k(
            struct ggml_context * ctx,
@@ -1942,12 +1969,11 @@ extern "C" {

    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-   GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+   GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

    // same as ggml_graph_compute() but the work data is allocated as a part of the context
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-   GGML_API void              ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+   GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
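A minimal sketch (not from the commit) of how a caller can react to the new return type; `ctx`, the graph `gf` and the thread count are assumed to exist, and `<cstdio>` is assumed to be included:

    enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);
    if (status != GGML_STATUS_SUCCESS) {
        // ggml_status_to_string() is the new helper declared above
        fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(status));
    }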
(File diff suppressed because it is too large.)
@@ -15,7 +15,7 @@ array ::=

string ::=
  "\"" (
-    [^"\\] |
+    [^"\\\x7F\x00-\x1F] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws

@@ -24,7 +24,7 @@ array ::=

string ::=
  "\"" (
-    [^"\\] |
+    [^"\\\x7F\x00-\x1F] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws
llama.cpp (381 changed lines)

@@ -1726,7 +1726,7 @@ struct llama_hparams {
};

struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;
    uint32_t n_threads;       // number of threads to use for generation
    uint32_t n_threads_batch; // number of threads to use for batch processing
@@ -1743,7 +1743,9 @@ struct llama_cparams {
    float yarn_beta_slow;
    float defrag_thold;

+   bool embeddings;
    bool offload_kqv;

    enum llama_pooling_type pooling_type;

    ggml_backend_sched_eval_callback cb_eval;
@@ -2052,7 +2054,7 @@ struct llama_context {
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    int32_t n_eval   = 0; // number of eval calls

-   // decode output (2-dimensional array: [n_tokens][n_vocab])
+   // logits output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
#ifndef NDEBUG
    // guard against access to unset logits
@@ -2060,8 +2062,13 @@ struct llama_context {
#endif
    bool logits_all = false;

-   // input embedding (1-dimensional array: [n_embd])
-   std::vector<float> embedding;
+   // embeddings output (2-dimensional array: [n_tokens][n_embd])
+   // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+   std::vector<float> embd;
+
+   // sequence embeddings output (map of [n_embd] vectors)
+   // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+   std::map<llama_seq_id, std::vector<float>> embd_seq;

    // memory buffers used to evaluate the model
    std::vector<uint8_t> buf_compute_meta;
@@ -5301,8 +5308,8 @@ static struct ggml_tensor * llm_build_kqv(
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    }

-#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
+#if defined(GGML_USE_KOMPUTE)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
    if (hparams.f_max_alibi_bias > 0.0f) {
@@ -5388,6 +5395,7 @@ static struct ggml_tensor * llm_build_kv(
    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

    struct ggml_tensor * cur;

    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
            q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
    cb(cur, "kqv_out", il);
@@ -6402,6 +6410,7 @@ struct llm_build_context {

        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

        struct ggml_tensor * cur;
@@ -6409,9 +6418,10 @@ struct llm_build_context {

        // get input vectors with right size
        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
-       struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+
+       struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -6429,39 +6439,38 @@ struct llm_build_context {
        cb(inpL, "inp_norm", -1);

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-       struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-       cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+       struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
+       cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]

        // iterate layers
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * cur = inpL;

+           struct ggml_tensor * Qcur;
+           struct ggml_tensor * Kcur;
+           struct ggml_tensor * Vcur;
+
            // self-attention
            if (model.arch == LLM_ARCH_BERT) {
-               struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+               Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                cb(Qcur, "Qcur", il);

-               struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+               Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                cb(Kcur, "Kcur", il);

-               struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+               Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                cb(Vcur, "Vcur", il);

-               // seems like we just need to do this for Q?
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+               Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

-               cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                       model.layers[il].wo, model.layers[il].bo,
-                       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-               cb(cur, "kqv_out", il);
            } else {
                // compute Q and K and RoPE them
                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

-               struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-               struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-               struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+               Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+               Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+               Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
@@ -6480,13 +6489,41 @@ struct llm_build_context {
                ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Kcur, "Kcur", il);

-               cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                       model.layers[il].wo, model.layers[il].bo,
-                       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-               cb(cur, "kqv_out", il);
            }

+           struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+           struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+           struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+           cb(kq, "kq", il);
+
+           kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+           cb(kq, "kq_soft_max_ext", il);
+
+           struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+           cb(v, "v", il);
+
+           struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+           cb(kqv, "kqv", il);
+
+           struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+           cb(kqv_merged, "kqv_merged", il);
+
+           cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+           cb(cur, "kqv_merged_cont", il);
+
+           ggml_build_forward_expand(gf, cur);
+
+           cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+           if (model.layers[il].bo) {
+               cb(cur, "kqv_wo", il);
+           }
+
+           if (model.layers[il].bo) {
+               cur = ggml_add(ctx0, cur, model.layers[il].bo);
+           }
+           cb(cur, "kqv_out", il);

            // re-add the layer input
            cur = ggml_add(ctx0, cur, inpL);

@@ -6526,16 +6563,29 @@ struct llm_build_context {

        // final output
        cur = inpL;
+       cb(cur, "result_embd", -1);

        // pooling layer
-       if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
-           cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-       } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
-           cur = ggml_get_rows(ctx0, cur, inp_cls);
-       } else {
-           GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
+       switch (pooling_type) {
+           case LLAMA_POOLING_TYPE_NONE:
+               {
+                   // nop
+               } break;
+           case LLAMA_POOLING_TYPE_MEAN:
+               {
+                   cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+                   cb(cur, "result_embd_pooled", -1);
+               } break;
+           case LLAMA_POOLING_TYPE_CLS:
+               {
+                   cur = ggml_get_rows(ctx0, cur, inp_cls);
+                   cb(cur, "result_embd_pooled", -1);
+               } break;
+           case LLAMA_POOLING_TYPE_UNSPECIFIED:
+               {
+                   GGML_ASSERT(false && "Invalid pooling type");
+               } break;
        }
-       cb(cur, "result_embd", -1);

        ggml_build_forward_expand(gf, cur);

@@ -8467,7 +8517,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
    }

-   {
+   if (hparams.causal_attn) {
        const int64_t n_kv     = kv_self.n;
        const int64_t n_tokens = batch.n_tokens;

@@ -8475,7 +8525,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

        float * data = (float *) lctx.inp_KQ_mask->data;

-       // For Transformers, use only the previous KV cells (or all, when non-causal)
+       // For causal attention, use only the previous KV cells
        // of the correct sequence for each token of the batch.
        // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
        for (int h = 0; h < 1; ++h) {
@@ -8485,16 +8535,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

                for (int i = 0; i < n_kv; ++i) {
                    float f;
-                   if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
-                       (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
+                   if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                        f = -INFINITY;
                    } else {
-                       f = 0;
+                       f = 0.0f;
                    }
                    data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                }
            }
        }
+   } else {
+       // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
+       const int64_t n_tokens = batch.n_tokens;
+
+       assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+       float * data = (float *) lctx.inp_KQ_mask->data;
+
+       for (int h = 0; h < 1; ++h) {
+           for (int j = 0; j < n_tokens; ++j) {
+               const llama_seq_id seq_id = batch.seq_id[j][0];
+
+               for (int i = 0; i < n_tokens; ++i) {
+                   float f = -INFINITY;
+                   for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                       if (batch.seq_id[i][s] == seq_id) {
+                           f = 0.0f;
+                           break;
+                       }
+                   }
+
+                   data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
+               }
+           }
+       }
    }

    if (hparams.need_kq_pos) {
@@ -8513,13 +8587,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
-       float * data = (float *) lctx.inp_mean->data;

+       float * data = (float *) lctx.inp_mean->data;
        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

        std::vector<uint64_t> sum(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];

+           GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+
            sum[seq_id] += 1;
        }

@@ -8541,11 +8618,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+       memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            const llama_pos pos = batch.pos[i];

+           GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+
            if (pos == 0) {
                data[seq_id] = i;
            }
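To illustrate the non-causal branch above with a standalone toy example (my own, not part of the commit): for a batch whose tokens carry the hypothetical sequence ids {0, 0, 1, 1}, the resulting mask is block-diagonal, i.e. each token may attend only tokens of its own sequence.

    #include <cmath>
    #include <cstdio>

    int main() {
        // toy batch: 4 tokens, one sequence id per token (hypothetical values)
        const int n_tokens = 4;
        const int seq_id[n_tokens] = { 0, 0, 1, 1 };

        // mask[j][i] == 0.0f means token j may attend token i, -INFINITY means it may not
        float mask[n_tokens][n_tokens];
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                mask[j][i] = (seq_id[i] == seq_id[j]) ? 0.0f : -INFINITY;
            }
        }

        // prints a block-diagonal pattern: '.' = attend, 'x' = masked
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                printf("%s ", mask[j][i] == 0.0f ? "." : "x");
            }
            printf("\n");
        }
    }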
@@ -8706,27 +8788,30 @@ static int llama_decode_internal(
        batch.seq_id = seq_id_arr.data();
    }

-   llama_kv_cache_update(&lctx);
+   // non-causal masks do not use the KV cache
+   if (hparams.causal_attn) {
+       llama_kv_cache_update(&lctx);

        // if we have enough unused cells before the current head ->
        //   better to start searching from the beginning of the cache, hoping to fill it
        if (kv_self.head > kv_self.used + 2*n_tokens) {
            kv_self.head = 0;
+       }
+
+       if (!llama_kv_cache_find_slot(kv_self, batch)) {
+           return 1;
+       }
+
+       if (!kv_self.recurrent) {
+           // a heuristic, to avoid attending the full cache if it is not yet utilized
+           // after enough generations, the benefit from this heuristic disappears
+           // if we start defragmenting the cache, the benefit from this will be more important
+           kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+           //kv_self.n = llama_kv_cache_cell_max(kv_self);
+       }
    }

-   if (!llama_kv_cache_find_slot(kv_self, batch)) {
-       return 1;
-   }
-
-   if (!kv_self.recurrent) {
-       // a heuristic, to avoid attending the full cache if it is not yet utilized
-       // after enough generations, the benefit from this heuristic disappears
-       // if we start defragmenting the cache, the benefit from this will be more important
-       kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
-       //kv_self.n = llama_kv_cache_cell_max(kv_self);
-
-       //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
-   }
+   //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

    ggml_backend_sched_reset(lctx.sched);
    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
@@ -8734,20 +8819,26 @@ static int llama_decode_internal(
    ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-   struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+   struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];

-   if (strcmp(res->name, "result_output") == 0) {
-       // the embeddings could be the second to last tensor, or the third to last tensor
-       if (strcmp(embeddings->name, "result_norm") != 0) {
-           embeddings = gf->nodes[gf->n_nodes - 3];
-           GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-       }
-   } else if (strcmp(res->name, "result_embd") == 0) {
-       embeddings = res;
-       res = nullptr;
+   if (!hparams.causal_attn) {
+       res = nullptr; // do not extract logits for embedding models such as BERT
+
+       // token or sequence embeddings
+       embd = gf->nodes[gf->n_nodes - 1];
+
+       GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
    } else {
-       GGML_ASSERT(false);
+       if (strcmp(res->name, "result_output") == 0) {
+           // the token embeddings could be the second to last tensor, or the third to last tensor
+           if (strcmp(embd->name, "result_norm") != 0) {
+               embd = gf->nodes[gf->n_nodes - 3];
+               GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+           }
+       } else {
+           GGML_ASSERT(false && "missing result_output tensor");
+       }
    }

    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -8814,46 +8905,82 @@ static int llama_decode_internal(
        logits_out.clear();
#endif

-       ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
-       GGML_ASSERT(res_backend != nullptr);
+       ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
+       GGML_ASSERT(backend_res != nullptr);

        if (batch.logits) {
            logits_out.resize(n_vocab * n_tokens);
            for (uint32_t i = 0; i < n_tokens; i++) {
                if (batch.logits[i] == 0) {
                    continue;
                }
-               ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
+               ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
                logits_valid[i] = true;
#endif
            }
        } else if (lctx.logits_all) {
            logits_out.resize(n_vocab * n_tokens);
-           ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+           ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
#ifndef NDEBUG
            std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
        } else {
            logits_out.resize(n_vocab);
-           ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
+           ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
            logits_valid[0] = true;
#endif
        }
-       ggml_backend_synchronize(res_backend);
+       ggml_backend_synchronize(backend_res);
    }

    // extract embeddings
-   if (!lctx.embedding.empty()) {
-       auto & embedding_out = lctx.embedding;
-
-       const int64_t embd_pos  = res ? n_embd * (n_tokens-1) : 0;
-       const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
-
-       embedding_out.resize(embd_size);
-       ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
-       ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
-       ggml_backend_synchronize(embeddings_backend);
+   if (cparams.embeddings && embd) {
+       ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
+       GGML_ASSERT(backend_embd != nullptr);
+
+       switch (cparams.pooling_type) {
+           case LLAMA_POOLING_TYPE_NONE:
+               {
+                   // extract token embeddings
+                   auto & embd_out = lctx.embd;
+
+                   if (batch.logits) {
+                       embd_out.resize(n_embd * n_tokens);
+                       for (uint32_t i = 0; i < n_tokens; i++) {
+                           if (batch.logits[i] == 0) {
+                               continue;
+                           }
+
+                           ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
+                       }
+                   }
+               } break;
+           case LLAMA_POOLING_TYPE_CLS:
+           case LLAMA_POOLING_TYPE_MEAN:
+               {
+                   GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
+
+                   // extract sequence embeddings
+                   auto & embd_seq_out = lctx.embd_seq;
+                   embd_seq_out.clear();
+
+                   for (uint32_t i = 0; i < n_tokens; i++) {
+                       const llama_seq_id seq_id = batch.seq_id[i][0];
+                       if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                           continue;
+                       }
+                       embd_seq_out[seq_id].resize(n_embd);
+                       ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                   }
+               } break;
+           case LLAMA_POOLING_TYPE_UNSPECIFIED:
+               {
+                   GGML_ASSERT(false && "unknown pooling type");
+               } break;
+       }
+       ggml_backend_synchronize(backend_embd);
    }

    // measure the performance only for the single-token evals
@@ -9167,19 +9294,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto& token_data = vocab.id_to_token.at(id);
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
            auto buf = token_data.text.substr(3, 2);
            return strtol(buf.c_str(), NULL, 16);
        }
        case LLAMA_VOCAB_TYPE_BPE: {
            GGML_ASSERT(false);
            return unicode_to_bytes_bpe(token_data.text);
        }
        case LLAMA_VOCAB_TYPE_WPM: {
            GGML_ASSERT(false);
        }
        default:
            GGML_ASSERT(false);
    }
}

@@ -12430,7 +12557,7 @@ struct llama_context_params llama_context_default_params() {
        /*.type_k              =*/ GGML_TYPE_F16,
        /*.type_v              =*/ GGML_TYPE_F16,
        /*.logits_all          =*/ false,
-       /*.embedding           =*/ false,
+       /*.embeddings          =*/ false,
        /*.offload_kqv         =*/ true,
        /*.abort_callback      =*/ nullptr,
        /*.abort_callback_data =*/ nullptr,
@@ -12582,6 +12709,7 @@ struct llama_context * llama_new_context_with_model(
    cparams.yarn_beta_fast = params.yarn_beta_fast;
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.defrag_thold   = params.defrag_thold;
+   cparams.embeddings     = params.embeddings;
    cparams.offload_kqv    = params.offload_kqv;
    cparams.pooling_type   = params.pooling_type;

@@ -12769,8 +12897,8 @@ struct llama_context * llama_new_context_with_model(
        // resized during inference, reserve maximum
        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

-       if (params.embedding) {
-           ctx->embedding.resize(hparams.n_embd);
+       if (params.embeddings) {
+           ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
        }

        // graph inputs
@@ -13220,7 +13348,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
    // assume worst case for logits although only currently set ones are serialized
    const size_t s_logits         = ctx->logits.capacity() * sizeof(float);
    const size_t s_embedding_size = sizeof(size_t);
-   const size_t s_embedding      = ctx->embedding.size() * sizeof(float);
+   const size_t s_embedding      = ctx->embd.capacity() * sizeof(float);
    const size_t s_kv_buf_size    = sizeof(size_t);
    const size_t s_kv_head        = sizeof(uint32_t);
    const size_t s_kv_size        = sizeof(uint32_t);
@@ -13329,12 +13457,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat

    // copy embeddings
    {
-       const size_t embedding_size = ctx->embedding.size();
+       const size_t embeddings_size = ctx->embd.size();

-       data_ctx->write(&embedding_size, sizeof(embedding_size));
+       data_ctx->write(&embeddings_size, sizeof(embeddings_size));

-       if (embedding_size) {
-           data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
+       if (embeddings_size) {
+           data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
        }
    }

@@ -13449,15 +13577,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

    // set embeddings
    {
-       size_t embedding_size;
+       size_t embeddings_size;

-       memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
+       memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);

-       GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
+       GGML_ASSERT(ctx->embd.capacity() == embeddings_size);

-       if (embedding_size) {
-           memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
-           inp += embedding_size * sizeof(float);
+       if (embeddings_size) {
+           ctx->embd.resize(embeddings_size);
+
+           memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
+           inp += embeddings_size * sizeof(float);
        }
    }

@@ -13717,11 +13847,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
}

float * llama_get_embeddings(struct llama_context * ctx) {
-   return ctx->embedding.data();
+   return ctx->embd.data();
}

float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
-   return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+   return ctx->embd.data() + i*ctx->model.hparams.n_embd;
+}
+
+float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
+   auto it = ctx->embd_seq.find(seq_id);
+   if (it == ctx->embd_seq.end()) {
+       return nullptr;
+   }
+
+   return it->second.data();
}

const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
@@ -13895,7 +14034,7 @@ static int32_t llama_chat_apply_template_internal(
    std::string & dest, bool add_ass) {
    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
    std::stringstream ss;
-   if (tmpl.find("<|im_start|>") != std::string::npos) {
+   if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
        // chatml template
        for (auto message : chat) {
            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -13903,7 +14042,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
-   } else if (tmpl.find("[INST]") != std::string::npos) {
+   } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
        // llama2 template and its variants
        // [variant] support system message
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -13938,7 +14077,7 @@ static int32_t llama_chat_apply_template_internal(
            }
        }
        // llama2 templates seem to not care about "add_generation_prompt"
-   } else if (tmpl.find("<|user|>") != std::string::npos) {
+   } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
        // zephyr template
        for (auto message : chat) {
            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -13946,7 +14085,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
-   } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+   } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
        for (auto message : chat) {
            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -13955,7 +14094,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<s>assistant\n";
        }
-   } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+   } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
        // google/gemma-7b-it
        std::string system_prompt = "";
        for (auto message : chat) {
@@ -14002,7 +14141,7 @@ LLAMA_API int32_t llama_chat_apply_template(
    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
    if (res < 0) {
        // worst case: there is no information about template, we will use chatml by default
-       curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+       curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
    } else {
        curr_tmpl = std::string(model_template.data(), model_template.size());
    }
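With this change a template can also be selected by a short name ("chatml", "llama2", "zephyr", "monarch", "gemma") instead of a full template string. A minimal sketch (my own, not part of the commit) of calling the public API with the "chatml" shortcut; the buffer size and message contents are arbitrary placeholders:

    #include <string>
    #include <vector>
    #include "llama.h"

    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(4096);
    // pass the shortcut name instead of a full template; the model pointer is unused when tmpl is given
    int32_t n = llama_chat_apply_template(nullptr, "chatml", msgs, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > 0) {
        std::string prompt(buf.data(), n); // formatted chatml prompt
    }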
llama.h (18 changed lines)

@@ -163,7 +163,7 @@ extern "C" {
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
    // - seq_id : the sequence to which the respective token belongs
-   // - logits : if zero, the logits for the respective token will not be output
+   // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@@ -173,7 +173,7 @@ extern "C" {
        llama_pos    *  pos;
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
-       int8_t       *  logits;
+       int8_t       *  logits; // TODO: rename this to "output"

        // NOTE: helpers for smooth API transition - can be deprecated in the future
        // for future-proof code, use the above fields instead and ignore everything below
@@ -261,7 +261,7 @@ extern "C" {

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-       bool embedding;   // embedding mode only
+       bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

        // Abort callback
@@ -657,14 +657,20 @@ extern "C" {
    // llama_get_logits(ctx) + i*n_vocab
    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

-   // Get the embeddings for the input
-   // shape: [n_embd] (1-dimensional)
+   // Get all output token embeddings
+   // shape: [n_tokens*n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-   // Get the embeddings for the ith sequence
+   // Get the embeddings for the ith token
    // llama_get_embeddings(ctx) + i*n_embd
+   // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

+   // Get the embeddings for a sequence id
+   // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+   // shape: [n_embd] (1-dimensional)
+   LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
    //
    // Vocab
    //
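A minimal sketch (not from the commit) of the updated embeddings flow; it assumes a `llama_model * model` and a tokenized `llama_batch batch` (all tokens assigned to sequence 0) already exist:

    #include "llama.h"

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                    // renamed from the old `embedding` flag
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // pooled, per-sequence embeddings

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    llama_decode(ctx, batch);

    // one pooled [n_embd] vector per sequence id; with LLAMA_POOLING_TYPE_NONE
    // use llama_get_embeddings_ith() for per-token embeddings instead
    const float * emb = llama_get_embeddings_seq(ctx, 0);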
@@ -1 +1 @@
-b458250b736a7473f7ff3560d47c93f1644f3290
+8695910a39102609073d0e099aa7c97d6bcb3bf9
@@ -1412,6 +1412,50 @@ struct test_pad : public test_case {
    }
};

+// GGML_OP_ARANGE
+struct test_arange : public test_case {
+    const ggml_type type;
+    const float start;
+    const float stop;
+    const float step;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, start, stop, step);
+    }
+
+    test_arange(ggml_type type = GGML_TYPE_F32,
+            float start = 0.f, float stop = 10.f, float step = 1.f)
+        : type(type), start(start), stop(stop), step(step) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * out = ggml_arange(ctx, start, stop, step);
+        return out;
+    }
+};
+
+// GGML_OP_TIMESTEP_EMBEDDING
+struct test_timestep_embedding : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    const int dim;
+    const int max_period;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne_a, dim, max_period);
+    }
+
+    test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
+            int dim = 320, int max_period=10000)
+        : type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
+        return out;
+    }
+};
+
// GGML_OP_LEAKY_RELU
struct test_leaky_relu : public test_case {
    const ggml_type type;
@@ -2126,6 +2170,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
+   test_cases.emplace_back(new test_arange());
+   test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());

    // these tests are disabled to save execution time, but they can be handy for debugging