Merge branch 'master' into support-mamba-ssm

commit 5544f5211b
Author: Francis Couture-Harpin
Date:   2024-03-05 12:12:01 -05:00
36 changed files with 46612 additions and 47293 deletions

@ -1,5 +1,6 @@
{ {
lib, lib,
glibc,
config, config,
stdenv, stdenv,
mkShell, mkShell,
@ -30,6 +31,11 @@
useRocm ? config.rocmSupport, useRocm ? config.rocmSupport,
useVulkan ? false, useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic
}@inputs: }@inputs:
let let
@ -41,10 +47,7 @@ let
versionOlder versionOlder
; ;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead"; stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices = suffices =
lib.optionals useBlas [ "BLAS" ] lib.optionals useBlas [ "BLAS" ]
@ -167,6 +170,9 @@ effectiveStdenv.mkDerivation (
# TODO: Replace with autoAddDriverRunpath # TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook cudaPackages.autoAddOpenGLRunpathHook
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
]; ];
buildInputs = buildInputs =
@ -181,7 +187,7 @@ effectiveStdenv.mkDerivation (
[ [
(cmakeBool "LLAMA_NATIVE" false) (cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas) (cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL) (cmakeBool "LLAMA_CLBLAST" useOpenCL)
@ -190,6 +196,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi) (cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan) (cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
] ]
++ optionals useCuda [ ++ optionals useCuda [
( (

@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes ### Recent API changes
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
### Hot topics ### Hot topics

@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif() endif()
endif() endif()
set(GIT_INDEX "${GIT_DIR}/index") if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
else() else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "") set(GIT_INDEX "")

@ -513,12 +513,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_sequences = std::stoi(argv[i]); params.n_sequences = std::stoi(argv[i]);
} else if (arg == "--p-accept" || arg == "-pa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.p_accept = std::stof(argv[i]);
} else if (arg == "--p-split" || arg == "-ps") { } else if (arg == "--p-split" || arg == "-ps") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -1044,7 +1038,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
@ -1300,7 +1293,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.seed = params.seed; cparams.seed = params.seed;
cparams.logits_all = params.logits_all; cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding; cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base; cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale; cparams.rope_freq_scale = params.rope_freq_scale;
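The last hunk above follows the embeddings API update from PR #5796: llama_context_params_from_gpt_params now writes cparams.embeddings instead of cparams.embedding. A minimal sketch of enabling embedding output on a context built without gpt_params (assuming a llama_model * loaded elsewhere; llama_context_default_params() and llama_new_context_with_model() are the existing llama.h helpers):

    // sketch: request embedding output through the renamed field
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 512;   // enough context for short embedding prompts
    cparams.embeddings = true;  // field renamed from `embedding` as part of the embeddings API update
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);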

@ -43,7 +43,7 @@ extern char const *LLAMA_BUILD_TARGET;
int32_t get_num_physical_cores(); int32_t get_num_physical_cores();
struct gpt_params { struct gpt_params {
uint32_t seed = -1; // RNG seed uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_num_physical_cores(); int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1; int32_t n_threads_draft = -1;
@ -53,11 +53,10 @@ struct gpt_params {
int32_t n_ctx = 512; // context size int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

@ -295,6 +295,77 @@ static llama_token llama_sampling_sample_impl(
return id; return id;
} }
static llama_token_data_array llama_sample_probability_distribution_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
// apply grammar checks
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
llama_sample_softmax(ctx_main, &cur_p);
return cur_p;
}
llama_token llama_sampling_sample( llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,
@ -304,6 +375,14 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
} }
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
}
void llama_sampling_accept( void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,

@ -131,6 +131,13 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
int idx = 0); int idx = 0);
// returns the probability that token of given id will be sampled
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
void llama_sampling_accept( void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,
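The llama_sampling_probability_distribution() declaration added above returns the softmax-normalized candidate array that the stochastic verification in examples/speculative consumes. A rough usage sketch (ctx_sampling and ctx_main assumed to be initialized as usual; `token` is a hypothetical token id of interest):

    // sketch: probability the target model assigns to a specific token at batch index 0
    llama_token_data_array dist = llama_sampling_probability_distribution(ctx_sampling, ctx_main, /*ctx_cfg=*/NULL, /*idx=*/0);
    float p = 0.0f;
    for (size_t i = 0; i < dist.size; ++i) {
        if (dist.data[i].id == token) {
            p = dist.data[i].p;
            break;
        }
    }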

@ -36,8 +36,10 @@ class SentencePieceTokenTypes(IntEnum):
UNUSED = 5 UNUSED = 5
BYTE = 6 BYTE = 6
AnyModel = TypeVar("AnyModel", bound="type[Model]") AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC): class Model(ABC):
_model_classes: dict[str, type[Model]] = {} _model_classes: dict[str, type[Model]] = {}
@ -187,6 +189,7 @@ class Model(ABC):
@classmethod @classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
assert names assert names
def func(modelcls: type[Model]): def func(modelcls: type[Model]):
for name in names: for name in names:
cls._model_classes[name] = modelcls cls._model_classes[name] = modelcls

@ -19,11 +19,11 @@ static std::vector<std::string> split_lines(const std::string & s) {
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) { static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) { for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, false); llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
} }
} }
static void normalize(float * vec, float * out, int n) { static void normalize(const float * vec, float * out, int n) {
float norm = 0; float norm = 0;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i]; norm += vec[i] * vec[i];
@ -45,10 +45,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
// normalize on copy // normalize on copy
for (int k = 0; k < n_seq; k++) { for (int i = 0; i < batch.n_tokens; i++) {
float * emb = llama_get_embeddings_ith(ctx, k); if (!batch.logits[i]) {
float * out = output + k * n_embd; continue;
normalize(emb, out, n_embd); }
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
float * out = output + batch.seq_id[i][0] * n_embd;
normalize(embd, out, n_embd);
} }
} }
@ -132,7 +145,7 @@ int main(int argc, char ** argv) {
// initialize batch // initialize batch
const int n_prompts = prompts.size(); const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts); struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output // allocate output
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
@ -145,6 +158,7 @@ int main(int argc, char ** argv) {
for (int k = 0; k < n_prompts; k++) { for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens // clamp to n_batch tokens
auto & inp = inputs[k]; auto & inp = inputs[k];
const uint64_t n_toks = inp.size(); const uint64_t n_toks = inp.size();
// encode if at capacity // encode if at capacity
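The reworked batch_decode() above prefers pooled per-sequence embeddings and only falls back to per-token embeddings when sequence pooling is unavailable, then L2-normalizes into the output buffer. The same fallback-and-normalize pattern in isolation (ctx, output and n_embd as set up in the example; `i_last` is a hypothetical batch index of the sequence's final token):

    // sketch: fetch and normalize the embedding for sequence id `s`
    const float * embd = llama_get_embeddings_seq(ctx, s);   // NULL when pooling_type is NONE
    if (embd == NULL) {
        embd = llama_get_embeddings_ith(ctx, i_last);
    }
    if (embd != NULL) {
        normalize(embd, output + s * n_embd, n_embd);        // normalize() as defined above
    }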

@ -511,6 +511,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
// tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids;
antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
}
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
@ -769,6 +777,18 @@ int main(int argc, char ** argv) {
} }
} }
// check for reverse prompt using special tokens
llama_token last_token = llama_sampling_last(ctx_sampling);
for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) {
is_interacting = true;
}
is_antiprompt = true;
break;
}
}
if (is_antiprompt) { if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str()); LOG("found antiprompt: %s\n", last_output.c_str());
} }

examples/server-embd.py (new file, 34 lines)

@ -0,0 +1,34 @@
import asyncio
import requests
import numpy as np
n = 8
result = []
async def requests_post_async(*args, **kwargs):
return await asyncio.to_thread(requests.post, *args, **kwargs)
async def main():
model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding",
json= {"content": str(i)*1024}
) for i in range(n)])
for response in responses:
embedding = response.json()["embedding"]
print(embedding[-8:])
result.append(embedding)
asyncio.run(main())
# compute cosine similarity
for i in range(n-1):
for j in range(i+1, n):
embedding1 = np.array(result[i])
embedding2 = np.array(result[j])
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
print(f"Similarity between {i} and {j}: {similarity:.2f}")

@ -417,7 +417,7 @@ struct llama_server_context
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) { if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template sparams.chat_template = "chatml";
} }
} }
@ -1214,7 +1214,7 @@ struct llama_server_context
queue_results.send(res); queue_results.send(res);
} }
void send_embedding(server_slot &slot) void send_embedding(server_slot & slot, const llama_batch & batch)
{ {
task_result res; task_result res;
res.id = slot.task_id; res.id = slot.task_id;
@ -1223,6 +1223,7 @@ struct llama_server_context
res.stop = true; res.stop = true;
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
if (!params.embedding) if (!params.embedding)
{ {
LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}}); LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
@ -1233,12 +1234,29 @@ struct llama_server_context
} }
else else
{ {
const float *data = llama_get_embeddings(ctx); for (int i = 0; i < batch.n_tokens; ++i) {
std::vector<float> embedding(data, data + n_embd); if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
res.result_json = json continue;
{ }
{"embedding", embedding},
}; const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
res.result_json = json
{
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
continue;
}
}
res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
};
}
} }
queue_results.send(res); queue_results.send(res);
} }
@ -1900,7 +1918,7 @@ struct llama_server_context
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{ {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (auto & slot : slots) for (auto & slot : slots)
{ {
@ -1973,7 +1991,7 @@ struct llama_server_context
// prompt evaluated for embedding // prompt evaluated for embedding
if (slot.embedding) if (slot.embedding)
{ {
send_embedding(slot); send_embedding(slot, batch_view);
slot.release(); slot.release();
slot.i_batch = -1; slot.i_batch = -1;
continue; continue;
@ -2055,6 +2073,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" --pooling {none,mean,cls}\n");
printf(" pooling type for embeddings, use model default if unspecified\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
@ -2295,6 +2315,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.yarn_beta_slow = std::stof(argv[i]); params.yarn_beta_slow = std::stof(argv[i]);
} }
else if (arg == "--pooling")
{
if (++i >= argc) {
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else { invalid_param = true; break; }
}
else if (arg == "--threads" || arg == "-t") else if (arg == "--threads" || arg == "-t")
{ {
if (++i >= argc) if (++i >= argc)
@ -2349,7 +2381,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break; break;
} }
params.n_batch = std::stoi(argv[i]); params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} }
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
{ {

@ -6,3 +6,4 @@ More info:
- https://github.com/ggerganov/llama.cpp/pull/2926 - https://github.com/ggerganov/llama.cpp/pull/2926
- https://github.com/ggerganov/llama.cpp/pull/3624 - https://github.com/ggerganov/llama.cpp/pull/3624
- https://github.com/ggerganov/llama.cpp/pull/5625

@ -5,6 +5,7 @@
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
#include <set>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -18,6 +19,7 @@ struct seq_draft {
std::vector<int> i_batch_tgt; std::vector<int> i_batch_tgt;
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
std::vector<std::vector<llama_token_data>> dists;
struct llama_sampling_context * ctx_sampling; struct llama_sampling_context * ctx_sampling;
}; };
@ -37,12 +39,15 @@ int main(int argc, char ** argv) {
// max number of parallel drafting sequences (i.e. tree branches) // max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel; const int n_seq_dft = params.n_parallel;
// probability threshold for accepting a token from the draft model
const float p_accept = params.p_accept;
// probability threshold for splitting a draft branch (only for n_seq_dft > 1) // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split; const float p_split = params.p_split;
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
std::default_random_engine rng(params.seed);
std::uniform_real_distribution<> u_dist;
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log")); log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n"); LOG_TEE("Log start\n");
@ -166,7 +171,9 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft); std::vector<seq_draft> drafts(n_seq_dft);
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model if (params.sparams.temp == 0) {
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
}
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams); drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@ -182,12 +189,15 @@ int main(int argc, char ** argv) {
drafts[0].i_batch_tgt[0] = 0; drafts[0].i_batch_tgt[0] = 0;
while (true) { while (true) {
std::set<int> active_seqs = {};
// print current draft sequences // print current draft sequences
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) { if (!drafts[s].active) {
continue; continue;
} }
active_seqs.insert(s);
const auto & tokens = drafts[s].tokens; const auto & tokens = drafts[s].tokens;
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
@ -196,48 +206,156 @@ int main(int argc, char ** argv) {
int i_dft = 0; int i_dft = 0;
int s_keep = 0; int s_keep = 0;
llama_token token_id;
std::string token_str;
// loop until we fail to accept a drafted token or we run out of drafted tokens
while (true) { while (true) {
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
// sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
if (!params.use_color) {
printf("%s", token_str.c_str());
}
if (id == llama_token_eos(model_tgt)) {
has_eos = true;
}
++n_predict;
// check if the target token matches any of the drafts // check if the target token matches any of the drafts
// for stochastic sampling, attempt to match the token with the drafted tokens
{ {
bool matches = false; bool accept = false;
if (params.sparams.temp > 0) {
// stochastic verification
for (int s = 0; s < n_seq_dft; ++s) { llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
if (!drafts[s].active) { float p_tgt = 0, p_dft = 0;
continue;
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
while (active_seqs.size() > 0) {
// randomly select a sequence to verify from active sequences
std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
int s = *std::next(active_seqs.begin(), u_int_dist(rng));
if (i_dft >= (int) drafts[s].tokens.size()) {
drafts[s].active = false;
active_seqs.erase(s);
continue;
}
if (accept) {
// if we already accepted a token, we can skip the rest
if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
drafts[s].active = false;
active_seqs.erase(s);
}
continue;
}
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
// acquire the token probabilities assigned by the draft and target models
for (size_t i = 0; i < dist_tgt.size; i++) {
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
p_tgt = dist_tgt.data[i].p;
}
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
p_dft = dist_dft.data[i].p;
}
if (p_tgt && p_dft) {
break;
}
}
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
if (r <= p_tgt / p_dft) {
s_keep = s;
accept = true;
token_id = drafts[s].tokens[i_dft];
token_str = llama_token_to_piece(ctx_tgt, token_id);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;
// calculate residual probability
GGML_ASSERT(dist_tgt.sorted);
GGML_ASSERT(dist_dft.sorted);
float sum_probs = 0.0f;
// sort dist by id
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.id < b.id;
});
std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.id < b.id;
});
for (size_t i = 0; i < dist_tgt.size; i++) {
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
sum_probs += dist_tgt.data[i].p;
}
for (size_t i = 0; i < dist_tgt.size; i++) {
dist_tgt.data[i].p /= sum_probs;
}
// sort dist_tgt by p desc
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.p > b.p;
});
}
active_seqs.erase(s);
for(int i = 0; i < n_seq_dft; i++) {
if (i == s) {
continue;
}
if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
// synchronize active status for sequences with the same drafted token
drafts[i].active = drafts[i].active && accept;
if (!drafts[i].active) {
active_seqs.erase(s);
}
}
}
} }
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { if (!accept) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); // all drafted tokens were rejected
// sample from the target model
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
token_str = llama_token_to_piece(ctx_tgt, token_id);
}
s_keep = s; } else {
matches = true; // greedy verification
} else {
drafts[s].active = false; // sample from the target model
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
token_str = llama_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
s_keep = s;
accept = true;
} else {
drafts[s].active = false;
}
} }
} }
if (matches) { if (token_id == llama_token_eos(model_tgt)) {
has_eos = true;
}
++n_predict;
if (accept) {
++n_accept; ++n_accept;
++n_past_tgt; ++n_past_tgt;
++n_past_dft; ++n_past_dft;
@ -245,17 +363,21 @@ int main(int argc, char ** argv) {
if (params.use_color) { if (params.use_color) {
// Color token according to its origin sequence // Color token according to its origin sequence
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
fflush(stdout); } else {
printf("%s", token_str.c_str());
} }
fflush(stdout);
continue; continue;
} else {
printf("%s", token_str.c_str());
fflush(stdout);
break;
} }
} }
if (params.use_color) { }
printf("%s", token_str.c_str());
}
fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); {
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
// TODO: simplify // TODO: simplify
{ {
@ -275,21 +397,21 @@ int main(int argc, char ** argv) {
drafts[s].active = false; drafts[s].active = false;
drafts[s].tokens.clear(); drafts[s].tokens.clear();
drafts[s].i_batch_tgt.clear(); drafts[s].i_batch_tgt.clear();
drafts[s].dists.clear();
} }
// note: will be erased after the speculation phase // note: will be erased after the speculation phase
drafts[0].tokens.push_back(id); drafts[0].tokens.push_back(token_id);
drafts[0].dists.push_back(std::vector<llama_token_data>());
drafts[0].i_batch_tgt.push_back(0); drafts[0].i_batch_tgt.push_back(0);
llama_batch_clear(batch_dft); llama_batch_clear(batch_dft);
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft); llama_decode(ctx_dft, batch_dft);
++n_past_dft; ++n_past_dft;
break;
} }
if (n_predict > params.n_predict || has_eos) { if (n_predict > params.n_predict || has_eos) {
@ -334,12 +456,6 @@ int main(int argc, char ** argv) {
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
} }
if (cur_p[0].p < p_accept) {
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
drafts[s].drafting = false;
continue;
}
std::vector<int> sa(1, s); std::vector<int> sa(1, s);
// attempt to split the branch if the probability is high enough // attempt to split the branch if the probability is high enough
@ -367,6 +483,7 @@ int main(int argc, char ** argv) {
drafts[n_seq_cur].skip = true; drafts[n_seq_cur].skip = true;
drafts[n_seq_cur].tokens = drafts[s].tokens; drafts[n_seq_cur].tokens = drafts[s].tokens;
drafts[n_seq_cur].dists = drafts[s].dists;
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
@ -389,6 +506,8 @@ int main(int argc, char ** argv) {
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
drafts[s].tokens.push_back(id); drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists
drafts[s].dists.push_back(cur_p);
// add unique drafted tokens to the target batch // add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@ -440,6 +559,7 @@ int main(int argc, char ** argv) {
} }
drafts[s].tokens.erase(drafts[s].tokens.begin()); drafts[s].tokens.erase(drafts[s].tokens.begin());
drafts[s].dists.erase(drafts[s].dists.begin());
} }
} }
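The verification loop added above implements the usual speculative-sampling acceptance rule: a drafted token x is kept with probability min(1, p_tgt(x) / p_dft(x)); when it is rejected, the target distribution is replaced by the normalized residual max(0, p_tgt - p_dft) before a replacement token is sampled. A stripped-down sketch of just that rule, independent of the sequence bookkeeping in the example:

    // sketch: speculative-sampling acceptance test for one drafted token
    // p_tgt, p_dft: probabilities of the drafted token under the target and draft models
    // u:            uniform random number in [0, 1)
    static bool accept_drafted(float p_tgt, float p_dft, float u) {
        return u <= p_tgt / p_dft;   // accepts with probability min(1, p_tgt / p_dft)
    }
    // on rejection, resample from the residual distribution:
    //   p_res(x) = max(0, p_tgt(x) - p_dft(x)) / sum_y max(0, p_tgt(y) - p_dft(y))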

@ -91,13 +91,14 @@ extern "C" {
// (optional) complete all pending operations // (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend); void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan // create a plan for ggml_cgraph and free it
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async) // compute graph without a plan (async)
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation // check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
backend->iface.graph_plan_free(backend, plan); backend->iface.graph_plan_free(backend, plan);
} }
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
backend->iface.graph_plan_compute(backend, plan); return backend->iface.graph_plan_compute(backend, plan);
} }
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph); return backend->iface.graph_compute(backend, cgraph);
} }
@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.abort_callback_data = cpu_ctx->abort_callback_data;
ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
return true;
} }
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
return true; return true;
} }
static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t compute_start_us = ggml_time_us(); uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) { if (!sched->callback_eval) {
if (!ggml_backend_graph_compute(split_backend, &split->graph)) { enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
return false; if (ec != GGML_STATUS_SUCCESS) {
return ec;
} }
//ggml_backend_synchronize(split_backend); // necessary to measure compute time //ggml_backend_synchronize(split_backend); // necessary to measure compute time
} else { } else {
@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
if (!ggml_backend_graph_compute(split_backend, &gv)) { enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
return false; if (ec != GGML_STATUS_SUCCESS) {
return ec;
} }
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
} }
#endif #endif
return true; return GGML_STATUS_SUCCESS;
} }
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return true; return true;
} }
bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
if (!sched->is_reset) { if (!sched->is_reset) {
@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
ggml_backend_sched_split_graph(sched, graph); ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) { if (!ggml_backend_sched_alloc_splits(sched)) {
return false; return GGML_STATUS_ALLOC_FAILED;
} }
if (!ggml_backend_sched_compute_splits(sched)) { return ggml_backend_sched_compute_splits(sched);
return false;
}
return true;
} }
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
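Since graph_compute now reports enum ggml_status instead of bool, callers should compare against GGML_STATUS_SUCCESS rather than testing truthiness. A minimal caller-side sketch (sched and graph assumed to be already constructed):

    // sketch: checking the new status return of the scheduler compute call
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, graph);
    if (status != GGML_STATUS_SUCCESS) {
        // e.g. GGML_STATUS_ALLOC_FAILED when split allocation fails (see the hunk above)
        fprintf(stderr, "graph compute failed with status %d\n", (int) status);
    }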

@ -66,12 +66,13 @@ extern "C" {
GGML_API void ggml_backend_synchronize(ggml_backend_t backend); GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends // tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@ -157,26 +158,26 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler // Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph // Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler // Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Reset all assignments and allocators - must be called before changing the node backends // Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute // Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
// //
// Utils // Utils

@ -616,6 +616,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
#define CUDA_UPSCALE_BLOCK_SIZE 256 #define CUDA_UPSCALE_BLOCK_SIZE 256
#define CUDA_CONCAT_BLOCK_SIZE 256 #define CUDA_CONCAT_BLOCK_SIZE 256
#define CUDA_PAD_BLOCK_SIZE 256 #define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ARANGE_BLOCK_SIZE 256
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256 #define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256 #define CUDA_POOL2D_BLOCK_SIZE 256
@ -990,17 +992,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y; blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} else { } else {
int offset_src = int offset_src =
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
(blockIdx.z - ne02) * ne0 * gridDim.y; (blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src]; dst[offset_dst] = y[offset_src];
} }
} }
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) { static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
// blockIdx.z: idx of ne02*ne03
// blockIdx.y: idx of ne01*scale_factor aka ne1
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
// ne00xne01: ne00 * ne01
int ne0 = ne00 * scale_factor; int ne0 = ne00 * scale_factor;
int nidx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) { if (nidx >= ne0) {
@ -1012,7 +1018,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
int offset_src = int offset_src =
i00 + i00 +
i01 * ne00 + i01 * ne00 +
blockIdx.z * nb02; blockIdx.z * ne00xne01;
int offset_dst = int offset_dst =
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
@ -1020,7 +1026,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} }
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) { static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) { if (nidx >= ne0) {
return; return;
@ -1031,19 +1040,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
nidx + nidx +
blockIdx.y * ne0 + blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y; blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) { if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
int offset_src = int offset_src =
nidx + nidx +
blockIdx.y * ne00 + blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01; blockIdx.z * ne00 * ne01;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
} else { } else {
dst[offset_dst] = 0.0f; dst[offset_dst] = 0.0f;
} }
} }
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
}
static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
// blockIDx.y: idx of timesteps->ne[0]
// blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
int i = blockIdx.y;
int j = threadIdx.x + blockIdx.x * blockDim.x;
float * embed_data = (float *)((char *)dst + i*nb1);
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
}
int half = dim / 2;
if (j >= half) {
return;
}
float timestep = timesteps[i];
float freq = (float)expf(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cosf(arg);
embed_data[j + half] = sinf(arg);
}
template <int block_size> template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
// blockIdx.x: num_groups idx
// threadIdx.x: block_size idx
int start = blockIdx.x * group_size; int start = blockIdx.x * group_size;
int end = start + group_size; int end = start + group_size;
@ -6448,7 +6491,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) { const int nb12, const int nb13) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) { if (i >= ne) {
return; return;
@ -6456,17 +6499,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets // then combine those indices with the corresponding byte offsets to get the total offsets
const int i03 = i/(ne00 * ne01 * ne02); const int64_t i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int i13 = i/(ne10 * ne11 * ne12); const int64_t i13 = i/(ne10 * ne11 * ne12);
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
cpy_1(cx + x_offset, cdst + dst_offset); cpy_1(cx + x_offset, cdst + dst_offset);
} }
@ -6956,23 +6999,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
template <typename T> template <typename T>
static __global__ void im2col_kernel( static __global__ void im2col_kernel(
const float * x, T * dst, int batch_offset, const float * x, T * dst, int64_t batch_offset,
int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW, int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
int s0, int s1, int p0, int p1, int d0, int d1) { int s0, int s1, int p0, int p1, int d0, int d1) {
const int i = threadIdx.x + blockIdx.x * blockDim.x; const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= pelements) { if (i >= pelements) {
return; return;
} }
const int ksize = OW * (KH > 1 ? KW : 1); const int64_t ksize = OW * (KH > 1 ? KW : 1);
const int kx = i / ksize; const int64_t kx = i / ksize;
const int kd = kx * ksize; const int64_t kd = kx * ksize;
const int ky = (i - kd) / OW; const int64_t ky = (i - kd) / OW;
const int ix = i % OW; const int64_t ix = i % OW;
const int oh = blockIdx.y; const int64_t oh = blockIdx.y;
const int batch = blockIdx.z / IC; const int64_t batch = blockIdx.z / IC;
const int ic = blockIdx.z % IC; const int64_t ic = blockIdx.z % IC;
const int64_t iiw = ix * s0 + kx * d0 - p0; const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1; const int64_t iih = oh * s1 + ky * d1 - p1;
@ -7298,19 +7341,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02); concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
} }
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) { static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
const int scale_factor, cudaStream_t stream) {
int ne0 = (ne00 * scale_factor); int ne0 = (ne00 * scale_factor);
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02); dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor); upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
} }
static void pad_f32_cuda(const float * x, float * dst, static void pad_f32_cuda(const float * x, float * dst,
const int ne00, const int ne01, const int ne02, const int ne00, const int ne01, const int ne02, const int ne03,
const int ne0, const int ne1, const int ne2, cudaStream_t stream) { const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2); dim3 gridDim(num_blocks, ne1, ne2*ne3);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02); pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
}
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}
static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
const int dim, const int max_period, cudaStream_t stream) {
int half_ceil = (dim + 1) / 2;
int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne00, 1);
timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
} }
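Both new launchers size a 1D grid with the usual round-up division: `ne0` work items for arange, and `(dim + 1) / 2` columns per timestep row for the embedding. A plain-C sketch of that arithmetic; the block size below is an illustrative stand-in for the real `CUDA_ARANGE_BLOCK_SIZE` / `CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE` constants:

```c
#include <stdio.h>

#define BLOCK_SIZE 256   // illustrative only

// round-up division used to compute the number of blocks
static int num_blocks(int n) {
    return (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
}

int main(void) {
    const int ne0 = 1000;                  // arange length
    const int dim = 321;                   // embedding dimension (odd on purpose)
    const int half_ceil = (dim + 1) / 2;   // rounded-up half, as in the launcher

    printf("arange: %d block(s) of %d threads\n", num_blocks(ne0), BLOCK_SIZE);
    printf("timestep embedding: %d block(s) per row, half_ceil = %d\n",
           num_blocks(half_ceil), half_ceil);
    return 0;
}
```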
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
@ -8443,8 +8500,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
template <typename T> template <typename T>
static void im2col_cuda(const float* x, T* dst, static void im2col_cuda(const float* x, T* dst,
int IW, int IH, int OW, int OH, int KW, int KH, int IC, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int batch, int batch_offset, int offset_delta, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
const int parallel_elements = OW * KW * KH; const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
@ -9123,7 +9180,7 @@ static void ggml_cuda_op_group_norm(
int num_groups = dst->op_params[0]; int num_groups = dst->op_params[0];
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream); group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -9156,7 +9213,7 @@ static void ggml_cuda_op_upscale(
const int scale_factor = dst->op_params[0]; const int scale_factor = dst->op_params[0];
upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream); upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -9172,8 +9229,49 @@ static void ggml_cuda_op_pad(
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
pad_f32_cuda(src0_dd, dst_dd, pad_f32_cuda(src0_dd, dst_dd,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], main_stream); dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
(void) src1;
(void) dst;
(void) src1_dd;
}
static void ggml_cuda_op_arange(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
float start;
float stop;
float step;
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
(void) src0;
(void) src1;
(void) src0_dd;
(void) src1_dd;
}
static void ggml_cuda_op_timestep_embedding(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -10458,6 +10556,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
} }
static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
cuda_pool_alloc<float> dst_f;
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
}
// do the computation
ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host if necessary
if (!dst_on_device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
}
static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
} }
@ -11358,6 +11495,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
case GGML_OP_PAD: case GGML_OP_PAD:
func = ggml_cuda_pad; func = ggml_cuda_pad;
break; break;
case GGML_OP_ARANGE:
func = ggml_cuda_arange;
break;
case GGML_OP_TIMESTEP_EMBEDDING:
func = ggml_cuda_timestep_embedding;
break;
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
func = ggml_cuda_leaky_relu; func = ggml_cuda_leaky_relu;
break; break;
@ -12098,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
UNUSED(backend); UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_main_device(cuda_ctx->device); ggml_cuda_set_main_device(cuda_ctx->device);
@ -12134,7 +12277,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
GGML_ASSERT(ok); GGML_ASSERT(ok);
} }
return true; return GGML_STATUS_SUCCESS;
} }
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
@ -12253,6 +12396,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
return true; return true;
default: default:

View file

@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
return ggml_backend_kompute_buffer_type(ctx->device); return ggml_backend_kompute_buffer_type(ctx->device);
} }
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context); auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph); ggml_vk_graph_compute(ctx, cgraph);
return true; return GGML_STATUS_SUCCESS;
} }
static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

View file

@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_IM2COL_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
@ -569,6 +571,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
@ -697,6 +701,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
return false; return false;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
return true; return true;
@ -742,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
} }
} }
static bool ggml_metal_graph_compute( static enum ggml_status ggml_metal_graph_compute(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf) {
@ -1091,7 +1097,8 @@ static bool ggml_metal_graph_compute(
{ {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
const float scale = *(const float *) dst->op_params; float scale;
memcpy(&scale, dst->op_params, sizeof(scale));
int64_t n = ggml_nelements(dst); int64_t n = ggml_nelements(dst);
@ -1250,11 +1257,15 @@ static bool ggml_metal_graph_compute(
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
} }
const float scale = ((float *) dst->op_params)[0]; float scale;
const float max_bias = ((float *) dst->op_params)[1]; float max_bias;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src0->ne[1]; const int64_t nrows_y = src0->ne[1];
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@ -2086,6 +2097,7 @@ static bool ggml_metal_graph_compute(
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1]; const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias; float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@ -2300,6 +2312,50 @@ static bool ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(dst->type == GGML_TYPE_F32);
float start;
float step;
memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
[encoder setBytes:&start length:sizeof(start) atIndex:2];
[encoder setBytes:&step length:sizeof(step) atIndex:3];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
const int half = dim / 2;
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
[encoder setBytes:&dim length:sizeof(dim) atIndex:3];
[encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
const int nth = MIN(1024, half);
[encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
@ -2428,7 +2484,7 @@ static bool ggml_metal_graph_compute(
MTLCommandBufferStatus status = [command_buffer status]; MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
return false; return GGML_STATUS_FAILED;
} }
} }
@ -2437,7 +2493,7 @@ static bool ggml_metal_graph_compute(
} }
} }
return true; return GGML_STATUS_SUCCESS;
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -2739,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
UNUSED(backend); UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
return ggml_metal_graph_compute(metal_ctx, cgraph); return ggml_metal_graph_compute(metal_ctx, cgraph);

View file

@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
} }
} }
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
constant float & start,
constant float & step,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
device float * dst_ptr = (device float *) dst;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = start + step * i0;
}
}
kernel void kernel_timestep_embedding_f32(
device const char * src0,
device char * dst,
constant uint64_t & nb1,
constant int & dim,
constant int & max_period,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
int i = tgpig.x;
device float * embed_data = (device float *)(dst + i*nb1);
int half_ = dim / 2;
for (int j = tpitg.x; j < half_; j += ntg.x) {
float timestep = ((device float *)src0)[i];
float freq = (float)exp(-log((float)max_period) * j / half_);
float arg = timestep * freq;
embed_data[j ] = cos(arg);
embed_data[j + half_] = sin(arg);
}
if (dim % 2 != 0 && tpitg.x == 0) {
embed_data[dim] = 0.f;
}
}
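A plain-C mirror of the per-row math in the kernel above (not the Metal code itself), handy as a host-side reference; the timestep value and parameters are arbitrary example inputs:

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const float timestep   = 42.0f;
    const int   dim        = 8;        // even, so no zero padding is needed
    const int   max_period = 10000;
    const int   half       = dim / 2;

    float embed[8];
    for (int j = 0; j < half; ++j) {
        const float freq = expf(-logf((float) max_period) * (float) j / (float) half);
        const float arg  = timestep * freq;
        embed[j]        = cosf(arg);   // cosines in the first half
        embed[j + half] = sinf(arg);   // sines in the second half
    }

    for (int j = 0; j < dim; ++j) {
        printf("%8.4f ", embed[j]);
    }
    printf("\n");
    return 0;
}
```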
// bitonic sort implementation following the CUDA kernels as reference // bitonic sort implementation following the CUDA kernels as reference
typedef void (argsort_t)( typedef void (argsort_t)(
device const float * x, device const float * x,

View file

@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
for (int i = 0; i < graph->n_nodes; ++i) { for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i]; ggml_tensor * node = graph->nodes[i];
switch (node->op) { switch (node->op) {
@ -2246,7 +2246,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap
} }
} }
return true; return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }

View file

@ -51,6 +51,7 @@
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
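The quantization code below switches from calling `_mm256_set_m128i` directly to this `MM256_SET_M128I` macro, because the intrinsic is missing on some compilers (e.g. gcc 7). A small standalone check of what the fallback produces; build with `-mavx`, and the input values are arbitrary:

```c
#include <immintrin.h>
#include <stdio.h>

// same fallback as above: low 128 bits come from b, high 128 bits from a
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);
    const __m256i v  = MM256_SET_M128I(hi, lo);

    int out[8];
    _mm256_storeu_si256((__m256i *) out, v);
    for (int i = 0; i < 8; ++i) {
        printf("%d ", out[i]);   // 1 1 1 1 2 2 2 2
    }
    printf("\n");
    return 0;
}
```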
@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits); const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits); const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs); const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32)); const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1)); const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1)); const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2)); const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l); const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h); const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
__m256i signs; __m256i signs;
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@ -10551,10 +10552,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs); const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs); const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
@ -10661,10 +10662,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


View file

@ -10,6 +10,7 @@ extern "C" {
#define GGML_VK_NAME "Vulkan" #define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16 #define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void); GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);

ggml.c (237 changed lines)
View file

@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB) (ggml-impl.h) // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16]; float ggml_table_f32_f16[1 << 16];
const char * ggml_status_to_string(enum ggml_status status) {
switch (status) {
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
case GGML_STATUS_SUCCESS: return "GGML status: success";
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
}
return "GGML status: unknown";
}
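With the compute entry points now returning `enum ggml_status` instead of a plain int/bool, callers can turn a failure into a readable message via `ggml_status_to_string`. A minimal sketch of the new calling pattern, assuming it is built against the ggml sources in this tree:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // any small graph works for the example
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 3.0f);
    struct ggml_tensor * b = ggml_sqr(ctx, a);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);

    const enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 2);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(status));
        ggml_free(ctx);
        return 1;
    }

    printf("b[0] = %g\n", ggml_get_f32_1d(b, 0));   // 9
    ggml_free(ctx);
    return 0;
}
```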
// note: do not use these inside ggml.c // note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API // these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) { float ggml_fp16_to_fp32(ggml_fp16_t x) {
@ -1822,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D", "POOL_2D",
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
"LEAKY_RELU", "LEAKY_RELU",
@ -1852,7 +1865,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK", "CROSS_ENTROPY_LOSS_BACK",
}; };
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -1910,6 +1923,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d(x)", "pool_2d(x)",
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
"leaky_relu(x)", "leaky_relu(x)",
@ -1940,7 +1955,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)", "cross_entropy_loss_back(x,y)",
}; };
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -2899,11 +2914,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
return ((const int32_t *)(tensor->op_params))[i]; return ((const int32_t *)(tensor->op_params))[i];
} }
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
return ((const float *)(tensor->op_params))[i];
}
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
((int32_t *)(tensor->op_params))[i] = value; ((int32_t *)(tensor->op_params))[i] = value;
} }
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
((float *)(tensor->op_params))[i] = value;
}
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
memset(tensor->data, 0, ggml_nbytes(tensor)); memset(tensor->data, 0, ggml_nbytes(tensor));
return tensor; return tensor;
@ -5902,6 +5927,55 @@ struct ggml_tensor * ggml_upscale(
return ggml_upscale_impl(ctx, a, scale_factor); return ggml_upscale_impl(ctx, a, scale_factor);
} }
struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step) {
GGML_ASSERT(stop > start);
const int64_t steps = (int64_t) ceilf((stop - start) / step);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
result->op = GGML_OP_ARANGE;
ggml_set_op_params_f32(result, 0, start);
ggml_set_op_params_f32(result, 1, stop);
ggml_set_op_params_f32(result, 2, step);
return result;
}
struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period) {
bool is_node = false;
if (timesteps->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
int actual_dim = dim;
if (dim % 2 != 0) {
actual_dim = dim + 1;
}
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
result->op = GGML_OP_TIMESTEP_EMBEDDING;
ggml_set_op_params_i32(result, 0, dim);
ggml_set_op_params_i32(result, 1, max_period);
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = timesteps;
return result;
}
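A minimal usage sketch for the new operator, assuming it is built against the ggml sources in this tree; the timesteps here are produced with `ggml_arange`, and in ggml's `ne` ordering the result has `ne[0] = dim` and `ne[1] = number of timesteps` (one extra zero-padded column is allocated when `dim` is odd, as above):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 32*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 8 timesteps: 0, 1, ..., 7
    struct ggml_tensor * ts  = ggml_arange(ctx, 0.0f, 8.0f, 1.0f);
    // sinusoidal embedding with 64 dimensions and the usual max_period of 10000
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, ts, 64, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, emb);

    if (ggml_graph_compute_with_ctx(ctx, gf, 2) == GGML_STATUS_SUCCESS) {
        printf("embedding: ne[0] = %lld (dim), ne[1] = %lld (timesteps)\n",
               (long long) emb->ne[0], (long long) emb->ne[1]);
    }

    ggml_free(ctx);
    return 0;
}
```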
// ggml_argsort // ggml_argsort
struct ggml_tensor * ggml_argsort( struct ggml_tensor * ggml_argsort(
@ -10337,7 +10411,7 @@ static void ggml_compute_forward_group_norm_f32(
int n_channels = src0->ne[2]; int n_channels = src0->ne[2];
int n_groups = dst->op_params[0]; int n_groups = dst->op_params[0];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
for (int i = ith; i < n_groups; i+=nth) { for (int i = ith; i < n_groups; i += nth) {
int start = i * n_channels_per_group; int start = i * n_channels_per_group;
int end = start + n_channels_per_group; int end = start + n_channels_per_group;
if (end > n_channels) { if (end > n_channels) {
@ -10351,28 +10425,32 @@ static void ggml_compute_forward_group_norm_f32(
for (int64_t i01 = 0; i01 < ne01; i01++) { for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) { for (int64_t i00 = 0; i00 < ne00; i00++) {
sum += (ggml_float)x[i00]; sumr += (ggml_float)x[i00];
} }
sum += sumr;
} }
} }
float mean = sum / (ne00 * ne01 * step); const float mean = sum / (ne00 * ne01 * step);
ggml_float sum2 = 0.0;
ggml_float sum2 = 0.0;
for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) { for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) { for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean; float v = x[i00] - mean;
y[i00] = v; y[i00] = v;
sum2 += (ggml_float)(v * v); sumr += (ggml_float)(v * v);
} }
sum2 += sumr;
} }
} }
float variance = sum2 / (ne00 * ne01 * step); const float variance = sum2 / (ne00 * ne01 * step);
const float scale = 1.0f / sqrtf(variance + eps); const float scale = 1.0f / sqrtf(variance + eps);
for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i02 = start; i02 < end; i02++) {
@ -13653,6 +13731,106 @@ static void ggml_compute_forward_pad(
} }
} }
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const float start = ggml_get_op_params_f32(dst, 0);
const float stop = ggml_get_op_params_f32(dst, 1);
const float step = ggml_get_op_params_f32(dst, 2);
const int64_t steps = (int64_t) ceilf((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
for (int64_t i = ith; i < steps; i+= nth) {
float value = start + step * i;
((float *)dst->data)[i] = value;
}
}
static void ggml_compute_forward_arange(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_arange_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
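The CPU forward above splits the `steps` elements across threads with the usual interleaved stride (worker `ith` fills `ith`, `ith + nth`, `ith + 2*nth`, ...), which is why `ggml_get_n_tasks` further down reports `n_tasks = n_threads` for this op. A standalone sketch of which indices each worker touches:

```c
#include <stdio.h>

int main(void) {
    const int nth   = 4;    // number of workers
    const int steps = 10;   // elements to fill

    for (int ith = 0; ith < nth; ++ith) {
        printf("worker %d:", ith);
        for (int i = ith; i < steps; i += nth) {
            printf(" %d", i);
        }
        printf("\n");
    }
    return 0;
}
```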
static void ggml_compute_forward_timestep_embedding_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int dim = ggml_get_op_params_i32(dst, 0);
const int max_period = ggml_get_op_params_i32(dst, 1);
int half = dim / 2;
for (int64_t i = 0; i < ne00; i++) {
float * embed_data = (float *)((char *) dst->data + i*nb1);
for (int64_t j = ith; j < half; j += nth) {
float timestep = ((float *)src0->data)[i];
float freq = (float)expf(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cosf(arg);
embed_data[j + half] = sinf(arg);
}
if (dim % 2 != 0 && ith == 0) {
embed_data[dim] = 0.f;
}
}
}
static void ggml_compute_forward_timestep_embedding(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_timestep_embedding_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_argsort // ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32( static void ggml_compute_forward_argsort_f32(
@ -15972,6 +16150,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad(params, tensor); ggml_compute_forward_pad(params, tensor);
} break; } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
ggml_compute_forward_timestep_embedding(params, tensor);
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
ggml_compute_forward_argsort(params, tensor); ggml_compute_forward_argsort(params, tensor);
@ -16982,6 +17168,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
} break; } break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
@ -17587,6 +17781,7 @@ struct ggml_compute_state {
ggml_thread_t thrd; ggml_thread_t thrd;
int ith; int ith;
struct ggml_compute_state_shared * shared; struct ggml_compute_state_shared * shared;
enum ggml_status ec;
}; };
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@ -17738,6 +17933,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{ {
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
case GGML_OP_ARANGE:
{
n_tasks = n_threads;
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
{ {
n_tasks = n_threads; n_tasks = n_threads;
@ -17877,7 +18080,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
while (true) { while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1; state->shared->node_n += 1;
return (thread_ret_t) GGML_EXIT_ABORTED; state->ec = GGML_STATUS_ABORTED;
return 0;
} }
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@ -17999,7 +18203,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} }
} }
return GGML_EXIT_SUCCESS; return 0;
} }
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@ -18195,7 +18399,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
return cplan; return cplan;
} }
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
{ {
GGML_ASSERT(cplan); GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->n_threads > 0);
@ -18239,6 +18443,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
.thrd = 0, .thrd = 0,
.ith = j, .ith = j,
.shared = &state_shared, .shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
}; };
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@ -18249,12 +18454,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
workers[0].ith = 0; workers[0].ith = 0;
workers[0].shared = &state_shared; workers[0].shared = &state_shared;
workers[0].ec = GGML_STATUS_SUCCESS;
const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us(); const int64_t perf_start_time_us = ggml_perf_time_us();
// this is a work thread too // this is a work thread too
int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); ggml_graph_compute_thread(&workers[0]);
enum ggml_status compute_status = workers[0].ec;
// don't leave affinity set on the main thread // don't leave affinity set on the main thread
clear_numa_thread_affinity(); clear_numa_thread_affinity();
@ -18264,6 +18471,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
for (int j = 1; j < n_threads; j++) { for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL); const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0); GGML_ASSERT(rc == 0);
if (workers[j].ec != GGML_STATUS_SUCCESS)
compute_status = workers[j].ec;
} }
} }
@ -18291,14 +18500,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
return compute_status; return compute_status;
} }
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {

ggml.h (34 changed lines)
View file

@ -315,6 +315,16 @@
extern "C" { extern "C" {
#endif #endif
enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
GGML_STATUS_SUCCESS = 0,
GGML_STATUS_ABORTED = 1,
};
// get ggml_status name string
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
typedef uint16_t ggml_fp16_t; typedef uint16_t ggml_fp16_t;
// convert FP16 <-> FP32 // convert FP16 <-> FP32
@ -454,6 +464,8 @@ extern "C" {
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU, GGML_OP_LEAKY_RELU,
@ -1663,6 +1675,15 @@ extern "C" {
int p2, int p2,
int p3); int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
GGML_API struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period);
// sort rows // sort rows
enum ggml_sort_order { enum ggml_sort_order {
GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_ASC,
@ -1674,6 +1695,12 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
enum ggml_sort_order order); enum ggml_sort_order order);
GGML_API struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step);
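A short usage sketch for `ggml_arange`, assuming it is built against the ggml sources in this tree; the element count is `ceil((stop - start) / step)`, so a non-integral step still yields a well-defined length:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ceil((2.0 - 0.0) / 0.3) = 7 elements: 0.0, 0.3, ..., 1.8
    struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 2.0f, 0.3f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, t);

    if (ggml_graph_compute_with_ctx(ctx, gf, 1) == GGML_STATUS_SUCCESS) {
        for (int i = 0; i < (int) ggml_nelements(t); ++i) {
            printf("%.2f ", ggml_get_f32_1d(t, i));
        }
        printf("\n");
    }

    ggml_free(ctx);
    return 0;
}
```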
// top k elements per row // top k elements per row
GGML_API struct ggml_tensor * ggml_top_k( GGML_API struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1942,12 +1969,11 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute() // ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data // when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

File diff suppressed because it is too large

View file

@ -15,7 +15,7 @@ array ::=
string ::= string ::=
"\"" ( "\"" (
[^"\\] | [^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws )* "\"" ws

View file

@ -24,7 +24,7 @@ array ::=
string ::= string ::=
"\"" ( "\"" (
[^"\\] | [^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws )* "\"" ws

llama.cpp (381 changed lines)
View file

@ -1726,7 +1726,7 @@ struct llama_hparams {
}; };
struct llama_cparams { struct llama_cparams {
uint32_t n_ctx; // context size used during inference uint32_t n_ctx; // context size used during inference
uint32_t n_batch; uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
@ -1743,7 +1743,9 @@ struct llama_cparams {
float yarn_beta_slow; float yarn_beta_slow;
float defrag_thold; float defrag_thold;
bool embeddings;
bool offload_kqv; bool offload_kqv;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
@ -2052,7 +2054,7 @@ struct llama_context {
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab]) // logits output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits; std::vector<float> logits;
#ifndef NDEBUG #ifndef NDEBUG
// guard against access to unset logits // guard against access to unset logits
@ -2060,8 +2062,13 @@ struct llama_context {
#endif #endif
bool logits_all = false; bool logits_all = false;
// input embedding (1-dimensional array: [n_embd]) // embeddings output (2-dimensional array: [n_tokens][n_embd])
std::vector<float> embedding; // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
std::vector<float> embd;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
// memory buffers used to evaluate the model // memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta; std::vector<uint8_t> buf_compute_meta;
@ -5301,8 +5308,8 @@ static struct ggml_tensor * llm_build_kqv(
ggml_mul_mat_set_prec(kq, GGML_PREC_F32); ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
} }
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) #if defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute") #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024") #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
if (hparams.f_max_alibi_bias > 0.0f) { if (hparams.f_max_alibi_bias > 0.0f) {
@ -5388,6 +5395,7 @@ static struct ggml_tensor * llm_build_kv(
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur; struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
@ -6402,6 +6410,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur; struct ggml_tensor * cur;
@ -6409,9 +6418,10 @@ struct llm_build_context {
// get input vectors with right size // get input vectors with right size
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
// construct input embeddings (token, type, position) // construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@ -6429,39 +6439,38 @@ struct llm_build_context {
cb(inpL, "inp_norm", -1); cb(inpL, "inp_norm", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens] cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
// iterate layers // iterate layers
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur = inpL; struct ggml_tensor * cur = inpL;
struct ggml_tensor * Qcur;
struct ggml_tensor * Kcur;
struct ggml_tensor * Vcur;
// self-attention // self-attention
if (model.arch == LLM_ARCH_BERT) { if (model.arch == LLM_ARCH_BERT) {
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
// seems like we just need to do this for Q? Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} else { } else {
// compute Q and K and RoPE them // compute Q and K and RoPE them
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il); cb(cur, "wqkv", il);
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -6480,13 +6489,41 @@ struct llm_build_context {
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} }
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
cb(v, "v", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
cb(cur, "kqv_merged_cont", il);
ggml_build_forward_expand(gf, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
if (model.layers[il].bo) {
cb(cur, "kqv_wo", il);
}
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "kqv_out", il);
// re-add the layer input // re-add the layer input
cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, inpL);
@ -6526,16 +6563,29 @@ struct llm_build_context {
// final output // final output
cur = inpL; cur = inpL;
cb(cur, "result_embd", -1);
// pooling layer // pooling layer
if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { switch (pooling_type) {
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); case LLAMA_POOLING_TYPE_NONE:
} else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { {
cur = ggml_get_rows(ctx0, cur, inp_cls); // nop
} else { } break;
GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); case LLAMA_POOLING_TYPE_MEAN:
{
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_CLS:
{
cur = ggml_get_rows(ctx0, cur, inp_cls);
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "Invalid pooling type");
} break;
} }
cb(cur, "result_embd", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
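The pooling switch above only wires in tensors prepared elsewhere: inp_mean encodes per-sequence averaging and inp_cls holds the row index of each sequence's first token. A reference-only sketch of the intended result on plain arrays (the names, shapes and the seq_of/n_seq inputs here are illustrative, not the actual ggml buffers):

#include <cstddef>
#include <vector>

// embd:    n_tokens rows of n_embd floats (token embeddings)
// seq_of:  sequence id of each token in the batch
// returns: one mean-pooled embedding per sequence, indexed by sequence id
static std::vector<std::vector<float>> mean_pool_ref(
        const std::vector<std::vector<float>> & embd,
        const std::vector<int> & seq_of, int n_seq) {
    const size_t n_embd = embd.empty() ? 0 : embd[0].size();
    std::vector<std::vector<float>> out(n_seq, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seq, 0);
    for (size_t t = 0; t < embd.size(); ++t) {
        count[seq_of[t]] += 1;
        for (size_t e = 0; e < n_embd; ++e) {
            out[seq_of[t]][e] += embd[t][e];
        }
    }
    for (int s = 0; s < n_seq; ++s) {
        for (size_t e = 0; e < n_embd && count[s] > 0; ++e) {
            out[s][e] /= (float) count[s];
        }
    }
    return out;
}
// CLS pooling is simpler still: out[s] is just the embedding row of the token
// with pos == 0 in sequence s (the index stored in inp_cls).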
@ -8467,7 +8517,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
} }
{ if (hparams.causal_attn) {
const int64_t n_kv = kv_self.n; const int64_t n_kv = kv_self.n;
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
@ -8475,7 +8525,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
float * data = (float *) lctx.inp_KQ_mask->data; float * data = (float *) lctx.inp_KQ_mask->data;
// For Transformers, use only the previous KV cells (or all, when non-causal) // For causal attention, use only the previous KV cells
// of the correct sequence for each token of the batch. // of the correct sequence for each token of the batch.
// It's assumed that if a token in the batch has multiple sequences, they are equivalent. // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
for (int h = 0; h < 1; ++h) { for (int h = 0; h < 1; ++h) {
@ -8485,16 +8535,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
float f; float f;
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
f = -INFINITY; f = -INFINITY;
} else { } else {
f = 0; f = 0.0f;
} }
data[h*(n_kv*n_tokens) + j*n_kv + i] = f; data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
} }
} }
} }
} else {
// non-causal attention attends only to the tokens within the batch (i.e. the KV cache is not used)
const int64_t n_tokens = batch.n_tokens;
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
float * data = (float *) lctx.inp_KQ_mask->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
const llama_seq_id seq_id = batch.seq_id[j][0];
for (int i = 0; i < n_tokens; ++i) {
float f = -INFINITY;
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) {
f = 0.0f;
break;
}
}
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
}
}
}
} }
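For intuition, suppose a hypothetical 4-token batch where tokens 0 and 1 belong to sequence 0 and tokens 2 and 3 to sequence 1. The non-causal branch above then builds an n_tokens x n_tokens mask in which every token attends to all tokens of its own sequence and to nothing else:

// Illustrative KQ_mask contents for that batch (0 = attend, -INF = masked):
//
//           i=0    i=1    i=2    i=3
//   j=0      0      0    -INF   -INF
//   j=1      0      0    -INF   -INF
//   j=2    -INF   -INF     0      0
//   j=3    -INF   -INF     0      0
//
// Since nothing outside the batch is attended, the KV cache is bypassed entirely.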
if (hparams.need_kq_pos) { if (hparams.need_kq_pos) {
@ -8513,13 +8587,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
float * data = (float *) lctx.inp_mean->data;
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0); std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) { for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0]; const llama_seq_id seq_id = batch.seq_id[i][0];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
sum[seq_id] += 1; sum[seq_id] += 1;
} }
@ -8541,11 +8618,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data; uint32_t * data = (uint32_t *) lctx.inp_cls->data;
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) { for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0]; const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i]; const llama_pos pos = batch.pos[i];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
if (pos == 0) { if (pos == 0) {
data[seq_id] = i; data[seq_id] = i;
} }
@ -8706,27 +8788,30 @@ static int llama_decode_internal(
batch.seq_id = seq_id_arr.data(); batch.seq_id = seq_id_arr.data();
} }
llama_kv_cache_update(&lctx); // non-causal masks do not use the KV cache
if (hparams.causal_attn) {
llama_kv_cache_update(&lctx);
// if we have enough unused cells before the current head -> // if we have enough unused cells before the current head ->
// better to start searching from the beginning of the cache, hoping to fill it // better to start searching from the beginning of the cache, hoping to fill it
if (kv_self.head > kv_self.used + 2*n_tokens) { if (kv_self.head > kv_self.used + 2*n_tokens) {
kv_self.head = 0; kv_self.head = 0;
}
if (!llama_kv_cache_find_slot(kv_self, batch)) {
return 1;
}
if (!kv_self.recurrent) {
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
}
} }
if (!llama_kv_cache_find_slot(kv_self, batch)) { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
return 1;
}
if (!kv_self.recurrent) {
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
}
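As a worked example of the kv_self.n heuristic (assuming GGML_PAD rounds its argument up to the given multiple; the numbers are illustrative):

// Suppose llama_kv_cache_cell_max(kv_self) == 45 and kv_self.size == 512:
//   GGML_PAD(45, 32)   -> 64   round the highest used cell up to a multiple of 32
//   std::max(32u, 64)  -> 64   but never attend fewer than 32 cells
//   std::min(512u, 64) -> 64   and never more than the allocated cache
// The graph built for this batch therefore attends 64 KV cells instead of all 512.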
ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_reset(lctx.sched);
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
@ -8734,20 +8819,26 @@ static int llama_decode_internal(
ggml_cgraph * gf = llama_build_graph(lctx, batch, false); ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
// the output is always the last tensor in the graph // the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
if (strcmp(res->name, "result_output") == 0) { if (!hparams.causal_attn) {
// the embeddings could be the second to last tensor, or the third to last tensor res = nullptr; // do not extract logits for embedding models such as BERT
if (strcmp(embeddings->name, "result_norm") != 0) {
embeddings = gf->nodes[gf->n_nodes - 3]; // token or sequence embeddings
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); embd = gf->nodes[gf->n_nodes - 1];
}
} else if (strcmp(res->name, "result_embd") == 0) { GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
embeddings = res;
res = nullptr;
} else { } else {
GGML_ASSERT(false); if (strcmp(res->name, "result_output") == 0) {
// the token embeddings could be the second to last tensor, or the third to last tensor
if (strcmp(embd->name, "result_norm") != 0) {
embd = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
}
} else {
GGML_ASSERT(false && "missing result_output tensor");
}
} }
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@ -8814,46 +8905,82 @@ static int llama_decode_internal(
logits_out.clear(); logits_out.clear();
#endif #endif
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res); ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
GGML_ASSERT(res_backend != nullptr); GGML_ASSERT(backend_res != nullptr);
if (batch.logits) { if (batch.logits) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) { for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) { if (batch.logits[i] == 0) {
continue; continue;
} }
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG #ifndef NDEBUG
logits_valid[i] = true; logits_valid[i] = true;
#endif #endif
} }
} else if (lctx.logits_all) { } else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens); logits_out.resize(n_vocab * n_tokens);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
#ifndef NDEBUG #ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true); std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif #endif
} else { } else {
logits_out.resize(n_vocab); logits_out.resize(n_vocab);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG #ifndef NDEBUG
logits_valid[0] = true; logits_valid[0] = true;
#endif #endif
} }
ggml_backend_synchronize(res_backend); ggml_backend_synchronize(backend_res);
} }
// extract embeddings // extract embeddings
if (!lctx.embedding.empty()) { if (cparams.embeddings && embd) {
auto & embedding_out = lctx.embedding; ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0; switch (cparams.pooling_type) {
const int64_t embd_size = res ? n_embd : n_embd * n_tokens; case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
auto & embd_out = lctx.embd;
embedding_out.resize(embd_size); if (batch.logits) {
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); embd_out.resize(n_embd * n_tokens);
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float)); for (uint32_t i = 0; i < n_tokens; i++) {
ggml_backend_synchronize(embeddings_backend); if (batch.logits[i] == 0) {
continue;
}
ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
}
}
} break;
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_MEAN:
{
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
// extract sequence embeddings
auto & embd_seq_out = lctx.embd_seq;
embd_seq_out.clear();
for (uint32_t i = 0; i < n_tokens; i++) {
const llama_seq_id seq_id = batch.seq_id[i][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
continue;
}
embd_seq_out[seq_id].resize(n_embd);
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
}
ggml_backend_synchronize(backend_embd);
} }
// measure the performance only for the single-token evals // measure the performance only for the single-token evals
@ -9167,19 +9294,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id)); GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id); const auto& token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) { switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: { case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2); auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16); return strtol(buf.c_str(), NULL, 16);
} }
case LLAMA_VOCAB_TYPE_BPE: { case LLAMA_VOCAB_TYPE_BPE: {
GGML_ASSERT(false); GGML_ASSERT(false);
return unicode_to_bytes_bpe(token_data.text); return unicode_to_bytes_bpe(token_data.text);
} }
case LLAMA_VOCAB_TYPE_WPM: { case LLAMA_VOCAB_TYPE_WPM: {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }
@ -12430,7 +12557,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_k =*/ GGML_TYPE_F16, /*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false, /*.logits_all =*/ false,
/*.embedding =*/ false, /*.embeddings =*/ false,
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.abort_callback =*/ nullptr, /*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr, /*.abort_callback_data =*/ nullptr,
@ -12582,6 +12709,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold; cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv; cparams.offload_kqv = params.offload_kqv;
cparams.pooling_type = params.pooling_type; cparams.pooling_type = params.pooling_type;
@ -12769,8 +12897,8 @@ struct llama_context * llama_new_context_with_model(
// resized during inference, reserve maximum // resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
if (params.embedding) { if (params.embeddings) {
ctx->embedding.resize(hparams.n_embd); ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
} }
// graph inputs // graph inputs
@ -13220,7 +13348,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
// assume worst case for logits although only currently set ones are serialized // assume worst case for logits although only currently set ones are serialized
const size_t s_logits = ctx->logits.capacity() * sizeof(float); const size_t s_logits = ctx->logits.capacity() * sizeof(float);
const size_t s_embedding_size = sizeof(size_t); const size_t s_embedding_size = sizeof(size_t);
const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
const size_t s_kv_buf_size = sizeof(size_t); const size_t s_kv_buf_size = sizeof(size_t);
const size_t s_kv_head = sizeof(uint32_t); const size_t s_kv_head = sizeof(uint32_t);
const size_t s_kv_size = sizeof(uint32_t); const size_t s_kv_size = sizeof(uint32_t);
@ -13329,12 +13457,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// copy embeddings // copy embeddings
{ {
const size_t embedding_size = ctx->embedding.size(); const size_t embeddings_size = ctx->embd.size();
data_ctx->write(&embedding_size, sizeof(embedding_size)); data_ctx->write(&embeddings_size, sizeof(embeddings_size));
if (embedding_size) { if (embeddings_size) {
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float)); data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
} }
} }
@ -13449,15 +13577,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
// set embeddings // set embeddings
{ {
size_t embedding_size; size_t embeddings_size;
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
GGML_ASSERT(ctx->embedding.capacity() == embedding_size); GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
if (embedding_size) { if (embeddings_size) {
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); ctx->embd.resize(embeddings_size);
inp += embedding_size * sizeof(float);
memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
inp += embeddings_size * sizeof(float);
} }
} }
@ -13717,11 +13847,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
} }
float * llama_get_embeddings(struct llama_context * ctx) { float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data(); return ctx->embd.data();
} }
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
return ctx->embedding.data() + i*ctx->model.hparams.n_embd; return ctx->embd.data() + i*ctx->model.hparams.n_embd;
}
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
auto it = ctx->embd_seq.find(seq_id);
if (it == ctx->embd_seq.end()) {
return nullptr;
}
return it->second.data();
} }
const char * llama_token_get_text(const struct llama_model * model, llama_token token) { const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
@ -13895,7 +14034,7 @@ static int32_t llama_chat_apply_template_internal(
std::string & dest, bool add_ass) { std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss; std::stringstream ss;
if (tmpl.find("<|im_start|>") != std::string::npos) { if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
// chatml template // chatml template
for (auto message : chat) { for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@ -13903,7 +14042,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|im_start|>assistant\n"; ss << "<|im_start|>assistant\n";
} }
} else if (tmpl.find("[INST]") != std::string::npos) { } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
// llama2 template and its variants // llama2 template and its variants
// [variant] support system message // [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos; bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@ -13938,7 +14077,7 @@ static int32_t llama_chat_apply_template_internal(
} }
} }
// llama2 templates seem to not care about "add_generation_prompt" // llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl.find("<|user|>") != std::string::npos) { } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
// zephyr template // zephyr template
for (auto message : chat) { for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@ -13946,7 +14085,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>\n"; ss << "<|assistant|>\n";
} }
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) { } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history) // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) { for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@ -13955,7 +14094,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<s>assistant\n"; ss << "<s>assistant\n";
} }
} else if (tmpl.find("<start_of_turn>") != std::string::npos) { } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it // google/gemma-7b-it
std::string system_prompt = ""; std::string system_prompt = "";
for (auto message : chat) { for (auto message : chat) {
@ -14002,7 +14141,7 @@ LLAMA_API int32_t llama_chat_apply_template(
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
if (res < 0) { if (res < 0) {
// worst case: there is no information about template, we will use chatml by default // worst case: there is no information about template, we will use chatml by default
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
} else { } else {
curr_tmpl = std::string(model_template.data(), model_template.size()); curr_tmpl = std::string(model_template.data(), model_template.size());
} }
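With this change a caller can also pass a short template name directly ("chatml", "llama2", "zephyr", "monarch", "gemma") instead of a Jinja-style template string. A hedged usage sketch (signature as declared in llama.h at the time of this change; the buffer size and messages are illustrative):

#include <algorithm>
#include <string>
#include <vector>
#include "llama.h"

static std::string chatml_prompt() {
    const llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(4096);
    // model is left as nullptr in this sketch since the template is given explicitly
    const int32_t n = llama_chat_apply_template(
        /*model  =*/ nullptr,
        /*tmpl   =*/ "chatml",          // resolved by the tmpl == "chatml" branch above
        msgs, 2,
        /*add_ass=*/ true,
        buf.data(), (int32_t) buf.size());
    if (n <= 0) {
        return std::string();
    }
    return std::string(buf.data(), std::min<size_t>((size_t) n, buf.size()));
}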

llama.h (18 changed lines)
View file
@ -163,7 +163,7 @@ extern "C" {
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence // - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs // - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits for the respective token will not be output // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
// //
typedef struct llama_batch { typedef struct llama_batch {
int32_t n_tokens; int32_t n_tokens;
@ -173,7 +173,7 @@ extern "C" {
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
int8_t * logits; int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future // NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below // for future-proof code, use the above fields instead and ignore everything below
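The renamed meaning matters for embeddings too: a token whose logits flag is 0 gets neither logits nor, with LLAMA_POOLING_TYPE_NONE, a token embedding written out. A sketch, assuming the llama_batch_init/llama_batch_free helpers and a caller-provided token list:

#include <vector>
#include "llama.h"

// Request output only for the last token of a single-sequence prompt.
static llama_batch make_prompt_batch(const std::vector<llama_token> & tokens) {
    const int n_prompt = (int) tokens.size();
    llama_batch batch = llama_batch_init(n_prompt, /*embd=*/0, /*n_seq_max=*/1);
    for (int i = 0; i < n_prompt; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = (int8_t) (i == n_prompt - 1); // output for this token only
    }
    batch.n_tokens = n_prompt;
    return batch; // caller: llama_decode(ctx, batch); ...; llama_batch_free(batch);
}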
@ -261,7 +261,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
// Abort callback // Abort callback
@ -657,14 +657,20 @@ extern "C" {
// llama_get_logits(ctx) + i*n_vocab // llama_get_logits(ctx) + i*n_vocab
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for the input // Get all output token embeddings
// shape: [n_embd] (1-dimensional) // shape: [n_tokens*n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Get the embeddings for the ith sequence // Get the embeddings for the ith token
// llama_get_embeddings(ctx) + i*n_embd // llama_get_embeddings(ctx) + i*n_embd
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
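A hedged usage sketch of the new getter (ctx, model and n_seq are assumed to come from the usual llama.cpp setup; with MEAN or CLS pooling each sequence in the last decoded batch has exactly one pooled vector):

#include <cmath>
#include <cstdio>
#include "llama.h"

static void log_pooled_norms(llama_context * ctx, const llama_model * model, int n_seq) {
    const int32_t n_embd = llama_n_embd(model);
    for (llama_seq_id s = 0; s < n_seq; ++s) {
        const float * e = llama_get_embeddings_seq(ctx, s);
        if (e == nullptr) {
            // pooling_type was LLAMA_POOLING_TYPE_NONE: use the per-token getters
            // llama_get_embeddings_ith(ctx, i) / llama_get_embeddings(ctx) instead
            continue;
        }
        double norm = 0.0;
        for (int32_t j = 0; j < n_embd; ++j) {
            norm += (double) e[j] * e[j];
        }
        printf("seq %d: |embd| = %f\n", s, std::sqrt(norm));
    }
}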
// //
// Vocab // Vocab
// //

View file

@ -1 +1 @@
b458250b736a7473f7ff3560d47c93f1644f3290 8695910a39102609073d0e099aa7c97d6bcb3bf9

View file

@ -1412,6 +1412,50 @@ struct test_pad : public test_case {
} }
}; };
// GGML_OP_ARANGE
struct test_arange : public test_case {
const ggml_type type;
const float start;
const float stop;
const float step;
std::string vars() override {
return VARS_TO_STR4(type, start, stop, step);
}
test_arange(ggml_type type = GGML_TYPE_F32,
float start = 0.f, float stop = 10.f, float step = 1.f)
: type(type), start(start), stop(stop), step(step) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * out = ggml_arange(ctx, start, stop, step);
return out;
}
};
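For reference (hedged, based on the usual arange semantics):

// ggml_arange(ctx, start, stop, step) is expected to yield a 1-D F32 tensor with
// the values start, start + step, start + 2*step, ... strictly below stop.
// With the defaults above (start = 0, stop = 10, step = 1) that is the 10 values
// 0, 1, 2, ..., 9.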
// GGML_OP_TIMESTEP_EMBEDDING
struct test_timestep_embedding : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const int dim;
const int max_period;
std::string vars() override {
return VARS_TO_STR4(type, ne_a, dim, max_period);
}
test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
int dim = 320, int max_period=10000)
: type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
return out;
}
};
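For context, a reference-only sketch of the sinusoidal timestep embedding this op is modeled on (the common diffusion-model convention; ggml's exact output layout and odd-dim handling may differ):

#include <cmath>
#include <vector>

// One timestep t mapped to a dim-sized vector: half cosines and half sines over
// geometrically spaced frequencies derived from max_period.
static std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
    const int half = dim / 2;
    std::vector<float> out(dim, 0.0f);
    for (int i = 0; i < half; ++i) {
        const float freq = std::exp(-std::log((float) max_period) * (float) i / (float) half);
        out[i]        = std::cos(t * freq);
        out[half + i] = std::sin(t * freq);
    }
    return out;
}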
// GGML_OP_LEAKY_RELU // GGML_OP_LEAKY_RELU
struct test_leaky_relu : public test_case { struct test_leaky_relu : public test_case {
const ggml_type type; const ggml_type type;
@ -2126,6 +2170,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_group_norm()); test_cases.emplace_back(new test_group_norm());
test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_arange());
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
// these tests are disabled to save execution time, but they can be handy for debugging // these tests are disabled to save execution time, but they can be handy for debugging