Merge remote-tracking branch 'origin/master' into server_branch

This commit is contained in:
pudepiedj 2024-03-06 12:06:44 +00:00
commit 2d50a35988
36 changed files with 46082 additions and 47260 deletions

View file

@ -1,5 +1,6 @@
{
lib,
glibc,
config,
stdenv,
mkShell,
@ -30,6 +31,11 @@
useRocm ? config.rocmSupport,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic
}@inputs:
let
@ -41,10 +47,7 @@ let
versionOlder
;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices =
lib.optionals useBlas [ "BLAS" ]
@ -167,6 +170,9 @@ effectiveStdenv.mkDerivation (
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
];
buildInputs =
@ -181,7 +187,7 @@ effectiveStdenv.mkDerivation (
[
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
@ -190,6 +196,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
++ optionals useCuda [
(

View file

@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
### Hot topics

View file

@ -45,7 +45,8 @@ fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
echo "source /opt/intel/oneapi/setvars.sh"
exit 1
fi

View file

@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif()
endif()
if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")

View file

@ -513,12 +513,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_sequences = std::stoi(argv[i]);
} else if (arg == "--p-accept" || arg == "-pa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.p_accept = std::stof(argv[i]);
} else if (arg == "--p-split" || arg == "-ps") {
if (++i >= argc) {
invalid_param = true;
@ -1044,7 +1038,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
@ -1299,7 +1292,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale;

View file

@ -53,11 +53,10 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

View file

@ -295,6 +295,77 @@ static llama_token llama_sampling_sample_impl(
return id;
}
static llama_token_data_array llama_sample_probability_distribution_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
// apply grammar checks
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
llama_sample_softmax(ctx_main, &cur_p);
return cur_p;
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
@ -304,6 +375,14 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
}
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

View file

@ -131,6 +131,13 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg,
int idx = 0);
// returns the probability that token of given id will be sampled
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

View file

@ -36,8 +36,10 @@ class SentencePieceTokenTypes(IntEnum):
UNUSED = 5
BYTE = 6
AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
@ -187,6 +189,7 @@ class Model(ABC):
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
assert names
def func(modelcls: type[Model]):
for name in names:
cls._model_classes[name] = modelcls

View file

@ -1377,7 +1377,6 @@ def main(args_in: list[str] | None = None) -> None:
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
@ -1393,18 +1392,6 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
args = parser.parse_args(args_in)
if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.")
else:
tmp_model_path.mkdir(parents=True, exist_ok=True)
print("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.")
args.model = tmp_model_path
if args.dump_single:
model_plus = lazy_load_file(args.model)

View file

@ -19,11 +19,11 @@ static std::vector<std::string> split_lines(const std::string & s) {
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
}
}
static void normalize(float * vec, float * out, int n) {
static void normalize(const float * vec, float * out, int n) {
float norm = 0;
for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i];
@ -45,10 +45,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
// normalize on copy
for (int k = 0; k < n_seq; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
float * out = output + k * n_embd;
normalize(emb, out, n_embd);
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
float * out = output + batch.seq_id[i][0] * n_embd;
normalize(embd, out, n_embd);
}
}
@ -132,7 +145,7 @@ int main(int argc, char ** argv) {
// initialize batch
const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output
const int n_embd = llama_n_embd(model);
@ -145,6 +158,7 @@ int main(int argc, char ** argv) {
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
auto & inp = inputs[k];
const uint64_t n_toks = inp.size();
// encode if at capacity

34
examples/server-embd.py Normal file
View file

@ -0,0 +1,34 @@
import asyncio
import requests
import numpy as np
n = 8
result = []
async def requests_post_async(*args, **kwargs):
return await asyncio.to_thread(requests.post, *args, **kwargs)
async def main():
model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding",
json= {"content": str(i)*1024}
) for i in range(n)])
for response in responses:
embedding = response.json()["embedding"]
print(embedding[-8:])
result.append(embedding)
asyncio.run(main())
# compute cosine similarity
for i in range(n-1):
for j in range(i+1, n):
embedding1 = np.array(result[i])
embedding2 = np.array(result[j])
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
print(f"Similarity between {i} and {j}: {similarity:.2f}")

View file

@ -1308,7 +1308,7 @@ struct llama_server_context
queue_results.send(res);
}
void send_embedding(server_slot &slot)
void send_embedding(server_slot & slot, const llama_batch & batch)
{
task_result res;
res.id = slot.task_id;
@ -1317,6 +1317,7 @@ struct llama_server_context
res.stop = true;
const int n_embd = llama_n_embd(model);
if (!params.embedding)
{
LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
@ -1327,12 +1328,29 @@ struct llama_server_context
}
else
{
const float *data = llama_get_embeddings(ctx);
std::vector<float> embedding(data, data + n_embd);
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
res.result_json = json
{
{"embedding", embedding},
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
continue;
}
}
res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
};
}
}
queue_results.send(res);
}
@ -1948,7 +1966,7 @@ struct llama_server_context
ga_i += ga_w/ga_n;
}
}
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
slot_npast++;
}
@ -1984,7 +2002,7 @@ struct llama_server_context
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (auto & slot : slots)
{
@ -2061,7 +2079,7 @@ struct llama_server_context
// prompt evaluated for embedding
if (slot.embedding)
{
send_embedding(slot);
send_embedding(slot, batch_view);
slot.release();
slot.i_batch = -1;
continue;
@ -2149,6 +2167,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" --pooling {none,mean,cls}\n");
printf(" pooling type for embeddings, use model default if unspecified\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
@ -2421,6 +2441,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.yarn_beta_slow = std::stof(argv[i]);
}
else if (arg == "--pooling")
{
if (++i >= argc) {
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else { invalid_param = true; break; }
}
else if (arg == "--threads" || arg == "-t")
{
if (++i >= argc)
@ -2475,7 +2507,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
}
else if (arg == "-skvg" || arg == "--show-graphics")
{

View file

@ -6,3 +6,4 @@ More info:
- https://github.com/ggerganov/llama.cpp/pull/2926
- https://github.com/ggerganov/llama.cpp/pull/3624
- https://github.com/ggerganov/llama.cpp/pull/5625

View file

@ -5,6 +5,7 @@
#include <cstdio>
#include <string>
#include <vector>
#include <set>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -18,6 +19,7 @@ struct seq_draft {
std::vector<int> i_batch_tgt;
std::vector<llama_token> tokens;
std::vector<std::vector<llama_token_data>> dists;
struct llama_sampling_context * ctx_sampling;
};
@ -37,12 +39,15 @@ int main(int argc, char ** argv) {
// max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel;
// probability threshold for accepting a token from the draft model
const float p_accept = params.p_accept;
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split;
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
std::default_random_engine rng(params.seed);
std::uniform_real_distribution<> u_dist;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
@ -166,7 +171,9 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft);
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
if (params.sparams.temp == 0) {
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
}
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@ -182,12 +189,15 @@ int main(int argc, char ** argv) {
drafts[0].i_batch_tgt[0] = 0;
while (true) {
std::set<int> active_seqs = {};
// print current draft sequences
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
active_seqs.insert(s);
const auto & tokens = drafts[s].tokens;
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
@ -196,48 +206,156 @@ int main(int argc, char ** argv) {
int i_dft = 0;
int s_keep = 0;
llama_token token_id;
std::string token_str;
// loop until we fail to accept a drafted token or we run out of drafted tokens
while (true) {
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
// check if the target token matches any of the drafts
// for stochastic sampling, attempt to match the token with the drafted tokens
{
bool accept = false;
if (params.sparams.temp > 0) {
// stochastic verification
llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
float p_tgt = 0, p_dft = 0;
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
while (active_seqs.size() > 0) {
// randomly select a sequence to verify from active sequences
std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
int s = *std::next(active_seqs.begin(), u_int_dist(rng));
if (i_dft >= (int) drafts[s].tokens.size()) {
drafts[s].active = false;
active_seqs.erase(s);
continue;
}
if (accept) {
// if we already accepted a token, we can skip the rest
if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
drafts[s].active = false;
active_seqs.erase(s);
}
continue;
}
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
// acquire the token probabilities assigned by the draft and target models
for (size_t i = 0; i < dist_tgt.size; i++) {
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
p_tgt = dist_tgt.data[i].p;
}
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
p_dft = dist_dft.data[i].p;
}
if (p_tgt && p_dft) {
break;
}
}
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
if (r <= p_tgt / p_dft) {
s_keep = s;
accept = true;
token_id = drafts[s].tokens[i_dft];
token_str = llama_token_to_piece(ctx_tgt, token_id);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;
// calculate residual probability
GGML_ASSERT(dist_tgt.sorted);
GGML_ASSERT(dist_dft.sorted);
float sum_probs = 0.0f;
// sort dist by id
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.id < b.id;
});
std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.id < b.id;
});
for (size_t i = 0; i < dist_tgt.size; i++) {
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
sum_probs += dist_tgt.data[i].p;
}
for (size_t i = 0; i < dist_tgt.size; i++) {
dist_tgt.data[i].p /= sum_probs;
}
// sort dist_tgt by p desc
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
return a.p > b.p;
});
}
active_seqs.erase(s);
for(int i = 0; i < n_seq_dft; i++) {
if (i == s) {
continue;
}
if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
// synchronize active status for sequences with the same drafted token
drafts[i].active = drafts[i].active && accept;
if (!drafts[i].active) {
active_seqs.erase(s);
}
}
}
}
if (!accept) {
// all drafted tokens were rejected
// sample from the target model
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
token_str = llama_token_to_piece(ctx_tgt, token_id);
}
} else {
// greedy verification
// sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
if (!params.use_color) {
printf("%s", token_str.c_str());
}
if (id == llama_token_eos(model_tgt)) {
has_eos = true;
}
++n_predict;
// check if the target token matches any of the drafts
{
bool matches = false;
token_str = llama_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
s_keep = s;
matches = true;
accept = true;
} else {
drafts[s].active = false;
}
}
}
if (matches) {
if (token_id == llama_token_eos(model_tgt)) {
has_eos = true;
}
++n_predict;
if (accept) {
++n_accept;
++n_past_tgt;
++n_past_dft;
@ -245,17 +363,21 @@ int main(int argc, char ** argv) {
if (params.use_color) {
// Color token according to its origin sequence
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
fflush(stdout);
}
continue;
}
}
if (params.use_color) {
} else {
printf("%s", token_str.c_str());
}
fflush(stdout);
continue;
} else {
printf("%s", token_str.c_str());
fflush(stdout);
break;
}
}
}
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
{
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
// TODO: simplify
{
@ -275,21 +397,21 @@ int main(int argc, char ** argv) {
drafts[s].active = false;
drafts[s].tokens.clear();
drafts[s].i_batch_tgt.clear();
drafts[s].dists.clear();
}
// note: will be erased after the speculation phase
drafts[0].tokens.push_back(id);
drafts[0].tokens.push_back(token_id);
drafts[0].dists.push_back(std::vector<llama_token_data>());
drafts[0].i_batch_tgt.push_back(0);
llama_batch_clear(batch_dft);
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft);
llama_decode(ctx_dft, batch_dft);
++n_past_dft;
break;
}
if (n_predict > params.n_predict || has_eos) {
@ -334,12 +456,6 @@ int main(int argc, char ** argv) {
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
}
if (cur_p[0].p < p_accept) {
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
drafts[s].drafting = false;
continue;
}
std::vector<int> sa(1, s);
// attempt to split the branch if the probability is high enough
@ -367,6 +483,7 @@ int main(int argc, char ** argv) {
drafts[n_seq_cur].skip = true;
drafts[n_seq_cur].tokens = drafts[s].tokens;
drafts[n_seq_cur].dists = drafts[s].dists;
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
@ -389,6 +506,8 @@ int main(int argc, char ** argv) {
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists
drafts[s].dists.push_back(cur_p);
// add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@ -440,6 +559,7 @@ int main(int argc, char ** argv) {
}
drafts[s].tokens.erase(drafts[s].tokens.begin());
drafts[s].dists.erase(drafts[s].dists.begin());
}
}

View file

@ -91,13 +91,14 @@ extern "C" {
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan
// create a plan for ggml_cgraph and free it
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

View file

@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
backend->iface.graph_plan_free(backend, plan);
}
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
backend->iface.graph_plan_compute(backend, plan);
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
return backend->iface.graph_plan_compute(backend, plan);
}
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
}
@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
GGML_UNUSED(backend);
}
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
GGML_UNUSED(backend);
}
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
ggml_graph_compute(cgraph, &cplan);
return true;
return ggml_graph_compute(cgraph, &cplan);
}
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
return true;
}
static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) {
if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
return false;
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
} else {
@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
if (!ggml_backend_graph_compute(split_backend, &gv)) {
return false;
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
}
#endif
return true;
return GGML_STATUS_SUCCESS;
}
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return true;
}
bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
if (!sched->is_reset) {
@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) {
return false;
return GGML_STATUS_ALLOC_FAILED;
}
if (!ggml_backend_sched_compute_splits(sched)) {
return false;
}
return true;
return ggml_backend_sched_compute_splits(sched);
}
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

View file

@ -66,12 +66,13 @@ extern "C" {
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@ -170,7 +171,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

View file

@ -12241,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
UNUSED(backend);
}
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_main_device(cuda_ctx->device);
@ -12277,7 +12277,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
GGML_ASSERT(ok);
}
return true;
return GGML_STATUS_SUCCESS;
}
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {

View file

@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
return ggml_backend_kompute_buffer_type(ctx->device);
}
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph);
return true;
return GGML_STATUS_SUCCESS;
}
static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

View file

@ -748,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}
}
static bool ggml_metal_graph_compute(
static enum ggml_status ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
@ -2484,7 +2484,7 @@ static bool ggml_metal_graph_compute(
MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
return false;
return GGML_STATUS_FAILED;
}
}
@ -2493,7 +2493,7 @@ static bool ggml_metal_graph_compute(
}
}
return true;
return GGML_STATUS_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
@ -2795,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
UNUSED(backend);
}
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
return ggml_metal_graph_compute(metal_ctx, cgraph);

View file

@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
GGML_UNUSED(backend);
}
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i];
switch (node->op) {
@ -2246,7 +2246,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap
}
}
return true;
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
}

View file

@ -51,6 +51,7 @@
#define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@ -463,8 +464,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
}
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
int8x16_t res;
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
uint8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
__m256i signs;
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@ -10551,9 +10552,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
@ -10661,9 +10662,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -10,6 +10,7 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);

30
ggml.c
View file

@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
const char * ggml_status_to_string(enum ggml_status status) {
switch (status) {
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
case GGML_STATUS_SUCCESS: return "GGML status: success";
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
}
return "GGML status: unknown";
}
// note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) {
@ -17400,6 +17411,7 @@ struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
struct ggml_compute_state_shared * shared;
enum ggml_status ec;
};
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@ -17693,7 +17705,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
return (thread_ret_t) GGML_EXIT_ABORTED;
state->ec = GGML_STATUS_ABORTED;
return 0;
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@ -17815,7 +17828,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}
return GGML_EXIT_SUCCESS;
return 0;
}
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@ -18011,7 +18024,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
return cplan;
}
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
{
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
@ -18055,6 +18068,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
};
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@ -18065,12 +18079,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
workers[0].ith = 0;
workers[0].shared = &state_shared;
workers[0].ec = GGML_STATUS_SUCCESS;
const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();
// this is a work thread too
int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
ggml_graph_compute_thread(&workers[0]);
enum ggml_status compute_status = workers[0].ec;
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
@ -18080,6 +18096,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
if (workers[j].ec != GGML_STATUS_SUCCESS)
compute_status = workers[j].ec;
}
}
@ -18107,14 +18125,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
return compute_status;
}
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_compute(cgraph, &cplan);
return ggml_graph_compute(cgraph, &cplan);
}
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {

15
ggml.h
View file

@ -315,6 +315,16 @@
extern "C" {
#endif
enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
GGML_STATUS_SUCCESS = 0,
GGML_STATUS_ABORTED = 1,
};
// get ggml_status name string
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
typedef uint16_t ggml_fp16_t;
// convert FP16 <-> FP32
@ -1941,11 +1951,10 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

File diff suppressed because it is too large Load diff

View file

@ -15,7 +15,7 @@ array ::=
string ::=
"\"" (
[^"\\] |
[^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws

View file

@ -24,7 +24,7 @@ array ::=
string ::=
"\"" (
[^"\\] |
[^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws

291
llama.cpp
View file

@ -1682,7 +1682,9 @@ struct llama_cparams {
float yarn_beta_slow;
float defrag_thold;
bool embeddings;
bool offload_kqv;
enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval;
@ -1972,7 +1974,7 @@ struct llama_context {
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab])
// logits output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
#ifndef NDEBUG
// guard against access to unset logits
@ -1980,8 +1982,13 @@ struct llama_context {
#endif
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// embeddings output (2-dimensional array: [n_tokens][n_embd])
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
std::vector<float> embd;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
@ -5007,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
}
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
#if defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
if (hparams.f_max_alibi_bias > 0.0f) {
@ -5092,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
@ -6085,6 +6093,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
@ -6092,6 +6101,7 @@ struct llm_build_context {
// get input vectors with right size
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
@ -6112,39 +6122,38 @@ struct llm_build_context {
cb(inpL, "inp_norm", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
// iterate layers
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur = inpL;
struct ggml_tensor * Qcur;
struct ggml_tensor * Kcur;
struct ggml_tensor * Vcur;
// self-attention
if (model.arch == LLM_ARCH_BERT) {
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
cb(Vcur, "Vcur", il);
// seems like we just need to do this for Q?
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
} else {
// compute Q and K and RoPE them
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
@ -6163,13 +6172,41 @@ struct llm_build_context {
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
cb(v, "v", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
cb(cur, "kqv_merged_cont", il);
ggml_build_forward_expand(gf, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
if (model.layers[il].bo) {
cb(cur, "kqv_wo", il);
}
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "kqv_out", il);
// re-add the layer input
cur = ggml_add(ctx0, cur, inpL);
@ -6209,16 +6246,29 @@ struct llm_build_context {
// final output
cur = inpL;
cb(cur, "result_embd", -1);
// pooling layer
if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
switch (pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
// nop
} break;
case LLAMA_POOLING_TYPE_MEAN:
{
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
} else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_CLS:
{
cur = ggml_get_rows(ctx0, cur, inp_cls);
} else {
GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "Invalid pooling type");
} break;
}
cb(cur, "result_embd", -1);
ggml_build_forward_expand(gf, cur);
@ -7980,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
}
{
if (hparams.causal_attn) {
const int64_t n_kv = kv_self.n;
const int64_t n_tokens = batch.n_tokens;
@ -7995,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
for (int i = 0; i < n_kv; ++i) {
float f;
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
f = -INFINITY;
} else {
f = 0;
f = 0.0f;
}
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
}
}
}
} else {
// non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
const int64_t n_tokens = batch.n_tokens;
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
float * data = (float *) lctx.inp_KQ_mask->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
const llama_seq_id seq_id = batch.seq_id[j][0];
for (int i = 0; i < n_tokens; ++i) {
float f = -INFINITY;
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) {
f = 0.0f;
break;
}
}
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
}
}
}
}
if (hparams.need_kq_pos) {
@ -8023,13 +8097,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
float * data = (float *) lctx.inp_mean->data;
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
sum[seq_id] += 1;
}
@ -8051,11 +8128,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
if (pos == 0) {
data[seq_id] = i;
}
@ -8169,6 +8251,8 @@ static int llama_decode_internal(
batch.seq_id = seq_id_arr.data();
}
// non-causal masks do not use the KV cache
if (hparams.causal_attn) {
llama_kv_cache_update(&lctx);
// if we have enough unused cells before the current head ->
@ -8186,7 +8270,8 @@ static int llama_decode_internal(
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
// line above and below originally commented out
}
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
ggml_backend_sched_reset(lctx.sched);
@ -8196,19 +8281,25 @@ static int llama_decode_internal(
// the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
if (strcmp(res->name, "result_output") == 0) {
// the embeddings could be the second to last tensor, or the third to last tensor
if (strcmp(embeddings->name, "result_norm") != 0) {
embeddings = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}
} else if (strcmp(res->name, "result_embd") == 0) {
embeddings = res;
res = nullptr;
if (!hparams.causal_attn) {
res = nullptr; // do not extract logits for embedding models such as BERT
// token or sequence embeddings
embd = gf->nodes[gf->n_nodes - 1];
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
} else {
GGML_ASSERT(false);
if (strcmp(res->name, "result_output") == 0) {
// the token embeddings could be the second to last tensor, or the third to last tensor
if (strcmp(embd->name, "result_norm") != 0) {
embd = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
}
} else {
GGML_ASSERT(false && "missing result_output tensor");
}
}
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@ -8275,46 +8366,82 @@ static int llama_decode_internal(
logits_out.clear();
#endif
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
GGML_ASSERT(res_backend != nullptr);
ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
GGML_ASSERT(backend_res != nullptr);
if (batch.logits) {
logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) {
continue;
}
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
logits_valid[i] = true;
#endif
}
} else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
#ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
} else {
logits_out.resize(n_vocab);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
logits_valid[0] = true;
#endif
}
ggml_backend_synchronize(res_backend);
ggml_backend_synchronize(backend_res);
}
// extract embeddings
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
if (cparams.embeddings && embd) {
ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
switch (cparams.pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
auto & embd_out = lctx.embd;
embedding_out.resize(embd_size);
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
ggml_backend_synchronize(embeddings_backend);
if (batch.logits) {
embd_out.resize(n_embd * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) {
continue;
}
ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
}
}
} break;
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_MEAN:
{
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
// extract sequence embeddings
auto & embd_seq_out = lctx.embd_seq;
embd_seq_out.clear();
for (uint32_t i = 0; i < n_tokens; i++) {
const llama_seq_id seq_id = batch.seq_id[i][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
continue;
}
embd_seq_out[seq_id].resize(n_embd);
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
}
ggml_backend_synchronize(backend_embd);
}
// measure the performance only for the single-token evals
@ -11864,7 +11991,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.embedding =*/ false,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
@ -12015,6 +12142,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.pooling_type = params.pooling_type;
@ -12192,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
// resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
if (params.embedding) {
ctx->embedding.resize(hparams.n_embd);
if (params.embeddings) {
ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
}
// graph inputs
@ -12628,7 +12756,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
// assume worst case for logits although only currently set ones are serialized
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
const size_t s_embedding_size = sizeof(size_t);
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
const size_t s_kv_buf_size = sizeof(size_t);
const size_t s_kv_head = sizeof(uint32_t);
const size_t s_kv_size = sizeof(uint32_t);
@ -12737,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// copy embeddings
{
const size_t embedding_size = ctx->embedding.size();
const size_t embeddings_size = ctx->embd.size();
data_ctx->write(&embedding_size, sizeof(embedding_size));
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
if (embedding_size) {
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
if (embeddings_size) {
data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
}
}
@ -12846,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
// set embeddings
{
size_t embedding_size;
size_t embeddings_size;
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
if (embedding_size) {
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
inp += embedding_size * sizeof(float);
if (embeddings_size) {
ctx->embd.resize(embeddings_size);
memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
inp += embeddings_size * sizeof(float);
}
}
@ -13104,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
}
float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
return ctx->embd.data();
}
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
return ctx->embd.data() + i*ctx->model.hparams.n_embd;
}
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
auto it = ctx->embd_seq.find(seq_id);
if (it == ctx->embd_seq.end()) {
return nullptr;
}
return it->second.data();
}
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {

18
llama.h
View file

@ -163,7 +163,7 @@ extern "C" {
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits for the respective token will not be output
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
//
typedef struct llama_batch {
int32_t n_tokens;
@ -173,7 +173,7 @@ extern "C" {
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits;
int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below
@ -260,7 +260,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
// Abort callback
@ -655,14 +655,20 @@ extern "C" {
// llama_get_logits(ctx) + i*n_vocab
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
// Get all output token embeddings
// shape: [n_tokens*n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Get the embeddings for the ith sequence
// Get the embeddings for the ith token
// llama_get_embeddings(ctx) + i*n_embd
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
// Vocab
//

View file

@ -18,7 +18,7 @@ except ImportError as e:
KEY_PROPERTIES = [
"cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
"type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
"type_k", "type_v", "no_kv_offload", "tensor_split", "n_prompt", "n_gen"
]
# Properties that are boolean and are converted to Yes/No for the table:

View file

@ -1 +1 @@
274680868e12427373bab4bec87554431b954704
8695910a39102609073d0e099aa7c97d6bcb3bf9