Merge branch 'master' into xsn/improve_server_works
This commit is contained in:
commit
85c0334084
11 changed files with 728 additions and 318 deletions
3
.github/workflows/build.yml
vendored
3
.github/workflows/build.yml
vendored
|
@ -669,8 +669,7 @@ jobs:
|
|||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||
./gradlew build --no-daemon
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
|
|
2
Makefile
2
Makefile
|
@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
|||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||
ifndef CUDA_DOCKER_ARCH
|
||||
ifndef CUDA_POWER_ARCH
|
||||
|
|
|
@ -155,6 +155,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||
- [Msty](https://msty.app) (proprietary)
|
||||
|
||||
---
|
||||
|
||||
|
|
|
@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
|
|||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
|
|
@ -21,12 +21,8 @@ android {
|
|||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||
}
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
|
|
|
@ -548,8 +548,8 @@ int main(int argc, char ** argv) {
|
|||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
@ -576,9 +576,9 @@ int main(int argc, char ** argv) {
|
|||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
n_past -= bd;
|
||||
|
||||
|
|
|
@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
|
|||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
|
@ -146,10 +146,11 @@ int main(int argc, char ** argv) {
|
|||
const int ib = i/n_batch - 1;
|
||||
const int bd = n_batch_grp*(n_grp - 1);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= bd;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
@ -179,10 +180,12 @@ int main(int argc, char ** argv) {
|
|||
|
||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
|
@ -208,10 +211,12 @@ int main(int argc, char ** argv) {
|
|||
if (n_discard > 0) {
|
||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -670,10 +670,24 @@ struct llama_server_context
|
|||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0)
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
|
@ -1396,8 +1410,8 @@ struct llama_server_context
|
|||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()}
|
||||
});
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
|
@ -1559,6 +1573,14 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
|
||||
// the last token of the cache is not in the KV cache until the next call to llama_decode
|
||||
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
|
||||
if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
|
||||
{
|
||||
slot.n_past -= 1;
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
||||
|
||||
if (slot.ga_n != 1)
|
||||
|
@ -1661,9 +1683,34 @@ struct llama_server_context
|
|||
if (slot.ga_n != 1)
|
||||
{
|
||||
// context extension via Self-Extend
|
||||
<<<<<<< HEAD
|
||||
// TODO @ngxson: What happen if we're retrying with smaller n_batch?
|
||||
// By the second time we retry, "grp_attn_shift" has already been called
|
||||
slot.grp_attn_shift(ctx, n_tokens);
|
||||
=======
|
||||
while (slot.n_past_se >= slot.ga_i + slot.ga_w)
|
||||
{
|
||||
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
|
||||
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
|
||||
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
|
||||
slot.n_past_se -= bd;
|
||||
|
||||
slot.ga_i += slot.ga_w / slot.ga_n;
|
||||
|
||||
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
|
||||
}
|
||||
slot.n_past_se += n_tokens;
|
||||
>>>>>>> master
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
|
@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -9488,8 +9513,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
qs += 16;
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
||||
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
|
@ -9497,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
||||
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
|
|
34
llama.h
34
llama.h
|
@ -64,6 +64,15 @@ extern "C" {
|
|||
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
||||
enum llama_rope_type {
|
||||
LLAMA_ROPE_TYPE_NONE = -1,
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = 2,
|
||||
LLAMA_ROPE_TYPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llama_token_type {
|
||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||
|
@ -360,6 +369,7 @@ extern "C" {
|
|||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
|
@ -514,10 +524,12 @@ extern "C" {
|
|||
llama_seq_id seq_id);
|
||||
|
||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_shift(
|
||||
LLAMA_API void llama_kv_cache_seq_add(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
|
@ -525,7 +537,9 @@ extern "C" {
|
|||
llama_pos delta);
|
||||
|
||||
// Integer division of the positions by factor of `d > 1`
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_div(
|
||||
|
@ -535,6 +549,20 @@ extern "C" {
|
|||
llama_pos p1,
|
||||
int d);
|
||||
|
||||
// Returns the largest position present in the KV cache for the specified sequence
|
||||
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Defragment the KV cache
|
||||
// This will be applied:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
|
||||
|
||||
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
||||
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue