Merge branch 'ggerganov:master' into bitnet

This commit is contained in:
Eddie-Wang 2024-06-21 16:19:59 +08:00 committed by GitHub
commit 0520d88edf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 876 additions and 98 deletions

View file

@ -6,7 +6,6 @@
#include "llama.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <codecvt>
@ -542,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
else { invalid_param = true; }
return true;
}
@ -1870,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
if (llama_supports_mlock()) {
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
}
@ -2657,7 +2658,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
}
// Set the output file
std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
struct FILE_deleter {
void operator()(FILE * f) const {
fclose(f);
}
};
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;

View file

@ -214,7 +214,7 @@ src_func = f"""
"""
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
@ -222,7 +222,7 @@ convert_py = re.sub(
flags=re.DOTALL | re.MULTILINE,
)
convert_py_pth.write_text(convert_py)
convert_py_pth.write_text(convert_py, encoding="utf-8")
logger.info("+++ convert-hf-to-gguf.py was updated")

View file

@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
return lines;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}
@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
float * out = output + batch.seq_id[i][0] * n_embd;
//TODO: I would also add a parameter here to enable normalization or not.
@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);

View file

@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// run model
@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
llama_token eos_token = llama_token_eos(mdl);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
// create new context - set to embedding mode
cparams.embeddings = true;
// create generation context
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
// ### Embedding/Representation ###

View file

@ -131,23 +131,30 @@ class LlamaState: ObservableObject {
messageLog += "\(text)"
Task.detached {
while await llamaContext.n_cur < llamaContext.n_len {
let result = await llamaContext.completion_loop()
messageLog += "\(result)"
await MainActor.run {
self.messageLog += "\(result)"
}
}
let t_end = DispatchTime.now().uptimeNanoseconds
let t_generation = Double(t_end - t_heat_end) / NS_PER_S
let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
await llamaContext.clear()
messageLog += """
await MainActor.run {
self.messageLog += """
\n
Done
Heat up took \(t_heat)s
Generated \(tokens_per_second) t/s\n
"""
}
}
}
func bench() async {
guard let llamaContext else {

View file

@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
return chunks;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}
@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);

View file

@ -8814,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
#endif
}
#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
static const int8_t keven_signs_q2xs[1024] = {
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@ -8947,6 +8947,61 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
*s = 0.125f * hsum_float_8(accumf);
#elif defined(__AVX__)
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
uint32_t aux32[4];
const uint8_t * aux8 = (const uint8_t *)aux32;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
const uint16_t ls1 = aux32[1] >> 28;
const uint16_t ls2 = aux32[3] >> 28;
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
}
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
}
*s = 0.125f * hsum_float_8(accumf);
#elif defined(__POWER9_VECTOR__)
const vector int v0 = vec_splats((int32_t)0);
vector float vsumf0 = vec_splats(0.0f);
@ -9290,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
}
*s = 0.125f * hsum_float_8(accumf);
#elif defined(__AVX__)
const __m128i mone = _mm_set1_epi8(1);
static const char block_sign_shuffle_mask_1[32] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
};
static const char block_sign_shuffle_mask_2[32] = {
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
};
static const uint8_t bit_selector_mask_bytes[32] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
static const uint8_t k_bit_helper[32] = {
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
};
const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
const __m128i m511 = _mm_set1_epi16(511);
const __m128i m4 = _mm_set1_epi8(0xf);
const __m128i m1 = _mm_set1_epi8(1);
uint64_t aux64;
// somewhat hacky, but gives a significant boost in performance
__m256i aux_gindex;
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
__m128i stmp = _mm_set1_epi64x(aux64);
stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
// AVX2 full_signs_1 is full_sign_bits_0 here
// AVX2 full_signs_2 is full_sign_bits_1 here
__m128i signs_0, signs_1;
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
__m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
}
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
}
*s = 0.125f * hsum_float_8(accumf);
#elif defined(__loongarch_asx)
const __m256i mone = __lasx_xvreplgr2vr_b(1);
@ -9693,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
*s = 0.125f * hsum_float_8(accumf);
#elif defined(__AVX__)
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
};
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
const __m128i m4 = _mm_set1_epi8(0xf);
const __m128i m1 = _mm_set1_epi8(1);
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
uint64_t aux64;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict qs = x[i].qs;
const uint8_t * restrict qh = x[i].qh;
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
const int8_t * restrict q8 = y[i].qs;
memcpy(&aux64, x[i].scales, 8);
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
qs += 8;
__m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
__m128i aux128_1 = aux128_0;
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
aux128_1 = aux128_0;
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
signs += 4;
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
}
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
}
*s = 0.125f * hsum_float_8(accumf);
#elif defined(__POWER9_VECTOR__)
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
@ -10019,6 +10325,63 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
*s = 0.25f * hsum_float_8(accumf);
#elif defined(__AVX__)
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
uint32_t aux32[2];
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q3 = x[i].qs;
const uint8_t * restrict gas = x[i].qs + QK_K/4;
const int8_t * restrict q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
q3 += 8;
const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
q3 += 8;
memcpy(aux32, gas, 8); gas += 8;
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
const uint16_t ls1 = aux32[0] >> 28;
const uint16_t ls2 = aux32[1] >> 28;
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
}
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
}
*s = 0.25f * hsum_float_8(accumf);
#elif defined(__POWER9_VECTOR__)
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
@ -10370,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
*s = hsum_float_8(accumf);
#elif defined(__AVX__)
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
};
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
const __m128i idx_mask = _mm_set1_epi32(256);
typedef union {
__m128i vec[4];
uint32_t index[16];
} index_t;
index_t idx;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict qs = x[i].qs;
const uint8_t * restrict qh = x[i].qh;
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
const int8_t * restrict q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
idx.vec[1] = idx.vec[0];
idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
idx.vec[3] = idx.vec[2];
idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
__m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
__m128i aux128_1 = aux128_0;
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
aux128_1 = aux128_0;
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
signs += 4;
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
}
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
}
*s = hsum_float_8(accumf);
#elif defined(__POWER9_VECTOR__)
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
@ -10607,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
}
#if defined(__AVX__)
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
const __m128i ax = _mm_sign_epi8(x, x);
const __m128i sy = _mm_sign_epi8(y, x);
return _mm_maddubs_epi16(ax, sy);
}
#endif
#if defined(__AVX2__)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i ax = _mm256_sign_epi8(x, x);
@ -10724,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
#elif defined __AVX__
__m256 accum = _mm256_setzero_ps();
float accum1 = 0;
for (int i = 0; i < nb; ++i) {
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint16_t * qh = x[i].qh;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
int sumi1 = 0;
for (int ib = 0; ib < QK_K/32; ib += 2) {
const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
qs += 8;
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
accum1 += d * sumi1;
}
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
#elif defined(__POWER9_VECTOR__)
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
@ -11062,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
#elif defined __AVX__
const __m128i mask = _mm_set1_epi16(0x7);
const __m128i mone = _mm_set1_epi16(1);
__m256 accum1 = _mm256_setzero_ps();
__m256 accum2 = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
const uint16_t * sc = (const uint16_t *)x[i].scales;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib = 0; ib < QK_K/32; ib += 2) {
const __m128i q1b_1_0 = _mm_set_epi64x(
iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
const __m128i q1b_1_1 = _mm_set_epi64x(
iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
const __m128i q1b_2_0 = _mm_set_epi64x(
iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
const __m128i q1b_2_1 = _mm_set_epi64x(
iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
__m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
__m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
__m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
__m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
qs += 8; qh += 4;
}
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
}
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
#else
int sum1[2], sum2[2], delta[4];
@ -11192,6 +11803,44 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
*s = hsum_float_8(_mm256_add_ps(accum1, accum2));
#elif defined __AVX__
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
const __m128i m4b = _mm_set1_epi8(0x0f);
const __m128i mone = _mm_set1_epi16(1);
__m256 accum1 = _mm256_setzero_ps();
__m256 accum2 = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
y += 2;
x += 2;
}
*s = hsum_float_8(_mm256_add_ps(accum1, accum2));
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector signed int v0 = vec_splats((int32_t)0);
@ -11382,6 +12031,54 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
*s = hsum_float_8(accum);
#elif defined __AVX__
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
const __m128i m4b = _mm_set1_epi8(0x0f);
__m256 accum = _mm256_setzero_ps();
for (int ibl = 0; ibl < nb; ++ibl) {
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;
uint16_t sh = x[ibl].scales_h;
__m128i sumi1_0 = _mm_setzero_si128();
__m128i sumi1_1 = _mm_setzero_si128();
__m128i sumi2_0 = _mm_setzero_si128();
__m128i sumi2_1 = _mm_setzero_si128();
for (int ib = 0; ib < QK_K/32; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
sh >>= 4;
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
}
__m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
__m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
_mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
}
*s = hsum_float_8(accum);
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector int v0 = vec_splats((int32_t)0);

169
llama.cpp
View file

@ -2326,6 +2326,8 @@ struct llama_vocab {
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
@ -4981,6 +4983,7 @@ static void llm_load_vocab(
GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
vocab.token_to_id[word] = i;
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
auto & token_data = vocab.id_to_token[i];
token_data.text = std::move(word);
@ -5291,6 +5294,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
if (model.arch == LLM_ARCH_DEEPSEEK2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@ -7732,6 +7737,50 @@ struct llm_build_context {
return lctx.inp_s_seq;
}
struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
// find result_norm tensor for input
struct ggml_tensor * inp = nullptr;
for (int i = gf->n_nodes - 1; i >= 0; --i) {
inp = gf->nodes[i];
if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
break;
} else {
inp = nullptr;
}
}
GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
struct ggml_tensor * cur;
switch (pooling_type) {
case LLAMA_POOLING_TYPE_MEAN:
{
struct ggml_tensor * inp_mean = build_inp_mean();
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
} break;
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_LAST:
{
struct ggml_tensor * inp_cls = build_inp_cls();
cur = ggml_get_rows(ctx0, inp, inp_cls);
} break;
case LLAMA_POOLING_TYPE_NONE:
{
cur = inp;
} break;
default:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
}
cb(cur, "result_embd_pooled", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@ -8712,8 +8761,6 @@ struct llm_build_context {
if (model.arch != LLM_ARCH_JINA_BERT_V2) {
inp_pos = build_inp_pos();
}
struct ggml_tensor * inp_mean = build_inp_mean();
struct ggml_tensor * inp_cls = build_inp_cls();
// construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@ -8888,28 +8935,6 @@ struct llm_build_context {
cur = inpL;
cb(cur, "result_embd", -1);
// pooling layer
switch (pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
// nop
} break;
case LLAMA_POOLING_TYPE_MEAN:
{
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_CLS:
{
cur = ggml_get_rows(ctx0, cur, inp_cls);
cb(cur, "result_embd_pooled", -1);
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "Invalid pooling type");
} break;
}
ggml_build_forward_expand(gf, cur);
return gf;
@ -12142,6 +12167,11 @@ static struct ggml_cgraph * llama_build_graph(
GGML_ASSERT(false);
}
// add on pooling layer
if (lctx.cparams.embeddings) {
result = llm.append_pooling(result);
}
llm.free();
return result;
@ -12231,7 +12261,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
// (!a || b) is a logical implication (a -> b)
// !hparams.causal_attn -> !cparams.causal_attn
(hparams.causal_attn || !cparams.causal_attn) &&
"causal attention with embedding models is not supported"
"causal attention is not supported by this model"
);
if (lctx.inp_KQ_mask) {
@ -12363,6 +12393,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
}
if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(lctx.inp_cls);
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
std::vector<int> last_pos(n_tokens, -1);
std::vector<int> last_row(n_tokens, -1);
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i];
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
if (pos >= last_pos[seq_id]) {
last_pos[seq_id] = pos;
last_row[seq_id] = i;
}
}
for (int i = 0; i < n_tokens; ++i) {
if (last_row[i] >= 0) {
data[i] = last_row[i];
}
}
}
if (kv_self.recurrent) {
const int64_t n_kv = kv_self.n;
@ -12424,8 +12485,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
const bool has_logits = cparams.causal_attn;
const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const bool has_logits = !cparams.embeddings;
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@ -12555,11 +12616,13 @@ static int llama_decode_internal(
std::vector<std::vector<llama_seq_id>> seq_id;
// count outputs
if (batch_all.logits) {
if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
n_outputs = n_tokens_all;
} else if (batch_all.logits) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs += batch_all.logits[i] != 0;
}
} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
} else if (lctx.logits_all) {
n_outputs = n_tokens_all;
} else {
// keep last output only
@ -12690,30 +12753,13 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
} else if (!hparams.causal_attn) {
res = nullptr; // do not extract logits for embedding models such as BERT
// token or sequence embeddings
embd = gf->nodes[gf->n_nodes - 1];
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
} else if (cparams.embeddings) {
// the embeddings could be in the second to last tensor, or any of the previous tensors
int i_embd = gf->n_nodes - 2;
for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
i_embd = gf->n_nodes - i;
if (i_embd < 0) { break; }
embd = gf->nodes[i_embd];
}
GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
// TODO: use a per-batch flag to know when to skip logits while keeping embeddings
if (!cparams.causal_attn) {
res = nullptr; // do not extract logits when not needed
// skip computing logits
// TODO: is this safe?
gf->n_nodes = i_embd + 1;
res = nullptr; // do not extract logits for embedding case
embd = gf->nodes[gf->n_nodes - 1];
if (strcmp(embd->name, "result_embd_pooled") != 0) {
embd = gf->nodes[gf->n_nodes - 2];
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@ -12782,11 +12828,10 @@ static int llama_decode_internal(
ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_MEAN:
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_LAST:
{
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
// extract sequence embeddings
auto & embd_seq_out = lctx.embd_seq;
embd_seq_out.clear();
@ -13679,7 +13724,7 @@ private:
struct llm_tokenizer_wpm {
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
const auto & token_map = vocab.token_to_id;
// normalize and split by whitespace
@ -13688,7 +13733,7 @@ struct llm_tokenizer_wpm {
// bos token prepended already
// find the longest tokens that form the words
for (const std::string &word : words) {
for (const std::string & word : words) {
// skip empty words
if (word.size() == 0) {
continue;
@ -13705,7 +13750,7 @@ struct llm_tokenizer_wpm {
for (int i = 0; i < n; ++i) {
// loop through possible match length
bool match = false;
for (int j = n; j > i; j--) {
for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
auto it = token_map.find(word1.substr(i, j - i));
if (it != token_map.end()) {
output.push_back(it->second);
@ -13728,7 +13773,8 @@ struct llm_tokenizer_wpm {
}
}
std::vector<std::string> preprocess(const std::string & text) {
// TODO: reduce string copies by using cpts_offs array
std::vector<std::string> preprocess(const std::string & text) const {
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
std::vector<std::string> words(1, "");
@ -14023,6 +14069,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
output.push_back(vocab.special_cls_id);
}
llm_tokenizer_wpm tokenizer(vocab);
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@ -14030,7 +14078,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
llm_tokenizer_wpm tokenizer(vocab);
tokenizer.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
@ -18344,6 +18391,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
ctx->abort_callback_data = abort_callback_data;
}
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
ctx->cparams.embeddings = embeddings;
}
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

View file

@ -174,6 +174,7 @@ extern "C" {
LLAMA_POOLING_TYPE_NONE = 0,
LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3,
};
enum llama_split_mode {
@ -293,7 +294,6 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
// (ignored if no pooling layer)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@ -786,6 +786,10 @@ extern "C" {
// Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings model or not
// If true, embeddings will be returned but logits will not
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
// Set whether to use causal attention or not
// If set to true, the model will only attend to the past tokens
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);

View file

@ -1,2 +1,2 @@
-r ./requirements-convert-legacy-llama.txt
torch~=2.1.1
torch~=2.2.1

View file

@ -1,2 +1,2 @@
-r ./requirements-convert-legacy-llama.txt
torch~=2.1.1
torch~=2.2.1

View file

@ -1,4 +1,4 @@
numpy~=1.24.4
numpy~=1.26.4
sentencepiece~=0.2.0
transformers>=4.40.1,<5.0.0
gguf>=0.1.0

View file

@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));