Merge branch 'master' into compilade/refactor-kv-cache
This commit is contained in:
commit
10c3c419e9
518 changed files with 78202 additions and 66427 deletions
|
@ -7,23 +7,30 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static std::vector<std::string> split_lines(const std::string & s) {
|
||||
std::string line;
|
||||
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
|
||||
std::vector<std::string> lines;
|
||||
std::stringstream ss(s);
|
||||
while (std::getline(ss, line)) {
|
||||
lines.push_back(line);
|
||||
size_t start = 0;
|
||||
size_t end = s.find(separator);
|
||||
|
||||
while (end != std::string::npos) {
|
||||
lines.push_back(s.substr(start, end - start));
|
||||
start = end + separator.length();
|
||||
end = s.find(separator, start);
|
||||
}
|
||||
|
||||
lines.push_back(s.substr(start)); // Add the last part
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
||||
for (size_t i = 0; i < tokens.size(); i++) {
|
||||
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||
size_t n_tokens = tokens.size();
|
||||
for (size_t i = 0; i < n_tokens; i++) {
|
||||
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_past_clear(ctx);
|
||||
|
||||
|
@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||
|
||||
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
if (embd == NULL) {
|
||||
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
|
||||
|
||||
float * out = output + batch.seq_id[i][0] * n_embd;
|
||||
//TODO: I would also add a parameter here to enable normalization or not.
|
||||
/*fprintf(stdout, "unnormalized_embedding:");
|
||||
for (int hh = 0; hh < n_embd; hh++) {
|
||||
fprintf(stdout, "%9.6f ", embd[hh]);
|
||||
}
|
||||
fprintf(stdout, "\n");*/
|
||||
llama_embd_normalize(embd, out, n_embd);
|
||||
llama_embd_normalize(embd, out, n_embd, embd_norm);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
|
|||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
|
@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// split the prompt into lines
|
||||
std::vector<std::string> prompts = split_lines(params.prompt);
|
||||
std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
|
||||
|
||||
// max batch size
|
||||
const uint64_t n_batch = params.n_batch;
|
||||
|
@ -169,7 +170,7 @@ int main(int argc, char ** argv) {
|
|||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
llama_batch_clear(batch);
|
||||
p += s;
|
||||
s = 0;
|
||||
|
@ -182,29 +183,78 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// final batch
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
|
||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||
fprintf(stdout, "\n");
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
fprintf(stdout, "embedding %d: ", j);
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
if (params.embd_out.empty()) {
|
||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
// print cosine similarity matrix
|
||||
if (n_prompts > 1) {
|
||||
fprintf(stdout, "\n");
|
||||
printf("cosine similarity matrix:\n\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
fprintf(stdout, "%6.2f ", sim);
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
fprintf(stdout, "embedding %d: ", j);
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
// print cosine similarity matrix
|
||||
if (n_prompts > 1) {
|
||||
fprintf(stdout, "\n");
|
||||
printf("cosine similarity matrix:\n\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
fprintf(stdout, "%6.6s ", prompts[i].c_str());
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
fprintf(stdout, "%6.2f ", sim);
|
||||
}
|
||||
fprintf(stdout, "%1.10s", prompts[i].c_str());
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
|
||||
const bool notArray = params.embd_out != "array";
|
||||
|
||||
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
|
||||
for (int j = 0;;) { // at least one iteration (one prompt)
|
||||
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||
fprintf(stdout, "[");
|
||||
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
||||
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||
i++;
|
||||
if (i < n_embd) fprintf(stdout, ","); else break;
|
||||
}
|
||||
fprintf(stdout, notArray ? "]\n }" : "]");
|
||||
j++;
|
||||
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
|
||||
}
|
||||
fprintf(stdout, notArray ? "\n ]" : "]\n");
|
||||
|
||||
if (params.embd_out == "json+" && n_prompts > 1) {
|
||||
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
|
||||
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
|
||||
fprintf(stdout, " [");
|
||||
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
|
||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
fprintf(stdout, "%6.2f", sim);
|
||||
j++;
|
||||
if (j < n_prompts) fprintf(stdout, ", "); else break;
|
||||
}
|
||||
fprintf(stdout, " ]");
|
||||
i++;
|
||||
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
|
||||
}
|
||||
fprintf(stdout, "\n ]");
|
||||
}
|
||||
|
||||
if (notArray) fprintf(stdout, "\n}\n");
|
||||
}
|
||||
|
||||
// clean up
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue