mtl : add save/load vocab to ggml file

commit 640a889632
parent 03c2d72867
Author: Georgi Gerganov
Date:   2023-06-02 21:00:30 +03:00

3 changed files with 68 additions and 15 deletions


@@ -24,6 +24,31 @@ int main(int argc, char ** argv) {
     struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
     gf.n_threads = 1;
 
+    {
+        struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab");
+        if (t_vocab == NULL) {
+            fprintf(stderr, "%s: vocab tensor not found\n", __func__);
+            return -1;
+        }
+
+        const char * ptr = (const char *) t_vocab->data;
+
+        int32_t n_vocab = 0;
+        memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab);
+
+        printf("%s: n_vocab = %d\n", __func__, n_vocab);
+
+        for (int i = 0; i < 512; ++i) {
+            char text[32];
+            float score;
+
+            memcpy(text, ptr, sizeof(text)); ptr += sizeof(text);
+            memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score);
+
+            printf("%s: token[%4d] = %16.*s, score = %6.2f\n", __func__, i, (int) sizeof(text), text, score);
+        }
+    }
+
     // allocate work context
     static size_t buf_size = gf.work_size; // TODO
     static void * buf = malloc(buf_size);
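The loader above assumes a fixed layout for the "vocab" leaf: a 32-bit token count followed by n_vocab records of a 32-byte null-padded token string and a 4-byte float score, i.e. 4 + n_vocab*(32 + 4) bytes in total (for a 32000-token LLaMA vocabulary that is 1,152,004 bytes). As a minimal standalone sketch of that format, not part of the commit (the vocab_entry struct and the bounds checks are illustrative additions), parsing the whole table instead of only the first 512 entries could look like this:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // one serialized entry: 32 bytes of null-padded token text + a float score
    struct vocab_entry {
        char  text[32];
        float score;
    };

    // parse the vocab blob stored in the "vocab" leaf tensor;
    // returns an empty vector if the buffer is truncated
    static std::vector<vocab_entry> parse_vocab(const uint8_t * buf, size_t size) {
        std::vector<vocab_entry> vocab;

        int32_t n_vocab = 0;
        if (size < sizeof(n_vocab)) {
            return vocab;
        }
        memcpy(&n_vocab, buf, sizeof(n_vocab));

        if (n_vocab < 0) {
            return vocab;
        }

        const size_t need = sizeof(n_vocab) + (size_t) n_vocab*(32 + sizeof(float));
        if (size < need) {
            return vocab;
        }

        const uint8_t * ptr = buf + sizeof(n_vocab);

        for (int32_t i = 0; i < n_vocab; ++i) {
            vocab_entry e;
            memcpy(e.text,   ptr, sizeof(e.text));  ptr += sizeof(e.text);
            memcpy(&e.score, ptr, sizeof(e.score)); ptr += sizeof(e.score);
            vocab.push_back(e);
        }

        return vocab;
    }

Note that the writer side (the llama_eval_internal hunk below) goes through snprintf into a 32-byte buffer, so tokens are stored null-terminated and anything longer than 31 bytes is silently truncated; this is also why the reader can print them with a bounded %.*s.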


@@ -108,20 +108,6 @@ struct ggml_mtl_context * llama_mtl_init(
             exit(1);
         }
     }
-#elif 0
-    // this does not work !?!?!
-    // load library from "mtl.metallib"
-    {
-        NSError * error = nil;
-
-        NSString * path = [[NSBundle mainBundle] pathForResource:@"./mtl" ofType:@"metallib"];
-
-        ctx->library = [ctx->device newLibraryWithFile:path error:&error];
-        if (error) {
-            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
-        }
-    }
 #else
     // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource
     {


@@ -1505,8 +1505,50 @@ static bool llama_eval_internal(
     //}
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
+        {
+            char tmp[32]; // max token length
+
+            // store null-terminated string for simplicity
+            std::vector<uint8_t> buf_vocab(sizeof(int32_t) + n_vocab*(32 + sizeof(float)));
+
+            uint64_t offs = 0;
+
+            {
+                const int32_t n = n_vocab;
+                memcpy(&buf_vocab[offs], &n, sizeof(n)); offs += sizeof(n);
+            }
+
+            for (int i = 0; i < n_vocab; i++) {
+                const int32_t id = i;
+
+                const float score = lctx.vocab.id_to_token[id].score;
+                const std::string text = lctx.vocab.id_to_token[id].tok;
+
+                snprintf(tmp, sizeof(tmp), "%s", text.c_str());
+
+                memcpy(&buf_vocab[offs], tmp, 32); offs += 32;
+                memcpy(&buf_vocab[offs], &score, sizeof(score)); offs += sizeof(score);
+            }
+
+            struct ggml_init_params params;
+            params.mem_size   = ggml_tensor_overhead();
+            params.mem_buffer = NULL;
+            params.no_alloc   = true;
+
+            ggml_context * ctx_vocab = ggml_init(params);
+
+            struct ggml_tensor * t_vocab = ggml_new_tensor_1d(ctx_vocab, GGML_TYPE_I8, buf_vocab.size());
+            t_vocab->data = buf_vocab.data();
+            ggml_set_name(t_vocab, "vocab");
+
+            gf.leafs[gf.n_leafs++] = t_vocab;
+
+            ggml_graph_export(&gf, cgraph_fname);
+
+            ggml_free(ctx_vocab);
+        }
 
         float * logits = (float *) ggml_get_data(inpL);
 
         printf("logits: ");