From 0e730dd23b0fb5f93dba574e0a48d9a69dc5dbae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Podiv=C3=ADn?= <66251151+jpodivin@users.noreply.github.com> Date: Mon, 29 May 2023 06:45:50 +0200 Subject: [PATCH 1/5] Adding git in container package dependencies (#1621) Git added to build packages for version information in docker image Signed-off-by: Jiri Podivin --- .devops/full.Dockerfile | 2 +- .devops/main.Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 491d67676..01b3111d9 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip + apt-get install -y build-essential python3 python3-pip git COPY requirements.txt requirements.txt diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile index 2e629f8ce..fc34a0c18 100644 --- a/.devops/main.Dockerfile +++ b/.devops/main.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential + apt-get install -y build-essential git WORKDIR /app From 248367605ead6fb7c36d2bfb1ebd8f00a23f7c71 Mon Sep 17 00:00:00 2001 From: DannyDaemonic Date: Mon, 29 May 2023 05:13:40 -0700 Subject: [PATCH 2/5] Work around for recalculating logits in cached prompts (Fixes #1585) (#1609) * Work around for recalculating logits in cached prompts --- examples/main/main.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c7c591537..6131f5b46 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -360,6 +360,12 @@ int main(int argc, char ** argv) { } } if (i > 0) { + // check if we've used up all the prompt but not all cached tokens + if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) { + // force 
re-evaluation of the last token to recalculate logits + i--; + n_past--; + } embd.erase(embd.begin(), embd.begin() + i); } } From 5d1830b99dfd85bb6279adb4dd94aa444afd5b5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 19:30:49 +0300 Subject: [PATCH 3/5] ggml : fix bug in ggml_alibi --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 07ea92a58..b4570eb50 100644 --- a/ggml.c +++ b/ggml.c @@ -6317,7 +6317,7 @@ struct ggml_tensor * ggml_alibi( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_head; From 7552ac586380f202b75b18aa216ecfefbd438d94 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 19:31:44 +0300 Subject: [PATCH 4/5] ggml : sync cgraph import / export API --- ggml.c | 482 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 12 +- 2 files changed, 491 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index b4570eb50..4cd0d7211 100644 --- a/ggml.c +++ b/ggml.c @@ -4030,6 +4030,14 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } +void * ggml_get_mem_buffer(struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_get_mem_size(struct ggml_context * ctx) { + return ctx->mem_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace @@ -4523,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor( return result; } +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if 
(strcmp(cur->name, name) == 0) { + return cur; + } + + obj = obj->next; + } + + return NULL; +} + //////////////////////////////////////////////////////////////////////////////// // ggml_dup @@ -14532,7 +14557,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } -struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -14552,6 +14577,461 @@ struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const return NULL; } +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! 
+ for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const 
uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // 
output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + 
const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; 
++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t 
perf_total_per_op_us[GGML_OP_COUNT] = {0}; diff --git a/ggml.h b/ggml.h index 558138280..60c0ad8bf 100644 --- a/ggml.h +++ b/ggml.h @@ -451,9 +451,12 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, @@ -492,6 +495,8 @@ extern "C" { GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); @@ -978,7 +983,10 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); - GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); + + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); // print info and performance information for the graph GGML_API 
void ggml_graph_print(const struct ggml_cgraph * cgraph); From ffb06a345e3a9e30d39aaa5b46a23201a74be6de Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 30 May 2023 21:24:22 +0300 Subject: [PATCH 5/5] OpenLLaMA 3B support (#1588) This adds support to llama.cpp to load the model. Currently missing are changes that are required from convert.py to convert the model correctly. It needs some changes to start reading the JSON configuration for HF models instead of deriving the values by guessing. Co-authored-by: FNsi <125447286+FNsi@users.noreply.github.com> --- llama.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llama.cpp b/llama.cpp index 5a19316b3..47b4c8dd7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -42,6 +42,7 @@ // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_3B, MODEL_7B, MODEL_13B, MODEL_30B, @@ -58,6 +59,7 @@ static const size_t MB = 1024*1024; static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0() { static std::map<e_model, size_t> k_sizes = { + { MODEL_3B, 128ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, { MODEL_30B, 512ull * MB }, @@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0() static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1() { static std::map<e_model, size_t> k_sizes = { + { MODEL_3B, 128ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, { MODEL_30B, 512ull * MB }, @@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1() static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() { static std::map<e_model, size_t> k_sizes = { + { MODEL_3B, 682ull * MB }, { MODEL_7B, 1026ull * MB }, { MODEL_13B, 1608ull * MB }, { MODEL_30B, 3124ull * MB }, @@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() static const std::map<e_model, size_t> & MEM_REQ_EVAL() { static std::map<e_model, size_t> k_sizes = { + { MODEL_3B, 512ull * MB }, { MODEL_7B, 768ull * MB }, { MODEL_13B, 1024ull * MB }, { MODEL_30B, 1280ull * MB }, @@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { static const char *llama_model_type_name(e_model type) { switch (type) { + case MODEL_3B: return 
"3B"; case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; case MODEL_30B: return "30B"; @@ -932,6 +938,7 @@ static void llama_model_load_internal( { switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_13B; break; case 60: model.type = e_model::MODEL_30B; break;