diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py
index 2a4cb65bc..6a977f031 100755
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -87,7 +87,21 @@ class SchemaConverter:
         elif schema_type == 'array' and 'items' in schema:
             # TODO `prefixItems` keyword
             item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
+            list_item_operator = f'("," space {item_rule_name})'
+            successive_items = ""
+            min_items = schema.get("minItems", 0)
+            if min_items > 0:
+                first_item = f"({item_rule_name})"
+                successive_items = list_item_operator * (min_items - 1)
+                min_items -= 1
+            else:
+                first_item = f"({item_rule_name})?"
+            max_items = schema.get("maxItems")
+            if max_items is not None and max_items > min_items:
+                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
+            else:
+                successive_items += list_item_operator + "*"
+            rule = f'"[" space {first_item} {successive_items} "]" space'
             return self._add_rule(rule_name, rule)
 
         else:
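
Note: as a worked illustration of the minItems/maxItems handling added above, the sketch below lifts the new logic into a standalone function. The `array_rule` helper and the sample schema are hypothetical, for illustration only; the patch builds the same string inline in SchemaConverter.visit.

    # Standalone sketch of the array-rule construction added in the hunk above.
    def array_rule(item_rule_name: str, schema: dict) -> str:
        list_item_operator = f'("," space {item_rule_name})'
        successive_items = ""
        min_items = schema.get("minItems", 0)
        if min_items > 0:
            first_item = f"({item_rule_name})"   # at least one item is required
            successive_items = list_item_operator * (min_items - 1)
            min_items -= 1
        else:
            first_item = f"({item_rule_name})?"  # the array may be empty
        max_items = schema.get("maxItems")
        if max_items is not None and max_items > min_items:
            # each slot beyond minItems is optional
            successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
        else:
            # no upper bound: any number of further items
            successive_items += list_item_operator + "*"
        return f'"[" space {first_item} {successive_items} "]" space'

    # {"minItems": 2, "maxItems": 3} with an item rule named `item` produces:
    #   "[" space (item) ("," space item)("," space item)? "]" space
    print(array_rule("item", {"minItems": 2, "maxItems": 3}))
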
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 98d512f67..ef9e4ba7a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -616,9 +616,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             KQ = ggml_soft_max_inplace(ctx0, KQ);
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
             KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-            KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));
+            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
-            cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size));
+            cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
         }
 
         // attention output
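
Note: the clip.cpp hunk above removes one tensor materialization. ggml_permute only creates a view, so the old code copied the permuted KQV twice: once in ggml_cont and once in ggml_cpy into a freshly created 3d tensor. ggml_cont_3d performs a single contiguous copy directly into the target 3d shape. A minimal restatement, assuming the same ctx0/KQV/dimension variables as the surrounding code:

    // before: two copies (materialize the permutation, then copy into a new 3d tensor)
    //   KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));
    //   cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size));
    // after: the permute stays a lazy view; one fused contiguous copy into 3d
    KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
    cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
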
diff --git a/ggml-alloc.c b/ggml-alloc.c
index d4123564f..e675306c8 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -377,6 +377,9 @@ struct ggml_gallocr {
 
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }
 
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         for (int i = 0; i < GGML_MAX_SRC; i++) {
             struct ggml_tensor * parent = node->src[i];
             if (parent == NULL) {
-                break;
+                continue;
            }
 
             // if the node's data is external, then we cannot re-use it
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
 
-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                continue;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-    // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
-            }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
         }
-    }
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
+        }
+    }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }
 
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             ggml_gallocr_allocate_node(galloc, parent, buffer_id);
         }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             AT_PRINTF("%s", parent->name);
             if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
             p_hn->n_children -= 1;
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
 
     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }
 
+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -827,6 +850,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }
 
     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -835,9 +859,15 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }
 
     return true;
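
Note: taken together, the ggml-alloc.c changes make the graph allocator track graph leafs the way it tracks nodes, so every leaf receives an address after ggml_gallocr_alloc_graph, even a leaf that no node consumes. A hypothetical usage sketch under that assumption (the tensor names and sizes are illustrative, not from the patch):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // build a graph with no_alloc so that ggml_gallocr places the tensor data
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024, // metadata scratch, generous for a sketch
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * spare = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16); // no consumer

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_add(ctx, a, b));
    ggml_build_forward_expand(gf, spare); // records `spare` as a leaf with no children

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_reserve(galloc, gf);     // records offsets for nodes and leafs
    ggml_gallocr_alloc_graph(galloc, gf); // a, b and `spare` all have data now
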
diff --git a/ggml-metal.m b/ggml-metal.m
index f3c1fff8f..956e323a0 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -277,6 +277,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             return NULL;
         }
     } else {
+#if GGML_METAL_EMBED_LIBRARY
+        GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+        extern const char ggml_metallib_start[];
+        extern const char ggml_metallib_end[];
+
+        NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+#else
         GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
         NSString * sourcePath;
@@ -299,6 +307,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
+#endif
 
         @autoreleasepool {
             // dictionary of preprocessor macros
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 7a23ab162..733d8f95b 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-5070f078a67c18c11736e78316ab715ca9afde16
+818eeb8a3be99125746a90ec63af8f51516a2ec6
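
Note: with GGML_METAL_EMBED_LIBRARY set, the ggml-metal.m hunk above loads the Metal shader source from a byte range compiled into the binary instead of reading default.metallib or the .metal file from disk. The build system is expected to define the bracketing symbols; a common way to do that (an assumption, not shown in this patch) is an assembler .incbin stanza around the shader file, after which the symbol pair behaves like an ordinary array:

    /* declared by the patch inside ggml_metal_init: */
    extern const char ggml_metallib_start[];
    extern const char ggml_metallib_end[];

    /* length of the embedded source, as used for the NSString above */
    static size_t ggml_metallib_size(void) { // hypothetical helper
        return (size_t)(ggml_metallib_end - ggml_metallib_start);
    }
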