diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 23600eea9..a861daa53 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -46,17 +46,17 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
 GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(const ggml_backend_buffer_type_t * bufts, int n_bufs);
 GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 
 // pre-allocate buffers from a measure graph - does not allocate or modify the graph
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, const struct ggml_cgraph * graph);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
+    const struct ggml_cgraph * graph,
     const int * node_buffer_ids,
     const int * leaf_buffer_ids);
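
Note: the ggml-alloc.h changes are pure const-tightening; as the comment above says, `ggml_gallocr_reserve` only measures the graph. A minimal sketch of the reserve-then-allocate pattern the comment describes (the CPU buffer type is just an example; `build_worst_case_graph` and `build_graph` are hypothetical application helpers):

    // Measure once against a worst-case graph, then allocate per-iteration
    // graphs without triggering buffer reallocation.
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    struct ggml_cgraph * worst = build_worst_case_graph(ctx); // hypothetical helper
    if (!ggml_gallocr_reserve(galloc, worst)) {               // graph is only read, hence const
        fprintf(stderr, "buffer allocation failed\n");
    }

    struct ggml_cgraph * gf = build_graph(ctx);               // hypothetical per-iteration graph
    ggml_gallocr_alloc_graph(galloc, gf);                     // fits within the reserved buffers
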
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 5bd8d9c8b..7e8c4c2c2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -698,7 +698,7 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
+    GGML_API bool ggml_get_no_alloc(const struct ggml_context * ctx);
     GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@@ -745,7 +745,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor(const struct ggml_context * ctx, const char * name);
 
     // Converts a flat index into coordinates
     GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
@@ -763,7 +763,7 @@ extern "C" {
     // Tensor flags
     GGML_API void ggml_set_input(struct ggml_tensor * tensor);
     GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
 
     //
@@ -927,13 +927,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_repeat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            struct ggml_tensor * b);
+            const struct ggml_tensor * b);
 
     // sums repetitions in a into shape of b
     GGML_API struct ggml_tensor * ggml_repeat_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            struct ggml_tensor * b);
+            const struct ggml_tensor * b);
 
     // concat a and b along dim
     // used in stable-diffusion
@@ -1243,7 +1243,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_reshape(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            struct ggml_tensor * b);
+            const struct ggml_tensor * b);
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
@@ -1335,7 +1335,7 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a,  // gradients of ggml_get_rows result
             struct ggml_tensor * b,  // row indices
-            struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
+            const struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
 
     GGML_API struct ggml_tensor * ggml_diag(
             struct ggml_context * ctx,
@@ -1563,7 +1563,7 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a,  // convolution kernel
            struct ggml_tensor * b,  // gradient of im2col output
-            int64_t * ne,            // shape of im2col input
+            const int64_t * ne,      // shape of im2col input
             int s0,                  // stride dimension 0
             int s1,                  // stride dimension 1
             int p0,                  // padding dimension 0
@@ -2062,15 +2062,16 @@ extern "C" {
 
     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
+    GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, const struct ggml_cgraph * cgraph);
+    GGML_API void ggml_graph_cpy (const struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void ggml_graph_reset (
+        const struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
     GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
 
-    GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
-    GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
-    GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+    GGML_API int ggml_graph_size (const struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor * ggml_graph_node (const struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes (const struct ggml_cgraph * cgraph);
+    GGML_API int ggml_graph_n_nodes(const struct ggml_cgraph * cgraph);
 
     GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
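
Note: the ggml.h hunks are the public face of the cleanup: read-only accessors such as `ggml_graph_size`, `ggml_graph_node`, and `ggml_graph_n_nodes` now accept a graph they cannot modify. A small sketch of walking a graph through these accessors, including the negative indexing documented above (assumes `gf` was built elsewhere; only valid after this patch, since the parameter is now const):

    #include <cstdio>
    #include "ggml.h"

    static void dump_graph(const struct ggml_cgraph * gf) {
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            struct ggml_tensor * t = ggml_graph_node(gf, i);
            printf("%3d: %s (%s)\n", i, t->name, ggml_op_name(t->op));
        }
        // i < 0 indexes from the back: -1 is nodes[n_nodes - 1]
        struct ggml_tensor * last = ggml_graph_node(gf, -1);
        printf("last: %s\n", last->name);
    }
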
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 9a3bf9f29..c32645ad8 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -377,7 +377,7 @@ struct ggml_gallocr {
     int n_leafs;
 };
 
-ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+ggml_gallocr_t ggml_gallocr_new_n(const ggml_backend_buffer_type_t * bufts, int n_bufs) {
     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
@@ -563,7 +563,7 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, const struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     ggml_hash_set_reset(&galloc->hash_set);
     memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
@@ -670,7 +670,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, const struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -780,11 +780,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     return true;
 }
 
-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+bool ggml_gallocr_reserve(ggml_gallocr_t galloc, const struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, const struct tensor_alloc * tensor_alloc) {
     int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
@@ -813,7 +813,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, const struct tensor_alloc * talloc) {
     size_t node_size = 0;
     if (!node->data && !node->view_src) {
         GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
@@ -822,7 +822,7 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
     return talloc->size_max >= node_size;
 }
 
-static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, const struct ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
@@ -933,8 +933,8 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
 
 // utils
 
-static bool alloc_tensor_range(struct ggml_context * ctx,
-        struct ggml_tensor * first, struct ggml_tensor * last,
+static bool alloc_tensor_range(const struct ggml_context * ctx,
+        struct ggml_tensor * first, const struct ggml_tensor * last,
         ggml_backend_buffer_type_t buft, size_t size,
         ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
 
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
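
For context on the functions made const here: the sizing in `ggml_gallocr_reserve_n` is plain arithmetic over a graph it never writes to. A worked example of the 25% collision margin visible in the hunk above (the numbers are illustrative):

    // For a graph with 1024 nodes and 256 leafs:
    size_t min_hash_size = 1024 + 256;  // one slot per node and leaf -> 1280
    min_hash_size += min_hash_size / 4; // +25% collision margin      -> 1600
    // ggml_hash_size() then rounds this up to the table size actually used.
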
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 955ed505f..348d7c21f 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -124,7 +124,7 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
 using dl_handle = void;
 
 struct dl_handle_deleter {
-    void operator()(void * handle) {
+    void operator()(void * handle) const {
         dlclose(handle);
     }
 };
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index eab017889..81c4b0a3e 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -188,13 +188,13 @@ struct ggml_hash_set {
 };
 
 struct ggml_hash_set ggml_hash_set_new(size_t size);
-void ggml_hash_set_free(struct ggml_hash_set * hash_set);
+void ggml_hash_set_free(const struct ggml_hash_set * hash_set);
 
 // returns the minimum size for a hash set that can hold min_sz elements
 size_t ggml_hash_size(size_t min_sz);
 
 // remove all elements from the hash set
-void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
+void ggml_hash_set_reset(const struct ggml_hash_set * hash_set);
 
 // returns true if key is in the hash set
 static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
@@ -302,7 +302,7 @@ struct ggml_cgraph {
 // returns a slice of cgraph with nodes [i0, i1)
 // the slice does not have leafs or gradients
 // if you need the gradients, get them from the original graph
-struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+struct ggml_cgraph ggml_graph_view(const struct ggml_cgraph * cgraph, int i0, int i1);
 
 // Memory allocation
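
Note: the deleter change is standard functor hygiene: a stateless deleter never mutates itself, so `operator()` can be const, which also lets it be invoked through a const reference. A self-contained sketch of why that matters (dlclose is stubbed out so the snippet compiles without <dlfcn.h>):

    #include <memory>

    using dl_handle = void;

    struct dl_handle_deleter {
        void operator()(void * handle) const {
            // dlclose(handle); // the real deleter closes the shared library
            (void) handle;
        }
    };

    // as in ggml-backend-reg.cpp
    using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;

    void close_early(const dl_handle_ptr & p, void * raw) {
        // get_deleter() on a const unique_ptr returns a const deleter;
        // this call compiles only because operator() is const-qualified.
        p.get_deleter()(raw);
    }
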
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3b4861542..ea9cd04d4 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1478,7 +1478,7 @@ size_t ggml_used_mem(const struct ggml_context * ctx) {
     return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }
 
-bool ggml_get_no_alloc(struct ggml_context * ctx) {
+bool ggml_get_no_alloc(const struct ggml_context * ctx) {
     return ctx->no_alloc;
 }
 
@@ -1789,7 +1789,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc
     return NULL;
 }
 
-struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+struct ggml_tensor * ggml_get_tensor(const struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
     char * const mem_buffer = ctx->mem_buffer;
@@ -1952,7 +1952,7 @@ static struct ggml_tensor * ggml_acc_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    const int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ACC;
@@ -2292,8 +2292,8 @@ struct ggml_tensor * ggml_count_equal(
 
 struct ggml_tensor * ggml_repeat(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor * a,
+        const struct ggml_tensor * b) {
     GGML_ASSERT(ggml_can_repeat(a, b));
 
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
@@ -2308,8 +2308,8 @@ struct ggml_tensor * ggml_repeat_back(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor * a,
+        const struct ggml_tensor * b) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
@@ -2836,7 +2836,7 @@ static struct ggml_tensor * ggml_set_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     GGML_ASSERT(offset < (size_t)(1 << 30));
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    const int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SET;
@@ -3014,7 +3014,7 @@ struct ggml_tensor * ggml_cont_4d(
 struct ggml_tensor * ggml_reshape(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        const struct ggml_tensor * b) {
     GGML_ASSERT(ggml_is_contiguous(a));
     // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
@@ -3247,7 +3247,7 @@ struct ggml_tensor * ggml_permute(
     result->op = GGML_OP_PERMUTE;
     result->src[0] = a;
 
-    int32_t params[] = { axis0, axis1, axis2, axis3 };
+    const int32_t params[] = { axis0, axis1, axis2, axis3 };
     ggml_set_op_params(result, params, sizeof(params));
 
     return result;
@@ -3302,8 +3302,8 @@ struct ggml_tensor * ggml_get_rows(
 struct ggml_tensor * ggml_get_rows_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        struct ggml_tensor * c) {
+        struct ggml_tensor * b,
+        const struct ggml_tensor * c) {
     GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
     GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
 
@@ -3343,7 +3343,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
         bool inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past };
+    const int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_INF;
@@ -3375,7 +3375,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
         bool inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past };
+    const int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_ZERO;
@@ -3423,7 +3423,7 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    float params[] = { scale, max_bias };
+    const float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SOFT_MAX;
@@ -3758,7 +3758,7 @@ struct ggml_tensor * ggml_clamp(
     // TODO: when implement backward, fix this:
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
-    float params[] = { min, max };
+    const float params[] = { min, max };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CLAMP;
@@ -3809,7 +3809,7 @@ struct ggml_tensor * ggml_im2col(
     };
 
     struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    const int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_IM2COL;
@@ -3822,8 +3822,8 @@ struct ggml_tensor * ggml_im2col(
 struct ggml_tensor * ggml_im2col_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        int64_t * ne,
+        struct ggml_tensor * b,
+        const int64_t * ne,
         int s0,
         int s1,
         int p0,
@@ -3832,7 +3832,7 @@ struct ggml_tensor * ggml_im2col_back(
         int d1,
         bool is_2D) {
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    const int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_IM2COL_BACK;
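
Note: every `const int32_t params[]` / `const float params[]` change in this file follows one pattern: the array is a local staging buffer that `ggml_set_op_params` copies into the tensor's fixed-size `op_params` field, so the local can safely be const. A sketch of both halves of that contract (helper names as in ggml-impl.h):

    // Writer side, inside an op constructor: stage locally, then copy in.
    const int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
    ggml_set_op_params(result, params, sizeof(params)); // memcpy into result->op_params

    // Reader side, e.g. in a backend kernel: fetch parameters back by index.
    const int32_t s0_read = ggml_get_op_params_i32(result, 0);
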
@@ -3932,7 +3932,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
     };
 
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    int32_t params[] = { s0, p0, d0 };
+    const int32_t params[] = { s0, p0, d0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_TRANSPOSE_1D;
@@ -4067,7 +4067,7 @@ struct ggml_tensor * ggml_pool_1d(
     };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    int32_t params[] = { op, k0, s0, p0 };
+    const int32_t params[] = { op, k0, s0, p0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_1D;
@@ -4097,7 +4097,7 @@ struct ggml_tensor * ggml_pool_2d(
     };
     result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
+    const int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
@@ -4116,11 +4116,10 @@ struct ggml_tensor * ggml_pool_2d_back(
         int s0,
         int s1,
         float p0,
-        float p1) {
-    struct ggml_tensor * result;
-    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
+        float p1) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
 
-    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
+    const int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D_BACK;
@@ -4206,13 +4205,9 @@ struct ggml_tensor * ggml_pad_reflect_1d(
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(a->type == GGML_TYPE_F32);
 
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] + p0 + p1,
-            a->ne[1],
-            a->ne[2],
-            a->ne[3]);
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0] + p0 + p1, a->ne[1], a->ne[2], a->ne[3]);
 
-    int32_t params[] = { p0, p1 };
+    const int32_t params[] = { p0, p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_PAD_REFLECT_1D;
@@ -4332,7 +4327,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    float params[] = { scale, max_bias, logit_softcap };
+    const float params[] = { scale, max_bias, logit_softcap };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -4531,10 +4526,13 @@ struct ggml_tensor * ggml_win_part(
     const int npy = (py + a->ne[2])/w;
     const int np = npx*npy;
 
-    const int64_t ne[4] = { a->ne[0], w, w, np, };
+    const int64_t ne[4] = { a->ne[0], w,
+        w,
+        np,
+    };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    int32_t params[] = { npx, npy, w };
+    const int32_t params[] = { npx, npy, w };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_PART;
@@ -4556,7 +4554,7 @@ struct ggml_tensor * ggml_win_unpart(
     const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
-    int32_t params[] = { w };
+    const int32_t params[] = { w };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_UNPART;
@@ -5141,11 +5139,11 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
     return result;
 }
 
-void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
+void ggml_hash_set_reset(const struct ggml_hash_set * hash_set) {
     memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
 }
 
-void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
+void ggml_hash_set_free(const struct ggml_hash_set * hash_set) {
     GGML_FREE(hash_set->used);
     GGML_FREE(hash_set->keys);
 }
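
Note: `ggml_hash_set_reset` and `ggml_hash_set_free` can take a pointer-to-const even though they overwrite or release the table's storage: const applies to the struct's members, i.e. the pointer values themselves, not to the memory they point at. A compilable illustration with a simplified stand-in struct:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct hash_set_like { // simplified stand-in for struct ggml_hash_set
        size_t    size;
        uint8_t * used; // bitset storage
    };

    void reset_like(const hash_set_like * hs) {
        // hs->used = nullptr;          // error: members are const through hs
        memset(hs->used, 0, hs->size);  // fine: the pointee is not const
    }
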
@@ -5966,7 +5964,7 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
 }
 
-struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+struct ggml_cgraph ggml_graph_view(const struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
         /*.size    =*/ 0,
         /*.n_nodes =*/ i1 - i0,
@@ -5982,7 +5980,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
     return cgraph;
 }
 
-void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
+void ggml_graph_cpy(const struct ggml_cgraph * src, struct ggml_cgraph * dst) {
     GGML_ASSERT(dst->size >= src->n_leafs);
     GGML_ASSERT(dst->size >= src->n_nodes);
     GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
@@ -6028,7 +6026,7 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
     }
 }
 
-struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, const struct ggml_cgraph * cgraph) {
     struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
     ggml_graph_cpy(cgraph, result);
     return result;
@@ -6047,7 +6045,7 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
     return tensor;
 }
 
-void ggml_graph_reset(struct ggml_cgraph * cgraph) {
+void ggml_graph_reset(const struct ggml_cgraph * cgraph) {
     GGML_ASSERT(cgraph->grads != NULL);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6086,11 +6084,11 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     ggml_hash_set_reset(&cgraph->visited_hash_set);
 }
 
-int ggml_graph_size(struct ggml_cgraph * cgraph) {
+int ggml_graph_size(const struct ggml_cgraph * cgraph) {
     return cgraph->size;
 }
 
-struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+struct ggml_tensor * ggml_graph_node(const struct ggml_cgraph * cgraph, int i) {
     if (i < 0) {
         GGML_ASSERT(cgraph->n_nodes + i >= 0);
         return cgraph->nodes[cgraph->n_nodes + i];
@@ -6100,11 +6098,11 @@ struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
     return cgraph->nodes[i];
 }
 
-struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+struct ggml_tensor ** ggml_graph_nodes(const struct ggml_cgraph * cgraph) {
     return cgraph->nodes;
 }
 
-int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+int ggml_graph_n_nodes(const struct ggml_cgraph * cgraph) {
     return cgraph->n_nodes;
 }
 
@@ -6357,7 +6355,7 @@ void ggml_set_output(struct ggml_tensor * tensor) {
     tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
 }
 
-void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+void ggml_set_param(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
     GGML_UNUSED(ctx); // TODO: remove this parameter
     tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 }
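
Note: `ggml_graph_reset` becomes const for the same reason: it rewrites tensor data (gradients, momenta), not the node lists the pointer guards. Per the header comment it zeroes regular grads and optimizer momenta and sets the loss grad to 1, i.e. it is the "zero the gradients" step of a training loop. A hypothetical loop shape (`build_loss_graph` and `optimizer_step` are illustrative application code, not part of this patch):

    struct ggml_cgraph * gb = build_loss_graph(ctx); // forward + backward, grads enabled

    for (int step = 0; step < n_steps; ++step) {
        ggml_graph_reset(gb);                    // grads/momenta := 0, loss grad := 1
        ggml_backend_graph_compute(backend, gb); // run forward + backward
        optimizer_step(gb);                      // consume the accumulated grads
    }
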
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab13669c5..1afd4b73b 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -1144,6 +1144,7 @@ struct gguf_writer {
 
     template <typename T>
     void write(const T & val) const {
+        buf.reserve(buf.size() + sizeof(val));
         for (size_t i = 0; i < sizeof(val); ++i) {
             buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
         }
@@ -1163,6 +1164,7 @@ struct gguf_writer {
             const uint64_t n = val.length();
             write(n);
         }
+        buf.reserve(buf.size() + val.length());
         for (size_t i = 0; i < val.length(); ++i) {
             buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
         }
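
Note: the two `reserve` calls are a micro-optimization for the byte-append loops: request capacity once so `push_back` cannot reallocate mid-copy. Because the writer appends to a buffer that already holds earlier output, the request must be relative to the current size (`buf.size() + n`); reserving just `n` would be a no-op once the buffer has grown past that. A standalone sketch of the pattern:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Append the raw bytes of val to buf, reserving up front so the
    // push_back loop cannot trigger a reallocation mid-append.
    template <typename T>
    void append_bytes(std::vector<int8_t> & buf, const T & val) {
        buf.reserve(buf.size() + sizeof(val));
        const int8_t * p = reinterpret_cast<const int8_t *>(&val);
        for (size_t i = 0; i < sizeof(val); ++i) {
            buf.push_back(p[i]);
        }
    }
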