diff --git a/CMakeLists.txt b/CMakeLists.txt
index dee3534af..0ca43bddc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,6 +355,11 @@ else()
     message(STATUS "Unknown architecture")
 endif()
 
+if (MINGW)
+    # Target Windows 8 (_WIN32_WINNT 0x0602) so the headers declare PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=0x602)
+endif()
+
 #
 # Build libraries
 #
diff --git a/common/log.h b/common/log.h
index c0e814861..e4e1b9f4f 100644
--- a/common/log.h
+++ b/common/log.h
@@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a diffrent function
+//  The log target can also be redirected to a different function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_diffrent()
+//  #define LOG_TARGET log_handler_different()
 //  #include "log.h"
 //
-//  FILE* log_handler_diffrent()
+//  FILE* log_handler_different()
 //  {
 //      return stderr;
 //  }
@@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
 
 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  untill enabled back.
+//  until enabled back.
 #define log_disable() log_disable_impl()
 
 // INTERNAL, DO NOT USE
diff --git a/convert.py b/convert.py
index 6e95d6cb3..a6fc6b8ea 100755
--- a/convert.py
+++ b/convert.py
@@ -585,7 +585,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
 
     if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
         # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
+        # don't split individual tensors between files.
         model: LazyModel = {}
         for mp in models_plus:
             model.update(mp.model)
@@ -678,7 +678,7 @@ class LazyUnpickler(pickle.Unpickler):
         return func(*args)
 
     CLASSES: dict[tuple[str, str], Any] = {
-        # getattr used here as a workaround for mypy not being smart enough to detrmine
+        # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
         ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index fc0656c23..4bb7b93b6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
         temp->ny = longer_side;
         temp->size = 3 * longer_side * longer_side;
         temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
+        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
 
         // fill with background color
         for (size_t i = 0; i < temp->size; i++) {
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index 729aaef8f..03688e0ea 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -51,7 +51,7 @@ def bytes_to_unicode():
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md
index 252a6689e..a69a471b4 100644
--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/lookahead
 
-Demonstartion of lookahead decoding technique:
+Demonstration of lookahead decoding technique:
 
 https://lmsys.org/blog/2023-11-21-lookahead-decoding/
 
diff --git a/examples/server/README.md b/examples/server/README.md
index cfc220f58..0751b9612 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -222,7 +222,7 @@ node index.js
 
     `content`: Set the text to process.
 
-    **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+-   **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
 
     *Options:*
 
diff --git a/examples/server/json.hpp b/examples/server/json.hpp
index 4d1a37ad7..ea945f346 100644
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
@@ -11227,7 +11227,7 @@ class binary_reader
                 }
                 if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
                 {
-                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr));
+                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
                 }
                 std::vector<size_t> dim;
                 if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index b9c442509..c281f0fbd 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -114,7 +114,7 @@ export async function* llama(prompt, params = {}, config = {}) {
   return content;
 }
 
-// Call llama, return an event target that you can subcribe to
+// Call llama, return an event target that you can subscribe to
 //
 // Example:
 //
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 175c52478..451fd4a3b 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -223,7 +223,7 @@
       repeat_last_n: 256, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.18, // 1.0 = disabled
       top_k: 40, // <= 0 to use vocab size
-      top_p: 0.5, // 1.0 = disabled
+      top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
       tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
@@ -238,7 +238,7 @@
       cache_prompt: true
     })
 
-    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+    /* START: Support for storing prompt templates and parameters in the browser's LocalStorage */
 
     const local_storage_storageKey = "llamacpp_server_local_storage";
 
@@ -282,7 +282,7 @@
     let importedTemplates = local_storage_getDataAsObject('user_templates')
 
     if (importedTemplates) {
-      // saved templates were successfuly imported.
+      // saved templates were successfully imported.
 
       console.log('Processing saved templates and updating default template')
       params.value = { ...params.value, image_data: [] };
@@ -303,7 +303,7 @@
     }
 
     function userTemplateResetToDefault() {
-      console.log('Reseting themplate to default')
+      console.log('Resetting template to default')
       selectedUserTemplate.value.name = 'default';
       selectedUserTemplate.value.data = savedUserTemplates.value['default'];
     }
@@ -762,7 +762,7 @@
 
           <fieldset class="two">
             ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
-            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
             ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
             ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
             ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9f464a4ea..b13951432 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2383,6 +2383,7 @@ json oaicompat_completion_params_parse(
     llama_params["__oaicompat"] = true;
 
     // Map OpenAI parameters to llama.cpp parameters
+    llama_params["model"]             = json_value(body, "model", std::string("unknown")); // OpenAI 'model', with a fallback name
     llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
     llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
     llama_params["temperature"]       = json_value(body, "temperature", 0.8);
diff --git a/examples/speculative/README.md b/examples/speculative/README.md
index d88fd3790..814efa592 100644
--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/speculative
 
-Demonstartion of speculative decoding and tree-based speculative decoding techniques
+Demonstration of speculative decoding and tree-based speculative decoding techniques
 
 More info:
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index e4d7f64d8..d2174eb51 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -430,7 +430,7 @@ int main(int argc, char ** argv) {
             ++n_past_tgt;
         }
 
-        // the first token is always proposed by the traget model before the speculation loop so we erase it here
+        // the first token is always proposed by the target model before the speculation loop so we erase it here
         for (int s = 0; s < n_seq_dft; ++s) {
             if (!drafts[s].active) {
                 continue;
diff --git a/ggml-alloc.h b/ggml-alloc.h
index ad87cebc8..64a412468 100644
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
 // ggml-backend v2 API
 //
 
-// Seperate tensor and graph allocator objects
+// Separate tensor and graph allocator objects
 // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
 // The original API is kept as a wrapper around the new API
 
diff --git a/ggml-quants.c b/ggml-quants.c
index 2879ab946..ca3a4980a 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3116,7 +3116,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
 
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
-    // These tempory registers are for masking and shift operations
+    // These temporary registers are for masking and shift operations
     vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
     vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
 
@@ -4759,7 +4759,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
             vl = 16;
 
-            // retreive lane to multiply with scale
+            // retrieve lane to multiply with scale
             vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
             vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
             vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
diff --git a/ggml.c b/ggml.c
index 74cd646d4..c1bb6a448 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
 #include "ggml-impl.h"
@@ -33,7 +33,7 @@
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
 
-// disable POSIX deprecation warnigns
+// disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
 #endif
@@ -1760,7 +1760,7 @@ static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
 // WARN:
-// Mis-confguration can lead to problem that's hard to reason about:
+// Mis-configuration can lead to problem that's hard to reason about:
 // * At best  it crash or talks nosense.
 // * At worst it talks slightly difference but hard to perceive.
 //
@@ -7520,7 +7520,7 @@ static void ggml_compute_forward_acc_f32(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
     // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is implicitely element_size because src0 and dst are contiguous
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1     = ((int32_t *) dst->op_params)[0];
     size_t nb2     = ((int32_t *) dst->op_params)[1];
     size_t nb3     = ((int32_t *) dst->op_params)[2];
@@ -10161,7 +10161,7 @@ static void ggml_compute_forward_set_f32(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
     // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is implicitely element_size because src0 and dst are contiguous
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1     = ((int32_t *) dst->op_params)[0];
     size_t nb2     = ((int32_t *) dst->op_params)[1];
     size_t nb3     = ((int32_t *) dst->op_params)[2];
@@ -14475,7 +14475,7 @@ void ggml_build_backward_gradient_checkpointing(
             // insert new tensors recomputing src, reusing already made replacements,
             // remember replacements: remember new tensors with mapping from corresponding gf nodes
             // recurse for input tensors,
-            // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
+            // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
             node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
         }
         // insert rewritten backward node with replacements made into resulting backward graph gb
diff --git a/ggml.h b/ggml.h
index 84b53946e..47eb5fab8 100644
--- a/ggml.h
+++ b/ggml.h
@@ -215,7 +215,7 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         1024
+#define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            6
 #define GGML_MAX_NAME           64
diff --git a/gguf-py/README.md b/gguf-py/README.md
index 502b6a510..a27d2fc0e 100644
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
 pip install build twine
 ```
 
-Then, folow these steps to release a new version:
+Then, follow these steps to release a new version:
 
 1. Bump the version in `pyproject.toml`.
 2. Build the package:
diff --git a/llama.cpp b/llama.cpp
index 8bce9ef6b..4f880889a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2788,7 +2788,7 @@ static void llm_load_vocab(
         // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
         //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
         //  are special tokens.
-        // From testing, this appears to corelate 1:1 with special tokens.
+        // From testing, this appears to correlate 1:1 with special tokens.
         //
 
         // Counting special tokens and verifying in only one direction
@@ -5876,7 +5876,7 @@ static int llama_decode_internal(
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
-    // helpers for smoother batch API transistion
+    // helpers for smoother batch API transition
     // after deprecating the llama_eval calls, these will be removed
     std::vector<llama_pos> pos;
 
@@ -6876,12 +6876,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
                 // loop over the text
                 while (true) {
-                    // find the first occurence of a given special token in this fragment
+                    // find the first occurrence of a given special token in this fragment
                     //  passing offset argument only limit the "search area" but match coordinates
                     //  are still relative to the source full raw_text
                     auto match = raw_text->find(special_token, raw_text_base_offset);
 
-                    // no occurences found, stop processing this fragment for a given special token
+                    // no occurrences found, stop processing this fragment for a given special token
                     if (match == std::string::npos) break;
 
                     // check if match is within bounds of offset <-> length
@@ -7766,7 +7766,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id    = candidates->data[i].id;
-        const std::string & piece = ctx->model.vocab.id_to_token[id].text;
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7978,7 +7978,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string & piece = ctx->model.vocab.id_to_token[token].text;
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
@@ -8092,7 +8092,7 @@ struct llama_beam_search_data {
     }
 
     // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-    // The repetative patterns below reflect the 2 stages of heaps:
+    // The repetitive patterns below reflect the 2 stages of heaps:
     //  * Gather elements until the vector is full, then call std::make_heap() on it.
     //  * If the heap is full and a new element is found that should be included, pop the
     //    least element to the back(), replace it with the new, then push it into the heap.
diff --git a/llama.h b/llama.h
index 5592166e3..d713821ce 100644
--- a/llama.h
+++ b/llama.h
@@ -216,7 +216,7 @@ extern "C" {
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
     };