diff --git a/examples/common.cpp b/examples/common.cpp
index f5d886acf..df69f2736 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -632,6 +632,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
             case CONSOLE_COLOR_USER_INPUT:
                 fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
                 break;
+            case CONSOLE_COLOR_ERROR:
+                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
+                break;
         }
         con_st.color = color;
         fflush(con_st.out);
diff --git a/examples/common.h b/examples/common.h
index 826e2ae59..6fedb414a 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -112,7 +112,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
 enum console_color_t {
     CONSOLE_COLOR_DEFAULT=0,
     CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
+    CONSOLE_COLOR_USER_INPUT,
+    CONSOLE_COLOR_ERROR
 };
 
 struct console_state {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index de63faa3e..66d563143 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -81,6 +81,9 @@ int main(int argc, char ** argv) {
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
     }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -331,6 +334,19 @@ int main(int argc, char ** argv) {
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (embd.size() > 0) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            auto max_embd_size = n_ctx - 4;
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int)embd.size() > max_embd_size) {
+                auto skipped_tokens = embd.size() - max_embd_size;
+                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                fflush(stdout);
+                embd.resize(max_embd_size);
+            }
+
             // infinite text generation via context swapping
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index a62f26e1e..4f2195f77 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1105,6 +1105,9 @@ void * ggml_cuda_host_malloc(size_t size) {
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
+        // The allocation error can be bypassed. A null ptr will assigned out of this function.
+        // This can fixed the OOM error in WSL.
+        cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
diff --git a/k_quants.c b/k_quants.c
index 4d524494d..a48c82171 100644
--- a/k_quants.c
+++ b/k_quants.c
@@ -1519,7 +1519,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
 #ifdef __ARM_FEATURE_DOTPROD
-    const uint32x4_t mzero = vdupq_n_s32(0);
+    const int32x4_t mzero = vdupq_n_s32(0);
 #endif
 
     int8x16x2_t q4bytes;
@@ -1745,7 +1745,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint32x4_t mzero = vdupq_n_u32(0);
+    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
 
@@ -2242,5 +2242,3 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     *s = sumf;
 #endif
 }
-
-