diff --git a/examples/common.cpp b/examples/common.cpp index f5d886acf..df69f2736 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -632,6 +632,9 @@ void console_set_color(console_state & con_st, console_color_t color) { case CONSOLE_COLOR_USER_INPUT: fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN); break; + case CONSOLE_COLOR_ERROR: + fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED); + break; } con_st.color = color; fflush(con_st.out); diff --git a/examples/common.h b/examples/common.h index 826e2ae59..6fedb414a 100644 --- a/examples/common.h +++ b/examples/common.h @@ -112,7 +112,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params); enum console_color_t { CONSOLE_COLOR_DEFAULT=0, CONSOLE_COLOR_PROMPT, - CONSOLE_COLOR_USER_INPUT + CONSOLE_COLOR_USER_INPUT, + CONSOLE_COLOR_ERROR }; struct console_state { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index de63faa3e..66d563143 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -81,6 +81,9 @@ int main(int argc, char ** argv) { if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); + } else if (params.n_ctx < 8) { + fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; } fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); @@ -331,6 +334,19 @@ int main(int argc, char ** argv) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (embd.size() > 0) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + auto max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) { + auto skipped_tokens = embd.size() - max_embd_size; + console_set_color(con_st, CONSOLE_COLOR_ERROR); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + fflush(stdout); + embd.resize(max_embd_size); + } + // infinite text generation via context swapping // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a62f26e1e..4f2195f77 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1105,6 +1105,9 @@ void * ggml_cuda_host_malloc(size_t size) { void * ptr = nullptr; cudaError_t err = cudaMallocHost((void **) &ptr, size); if (err != cudaSuccess) { + // The allocation error can be bypassed. A null ptr will assigned out of this function. + // This can fixed the OOM error in WSL. + cudaGetLastError(); fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size/1024.0/1024.0, cudaGetErrorString(err)); return nullptr; diff --git a/k_quants.c b/k_quants.c index 4d524494d..a48c82171 100644 --- a/k_quants.c +++ b/k_quants.c @@ -1519,7 +1519,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8x16_t m4b = vdupq_n_u8(0xf); #ifdef __ARM_FEATURE_DOTPROD - const uint32x4_t mzero = vdupq_n_s32(0); + const int32x4_t mzero = vdupq_n_s32(0); #endif int8x16x2_t q4bytes; @@ -1745,7 +1745,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); - const uint32x4_t mzero = vdupq_n_u32(0); + const int32x4_t mzero = vdupq_n_s32(0); const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mtwo = vdupq_n_u8(2); @@ -2242,5 +2242,3 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri *s = sumf; #endif } - -