From 3f1223155a462477ac933474ebc4eab0ce3ca264 Mon Sep 17 00:00:00 2001 From: Artyom Lebedev Date: Sat, 10 Jun 2023 22:51:36 +0300 Subject: [PATCH 1/5] k-quants : GCC12 compilation fix (#1792) --- k_quants.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/k_quants.c b/k_quants.c index 4d524494d..a48c82171 100644 --- a/k_quants.c +++ b/k_quants.c @@ -1519,7 +1519,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8x16_t m4b = vdupq_n_u8(0xf); #ifdef __ARM_FEATURE_DOTPROD - const uint32x4_t mzero = vdupq_n_s32(0); + const int32x4_t mzero = vdupq_n_s32(0); #endif int8x16x2_t q4bytes; @@ -1745,7 +1745,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); - const uint32x4_t mzero = vdupq_n_u32(0); + const int32x4_t mzero = vdupq_n_s32(0); const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mtwo = vdupq_n_u8(2); @@ -2242,5 +2242,3 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri *s = sumf; #endif } - - From 4de0334f5cabf4696eced2e5d6e279fdfaa6c0f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 10 Jun 2023 22:56:53 +0300 Subject: [PATCH 2/5] cmake : fix Metal build (close #1791) --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 84e2a88cb..19cd42dd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,9 +218,6 @@ if (LLAMA_METAL) # copy ggml-metal.metal to bin directory configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) - if (LLAMA_METAL) - set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") - endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} From 31d2b5f4a4bae081e59b36ab37c6ff6f5b5940ad Mon Sep 17 00:00:00 2001 From: Ryan Landay Date: Sun, 11 Jun 2023 17:38:53 +0800 Subject: [PATCH 3/5] Update SHA256SUMS with current hashes for models quantized using q4_0 (#1798) --- SHA256SUMS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index 593c8efaa..ca4d5a4a5 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,6 +1,6 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin +ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin @@ -8,7 +8,7 @@ ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin +fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin @@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin +d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin @@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin +cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin From 12b063f0ecf280e98028e444fc492ee6222cdcdc Mon Sep 17 00:00:00 2001 From: Kyle Liang Date: Sun, 11 Jun 2023 21:20:52 +0800 Subject: [PATCH 4/5] Fixed WSL cuda's OOM error (#1594) * In the function , add the cuda error bypass. * remove excessive codes and prints --------- Co-authored-by: liang --- ggml-cuda.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a62f26e1e..4f2195f77 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1105,6 +1105,9 @@ void * ggml_cuda_host_malloc(size_t size) { void * ptr = nullptr; cudaError_t err = cudaMallocHost((void **) &ptr, size); if (err != cudaSuccess) { + // The allocation error can be bypassed. A null ptr will assigned out of this function. + // This can fixed the OOM error in WSL. + cudaGetLastError(); fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size/1024.0/1024.0, cudaGetErrorString(err)); return nullptr; From fa84c4b3e80199a5683438f062009c031a06c4fa Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sun, 11 Jun 2023 08:19:17 -0600 Subject: [PATCH 5/5] Fix issue where interactive mode crashes when input exceeds ctx size (#1789) * Fix issue where interactive mode in the main example crashes when input exceeds ctx size * Ensure the context size is at least 8 tokens in the main example. Closes #1768 --- examples/common.cpp | 3 +++ examples/common.h | 3 ++- examples/main/main.cpp | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/examples/common.cpp b/examples/common.cpp index f5d886acf..df69f2736 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -632,6 +632,9 @@ void console_set_color(console_state & con_st, console_color_t color) { case CONSOLE_COLOR_USER_INPUT: fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN); break; + case CONSOLE_COLOR_ERROR: + fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED); + break; } con_st.color = color; fflush(con_st.out); diff --git a/examples/common.h b/examples/common.h index 826e2ae59..6fedb414a 100644 --- a/examples/common.h +++ b/examples/common.h @@ -112,7 +112,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params); enum console_color_t { CONSOLE_COLOR_DEFAULT=0, CONSOLE_COLOR_PROMPT, - CONSOLE_COLOR_USER_INPUT + CONSOLE_COLOR_USER_INPUT, + CONSOLE_COLOR_ERROR }; struct console_state { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index de63faa3e..66d563143 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -81,6 +81,9 @@ int main(int argc, char ** argv) { if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); + } else if (params.n_ctx < 8) { + fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; } fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); @@ -331,6 +334,19 @@ int main(int argc, char ** argv) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (embd.size() > 0) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + auto max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) { + auto skipped_tokens = embd.size() - max_embd_size; + console_set_color(con_st, CONSOLE_COLOR_ERROR); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console_set_color(con_st, CONSOLE_COLOR_DEFAULT); + fflush(stdout); + embd.resize(max_embd_size); + } + // infinite text generation via context swapping // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past)