diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41f5bb737..19cd42dd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -432,6 +432,9 @@ target_link_libraries(llama PRIVATE
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+ if (LLAMA_METAL)
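+ # ggml-metal.metal is loaded at runtime; marking it as a RESOURCE lets framework/bundle builds copy it alongside the library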
+ set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+ endif()
endif()
if (GGML_SOURCES_CUDA)
diff --git a/Makefile b/Makefile
index 71b1baecf..e68875e6b 100644
--- a/Makefile
+++ b/Makefile
@@ -107,6 +107,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
# Usage AVX-only
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
+
+ # Usage SSSE3-only (note: SSSE3, not SSE3!)
+ #CFLAGS += -mssse3
+ #CXXFLAGS += -mssse3
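+ # (for x86-64 CPUs that lack AVX but do support SSSE3, e.g. Intel Core 2 and early Atom)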
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
diff --git a/README.md b/README.md
index 0c87af6ee..cc3bd5394 100644
--- a/README.md
+++ b/README.md
@@ -308,7 +308,7 @@ Building the program with BLAS support may lead to some performance improvements
- #### BLIS
- Check [BLIS.md](BLIS.md) for more information.
+ Check [BLIS.md](docs/BLIS.md) for more information.
- #### Intel MKL
diff --git a/SHA256SUMS b/SHA256SUMS
index 593c8efaa..ca4d5a4a5 100644
--- a/SHA256SUMS
+++ b/SHA256SUMS
@@ -1,6 +1,6 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin
+ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
@@ -8,7 +8,7 @@ ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin
+fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
@@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin
+d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
@@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin
+cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
diff --git a/examples/common.cpp b/examples/common.cpp
index d68bd4ba7..54fb4483f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -656,6 +656,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
case CONSOLE_COLOR_USER_INPUT:
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
+ case CONSOLE_COLOR_ERROR:
+ fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
+ break;
}
con_st.color = color;
fflush(con_st.out);
diff --git a/examples/common.h b/examples/common.h
index 5eb611841..ffb068874 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -113,7 +113,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
enum console_color_t {
CONSOLE_COLOR_DEFAULT=0,
CONSOLE_COLOR_PROMPT,
- CONSOLE_COLOR_USER_INPUT
+ CONSOLE_COLOR_USER_INPUT,
+ CONSOLE_COLOR_ERROR
};
struct console_state {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 8d9371e19..7f9636aae 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -82,6 +82,9 @@ int main(int argc, char ** argv) {
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
+ } else if (params.n_ctx < 8) {
+ fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ params.n_ctx = 8;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -347,6 +350,19 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (embd.size() > 0) {
+ // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+ // --prompt or --file which uses the same value.
+ auto max_embd_size = n_ctx - 4;
+ // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+ if ((int)embd.size() > max_embd_size) {
+ auto skipped_tokens = embd.size() - max_embd_size;
+ console_set_color(con_st, CONSOLE_COLOR_ERROR);
+ printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+ fflush(stdout);
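+ // resize() keeps the first max_embd_size tokens and drops the skipped tail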
+ embd.resize(max_embd_size);
+ }
+
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 947b40202..c6bf1b723 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -3,6 +3,7 @@
#include "llama.h"
#include <cstdio>
+#include <cstring>
#include <map>