From c574bddb368424b5996cbee2ec45ec050967d404 Mon Sep 17 00:00:00 2001
From: Bono Lv
Date: Tue, 1 Aug 2023 20:54:28 +0800
Subject: [PATCH 1/4] fix a typo in examples/server/README.md (#2478)

---
 examples/server/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index e5ca8269b..aee31ae42 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -163,7 +163,7 @@ node .
 
     `content`: Set the text to tokenize.
 
-    Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
 
 - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
 

From a312193e184b919047f33a5e844d846c5538dbe6 Mon Sep 17 00:00:00 2001
From: Yiming Cui
Date: Wed, 2 Aug 2023 14:18:31 +0800
Subject: [PATCH 2/4] readme : Add Chinese LLaMA-2 / Alpaca-2 to supported models (#2475)

* add support for chinese llama-2 / alpaca-2

* remove white spaces

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b231d24b8..515c80c42 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] LLaMA 2 🦙🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
-- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)

From 81844fbcfd93a162b7aeaea9e4f2ab1358f7f97e Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 2 Aug 2023 04:06:19 -0400
Subject: [PATCH 3/4] tests : Fix compilation warnings (Linux/GCC) (#2451)

* fix hellaswag print format, cast away warning in test-double-float

* c++11 cannot use designated initializers

* add static to test-grad0.c internal functions

* use memcpy in test-double-float.c

* port c tests to c++

* use initializer list for ggml_init_params

---
 Makefile                                      |  6 ++--
 examples/common.cpp                           |  2 +-
 scripts/sync-ggml.sh                          |  4 +--
 tests/CMakeLists.txt                          |  6 ++--
 ...t-double-float.c => test-double-float.cpp} | 12 ++++---
 tests/{test-grad0.c => test-grad0.cpp}        | 32 +++++++++----------
 tests/{test-opt.c => test-opt.cpp}            | 15 +++++----
 7 files changed, 40 insertions(+), 37 deletions(-)
 rename tests/{test-double-float.c => test-double-float.cpp} (88%)
 rename tests/{test-grad0.c => test-grad0.cpp} (98%)
 rename tests/{test-opt.c => test-opt.cpp} (96%)

diff --git a/Makefile b/Makefile
index 100614b4b..a692a39ea 100644
--- a/Makefile
+++ b/Makefile
@@ -411,13 +411,13 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
 tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
diff --git a/examples/common.cpp b/examples/common.cpp
index e6439841d..3e7c3b696 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -572,7 +572,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
     fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %d)\n", params.hellaswag_tasks);
+    fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 02ea6ec15..3d13e852a 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -10,5 +10,5 @@ cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
 cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
 
-cp -rpv ../ggml/tests/test-opt.c ./tests/test-opt.c
-cp -rpv ../ggml/tests/test-grad0.c ./tests/test-grad0.c
+cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 11ec6c725..1a40edbec 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,10 +6,10 @@ function(llama_add_test source)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
 
-# llama_add_test(test-double-float.c) # SLOW
+# llama_add_test(test-double-float.cpp) # SLOW
 llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.cpp) # SLOW
+# llama_add_test(test-opt.cpp) # SLOW
diff --git a/tests/test-double-float.c b/tests/test-double-float.cpp
similarity index 88%
rename from tests/test-double-float.c
rename to tests/test-double-float.cpp
index 89dafc9f2..b506f273f 100644
--- a/tests/test-double-float.c
+++ b/tests/test-double-float.cpp
@@ -3,10 +3,11 @@
 // This is done by checking all finite (non-NaN, non-infinite) floats.
 
 #undef NDEBUG
-#include <assert.h>
+#include <cassert>
 #include <immintrin.h>
-#include <math.h>
-#include <stdint.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdouble-promotion"
@@ -32,8 +33,9 @@ inline static float silu_float(float x) {
 int main(void) {
     uint32_t x = UINT32_MAX;
     do {
-        float f = *(float *)&x;
-        assert(!isfinite(f) || (round_orig(f) == round_float(f)));
+        float f;
+        memcpy(&f, &x, sizeof(x));
+        assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
     } while (x--);
 
 #ifdef __F16C__
diff --git a/tests/test-grad0.c b/tests/test-grad0.cpp
similarity index 98%
rename from tests/test-grad0.c
rename to tests/test-grad0.cpp
index 6d312216d..75a698d73 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.cpp
@@ -1,10 +1,10 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #include "ggml.h"
 
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -47,16 +47,16 @@
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
-float frand(void) {
+static float frand(void) {
     return (float)rand()/(float)RAND_MAX;
 }
 
-int irand(int n) {
+static int irand(int n) {
     if (n == 0) return 0;
     return rand()%n;
 }
 
-void get_random_dims(int64_t * dims, int ndims) {
+static void get_random_dims(int64_t * dims, int ndims) {
     dims[0] = dims[1] = dims[2] = dims[3] = 1;
 
     for (int i = 0; i < ndims; i++) {
@@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
     }
 }
 
-struct ggml_tensor * get_random_tensor_f32(
+static struct ggml_tensor * get_random_tensor_f32(
         struct ggml_context * ctx0,
         int ndims,
         int64_t ne[],
@@ -112,7 +112,7 @@ struct ggml_tensor * get_random_tensor_f32(
     return result;
 }
 
-struct ggml_tensor * get_random_tensor_f16(
+static struct ggml_tensor * get_random_tensor_f16(
         struct ggml_context * ctx0,
         int ndims,
         int64_t ne[],
@@ -160,7 +160,7 @@ struct ggml_tensor * get_random_tensor_f16(
     return result;
 }
 
-struct ggml_tensor * get_random_tensor_i32(
+static struct ggml_tensor * get_random_tensor_i32(
         struct ggml_context * ctx0,
         int ndims,
         int64_t ne[],
@@ -208,7 +208,7 @@ struct ggml_tensor * get_random_tensor_i32(
     return result;
 }
 
-void print_elements(const char* label, const struct ggml_tensor * t) {
+static void print_elements(const char* label, const struct ggml_tensor * t) {
     if (!t) {
         printf("%s: %s = null\n", __func__, label);
         return;
     }
@@ -228,7 +228,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 }
 
-bool check_gradient(
+static bool check_gradient(
         const char * op_name,
         struct ggml_context * ctx0,
         struct ggml_tensor * x[],
@@ -310,7 +310,7 @@ bool check_gradient(
 }
 
 // TODO: clean-up this ..
-bool check_mat_mul(
+static bool check_mat_mul(
         const struct ggml_tensor * y,
         const struct ggml_tensor * x0,
         const struct ggml_tensor * x1) {
@@ -373,9 +373,9 @@ bool check_mat_mul(
 
 int main(int argc, const char ** argv) {
     struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
+        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
     };
 
     int64_t ne[4];
diff --git a/tests/test-opt.c b/tests/test-opt.cpp
similarity index 96%
rename from tests/test-opt.c
rename to tests/test-opt.cpp
index 4eef62bcf..8ab240202 100644
--- a/tests/test-opt.c
+++ b/tests/test-opt.cpp
@@ -1,9 +1,9 @@
 #include "ggml.h"
 
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
 
 #define MAX_NARGS 2
 
@@ -119,10 +119,11 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
 
 int main(void) {
     struct ggml_init_params params = {
-        .mem_size   = 1024*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
     };
+
     struct ggml_context * ctx = ggml_init(params);
 
     int64_t ne1[4] = {4, 128, 1, 1};

From 220d9318647a8ce127dbf7c9de5400455f41e7d8 Mon Sep 17 00:00:00 2001
From: ldwang
Date: Wed, 2 Aug 2023 16:21:11 +0800
Subject: [PATCH 4/4] readme : add Aquila-7B model series to supported models (#2487)

* support bpe tokenizer in convert

Signed-off-by: ldwang

* support bpe tokenizer in convert

Signed-off-by: ldwang

* support bpe tokenizer in convert, fix

Signed-off-by: ldwang

* Add Aquila-7B models in README.md

Signed-off-by: ldwang

* Up Aquila-7B models in README.md

Signed-off-by: ldwang

---------

Signed-off-by: ldwang
Co-authored-by: ldwang
---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 515c80c42..2ece294b7 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
+- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
 
 **Bindings:**
 
@@ -492,6 +493,9 @@ Building the program with BLAS support may lead to some performance improvements
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
+  # [Optional] for models using BPE tokenizers
+  ls ./models
+  65B 30B 13B 7B vocab.json
 
 # install Python dependencies
 python3 -m pip install -r requirements.txt
 
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/
 
+  # [Optional] for models using BPE tokenizers
+  python convert.py models/7B/ --vocabtype bpe
+
 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0