diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c10e671c5..1a068ae75 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,6 +33,20 @@ jobs:
       run: |
         make

+  windows-latest:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
 #
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..ca3be38a5
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,128 @@
+cmake_minimum_required(VERSION 3.8)
+project("llama.cpp")
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+
+option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
+option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+
+if (APPLE)
+    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
+    option(LLAMA_NO_AVX        "llama: disable AVX"                  OFF)
+    option(LLAMA_NO_AVX2       "llama: disable AVX2"                 OFF)
+    option(LLAMA_NO_FMA        "llama: disable FMA"                  OFF)
+endif()
+
+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+    endif()
+endif()
+
+if (APPLE AND NOT LLAMA_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(LLAMA_EXTRA_LIBS  ${LLAMA_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (LLAMA_ALL_WARNINGS)
+    if (NOT MSVC)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
+            -Wall                \
+            -Wextra              \
+            -Wpedantic           \
+            -Wshadow             \
+            -Wcast-qual          \
+            -Wstrict-prototypes  \
+            -Wpointer-arith      \
+            -Wno-unused-function \
+        ")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+            -Wall       \
+            -Wextra     \
+            -Wpedantic  \
+            -Wcast-qual \
+        ")
+    else()
+        # todo : msvc
+    endif()
+endif()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+    message(STATUS "ARM detected")
+else()
+    message(STATUS "x86 detected")
+    if (MSVC)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
+    else()
+        if(NOT LLAMA_NO_AVX)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
+        endif()
+        if(NOT LLAMA_NO_AVX2)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+        endif()
+        if(NOT LLAMA_NO_FMA)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
+        endif()
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
+    endif()
+endif()
+
+# if (LLAMA_PERF)
+#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
+# endif()
+
+add_executable(llama
+    main.cpp
+    utils.cpp
+    utils.h)
+
+add_executable(quantize
+    quantize.cpp
+    utils.cpp
+    utils.h)
+
+add_library(ggml
+    ggml.c
+    ggml.h)
+
+target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
+
+target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
+target_include_directories(ggml PUBLIC .)
+target_link_libraries(quantize PRIVATE ggml)
+target_link_libraries(llama PRIVATE ggml)
diff --git a/Makefile b/Makefile
index 8388c290d..1601079a4 100644
--- a/Makefile
+++ b/Makefile
@@ -48,6 +48,10 @@ ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS += -pthread
 	CXXFLAGS += -pthread
 endif
+ifeq ($(UNAME_S),NetBSD)
+	CFLAGS += -pthread
+	CXXFLAGS += -pthread
+endif
 ifeq ($(UNAME_S),Haiku)
 	CFLAGS += -pthread
 	CXXFLAGS += -pthread
diff --git a/README.md b/README.md
index 65be1a687..e936282f4 100644
--- a/README.md
+++ b/README.md
@@ -5,11 +5,6 @@

 Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++

-**Hot topics**
-
-- Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22
-- Fix Tokenizer / Unicode support: https://github.com/ggerganov/llama.cpp/issues/11
-
 ## Description

 The main goal is to run the model using 4-bit quantization on a MacBook
@@ -23,14 +18,14 @@ The main goal is to run the model using 4-bit quantization on a MacBook

 This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
 Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly.
-New features will probably be added mostly through community contributions, if any.
+For all I know, it can be completely wrong. This project is for educational purposes.
+New features will probably be added mostly through community contributions.

 Supported platforms:

 - [X] Mac OS
 - [X] Linux
-- [ ] Windows (soon)
+- [X] Windows (via CMake)

 ---

@@ -179,10 +174,6 @@ Note the use of `--color` to distinguish between user input and generated text.

 ## Limitations

-- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
-  - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
-  - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
-  In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
@@ -192,16 +183,15 @@ Note the use of `--color` to distinguish between user input and generated text.

 ### Contributing

-- There are 2 git branches: [master](https://github.com/ggerganov/llama.cpp/commits/master) and [dev](https://github.com/ggerganov/llama.cpp/commits/dev)
-- Contributors can open PRs to either one
-- Collaborators can push straight into `dev`, but need to open a PR to get stuff to `master`
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo
 - Collaborators will be invited based on contributions
-- `dev` branch is considered unstable
-- `master` branch is considered stable and approved. 3-rd party projects should use the `master` branch

-General principles to follow when writing code:
+### Coding guidelines

 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy looking modern STL constructs, use basic for loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean up any trailing whitespace, use 4 spaces indentation, brackets on same line, `int * var`
+- Look at the [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks
diff --git a/ggml.c b/ggml.c
index fbd7b9339..58a4c9b6d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2,7 +2,7 @@

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif

@@ -1360,34 +1360,20 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);

         // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
+        // assume that vdotq_s32 is always available, if not, should check for __ARM_FEATURE_DOTPROD
+        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
+        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);

-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
-        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
+        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
+        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);

         // scalar
 #if defined(__ARM_FEATURE_QRDMX)
-        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
-        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
+        sum0 += d0_0*d1_0*vaddvq_s32(p_0);
+        sum1 += d0_1*d1_1*vaddvq_s32(p_1);
 #else
-        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
+        sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
+        sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
     }
diff --git a/main.cpp b/main.cpp
index 9f3a98f1b..a6660021f 100644
--- a/main.cpp
+++ b/main.cpp
@@ -86,9 +86,6 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };

-
-#define USE_MMAP 1
-
 #ifndef USE_MMAP
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #define USE_MMAP 1
@@ -207,9 +204,13 @@ using llama_istream = std::ifstream;

 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

-    llama_istream fin{fname};
+    llama_istream fin{fname, std::ios::binary};
+#if !USE_MMAP
+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
@@ -246,16 +247,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);

-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
-        printf("%s: n_head = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
-        printf("%s: n_ff = %d\n", __func__, n_ff);
-        printf("%s: n_parts = %d\n", __func__, n_parts);
+        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
+        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
     }

     // load vocab
@@ -280,7 +281,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             vocab.id_to_token[i] = word;

             //if (i < 30000) {
-            //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
             //}
         }
     }
@@ -339,7 +340,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

         ctx_size += (5 + 10*n_layer)*256; // object overhead

-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }

     // create the ggml context
@@ -426,7 +427,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }

     const size_t file_offset = fin.tellg();
@@ -444,9 +445,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             fname_part += "." + std::to_string(i);
         }

-        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());

-        llama_istream fin{fname_part};
+        llama_istream fin{fname_part, std::ios::binary};
+#if !USE_MMAP
+        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif

         fin.seekg(file_offset);

         // load weights
@@ -454,7 +458,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             int n_tensors = 0;
             size_t total_size = 0;

-            printf("%s: ", __func__);
+            fprintf(stderr, "%s: ", __func__);

             while (true) {
                 int32_t n_dims;
@@ -554,7 +558,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

                 if (0) {
                     static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                    printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
                 }

                 size_t bpe = 0;
@@ -617,16 +621,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                     total_size += ggml_nbytes(tensor)/n_parts;
                 }

-                //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);

                 if (++n_tensors % 8 == 0) {
-                    printf(".");
-                    fflush(stdout);
+                    fprintf(stderr, ".");
+                    fflush(stderr);
                 }
             }

-            printf(" done\n");
+            fprintf(stderr, " done\n");

-            printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
         }

         fin.close();
@@ -670,7 +674,7 @@ bool llama_eval(

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

         // reallocate
         buf_size = buf_size_new;
@@ -862,7 +866,7 @@ bool llama_eval(
     if (mem_per_token == 0) {
         mem_per_token = ggml_used_mem(ctx0)/N;
     }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));

     ggml_free(ctx0);

@@ -883,6 +887,26 @@ void sigint_handler(int signo) {
 }
 #endif

+const char * llama_print_system_info(void) {
+    static std::string s;
+
+    s = "";
+    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+
+    return s.c_str();
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
     const int64_t t_main_start_us = ggml_time_us();
@@ -898,7 +922,7 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }

-    printf("%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -925,6 +949,13 @@ int main(int argc, char ** argv) {
         t_load_us = ggml_time_us() - t_start_us;
     }

+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
     int n_past = 0;

     int64_t t_sample_us = 0;
@@ -940,13 +971,13 @@ int main(int argc, char ** argv) {
     // tokenize the reverse prompt
     std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);

-    printf("\n");
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(stderr, "\n");
+    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    printf("\n");
+    fprintf(stderr, "\n");

     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
         sigint_action.sa_handler = sigint_handler;
         sigemptyset (&sigint_action.sa_mask);
         sigint_action.sa_flags = 0;
         sigaction(SIGINT, &sigint_action, NULL);
 #endif

-        printf("%s: interactive mode on.\n", __func__);
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
         if(antiprompt_inp.size()) {
-            printf("%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            printf("%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                printf("%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
             }
-            printf("\n");
+            fprintf(stderr, "\n");
         }
     }

-    printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    printf("\n\n");
+    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "\n\n");

     std::vector<gpt_vocab::id> embd;

@@ -982,7 +1013,7 @@ int main(int argc, char ** argv) {

     if (params.interactive) {
-        printf("== Running in interactive mode. ==\n"
+        fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -1010,7 +1041,7 @@ int main(int argc, char ** argv) {
             const int64_t t_start_us = ggml_time_us();

             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
+                fprintf(stderr, "Failed to predict\n");
                 return 1;
             }

@@ -1123,7 +1154,7 @@ int main(int argc, char ** argv) {

         // end of text token
         if (embd.back() == 2) {
-            printf(" [end of text]\n");
+            fprintf(stderr, " [end of text]\n");
             break;
         }
     }
@@ -1133,12 +1164,12 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_main_end_us = ggml_time_us();

-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(stderr, "\n\n");
+        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }

     ggml_free(model.ctx);
diff --git a/utils.cpp b/utils.cpp
index b340bd61b..54217f02f 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -11,7 +11,7 @@

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
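
Note (not part of the patch): the ggml.c hunk above replaces the vmull_s8/vaddq_s16 sequence with vdotq_s32 and its comment says the code assumes the dot-product extension is always available, suggesting a check for __ARM_FEATURE_DOTPROD otherwise. The following is a minimal sketch of what such a guard could look like on an AArch64 NEON target; the helper name dot_i8x16 is illustrative and not part of the repository.

// Sketch only: guard the vdotq_s32 path behind __ARM_FEATURE_DOTPROD and fall back
// to widening multiplies on NEON CPUs without the dot-product extension.
#include <arm_neon.h>

static inline int32x4_t dot_i8x16(int32x4_t acc, int8x16_t a, int8x16_t b) {
#if defined(__ARM_FEATURE_DOTPROD)
    // single SDOT instruction: 16 int8 products accumulated into 4 int32 lanes
    return vdotq_s32(acc, a, b);
#else
    // widen to int16 products, then pairwise-accumulate into the int32 lanes;
    // the lane grouping differs from the SDOT path, but the horizontal sum
    // (which is all the surrounding code uses) is identical
    const int16x8_t lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vpadalq_s16(vpadalq_s16(acc, lo), hi);
#endif
}

With a helper along these lines, the p_0/p_1 accumulations in the hunk could call it instead of vdotq_s32 directly, keeping the build working on pre-dotprod ARM CPUs.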