Resolved merge conflicts.
commit 3878230201

7 changed files with 247 additions and 94 deletions
.github/workflows/build.yml (vendored): 14 changes

@@ -33,6 +33,20 @@ jobs:
         run: |
           make
 
+  windows-latest:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
   # ubuntu-latest-gcc:
   #   runs-on: ubuntu-latest
   #
CMakeLists.txt (new file): 128 additions

@@ -0,0 +1,128 @@
+cmake_minimum_required(VERSION 3.8)
+project("llama.cpp")
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+
+if (APPLE)
+    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
+    option(LLAMA_NO_AVX "llama: disable AVX" OFF)
+    option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF)
+    option(LLAMA_NO_FMA "llama: disable FMA" OFF)
+endif()
+
+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+    endif()
+endif()
+
+if (APPLE AND NOT LLAMA_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (LLAMA_ALL_WARNINGS)
+    if (NOT MSVC)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
+            -Wall \
+            -Wextra \
+            -Wpedantic \
+            -Wshadow \
+            -Wcast-qual \
+            -Wstrict-prototypes \
+            -Wpointer-arith \
+            -Wno-unused-function \
+        ")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+            -Wall \
+            -Wextra \
+            -Wpedantic \
+            -Wcast-qual \
+        ")
+    else()
+        # todo : msvc
+    endif()
+endif()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+    message(STATUS "ARM detected")
+else()
+    message(STATUS "x86 detected")
+    if (MSVC)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
+    else()
+        if(NOT LLAMA_NO_AVX)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
+        endif()
+        if(NOT LLAMA_NO_AVX2)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+        endif()
+        if(NOT LLAMA_NO_FMA)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
+        endif()
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
+    endif()
+endif()
+
+# if (LLAMA_PERF)
+#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
+# endif()
+
+add_executable(llama
+    main.cpp
+    utils.cpp
+    utils.h)
+
+add_executable(quantize
+    quantize.cpp
+    utils.cpp
+    utils.h)
+
+add_library(ggml
+    ggml.c
+    ggml.h)
+
+target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
+
+target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
+target_include_directories(ggml PUBLIC .)
+target_link_libraries(quantize PRIVATE ggml)
+target_link_libraries(llama PRIVATE ggml)
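For reference (not part of the commit): a quick way to confirm that the AVX2/FMA options set by the CMakeLists.txt above actually reach the compiler is to check the predefined feature macros. The file name and program below are hypothetical, just a minimal sketch:

// check_simd_flags.c - hypothetical helper, not part of this commit.
// If CMake passed -mavx2 / -mfma (GCC/Clang) or /arch:AVX2 (MSVC),
// the corresponding predefined macros are visible at compile time.
#include <stdio.h>

int main(void) {
#if defined(__AVX2__)
    printf("AVX2 enabled at compile time\n");
#else
    printf("AVX2 not enabled\n");
#endif
#if defined(__FMA__)
    printf("FMA enabled at compile time\n"); // GCC/Clang macro; MSVC does not define it
#else
    printf("FMA not enabled\n");
#endif
    return 0;
}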
Makefile: 4 changes

@@ -48,6 +48,10 @@ ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS += -pthread
 	CXXFLAGS += -pthread
 endif
+ifeq ($(UNAME_S),NetBSD)
+	CFLAGS += -pthread
+	CXXFLAGS += -pthread
+endif
 ifeq ($(UNAME_S),Haiku)
 	CFLAGS += -pthread
 	CXXFLAGS += -pthread
README.md: 26 changes

@@ -5,11 +5,6 @@
 
 Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++
 
-**Hot topics**
-
-- Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22
-- Fix Tokenizer / Unicode support: https://github.com/ggerganov/llama.cpp/issues/11
-
 ## Description
 
 The main goal is to run the model using 4-bit quantization on a MacBook

@@ -23,14 +18,14 @@ The main goal is to run the model using 4-bit quantization on a MacBook
 
 This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
 Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly.
-New features will probably be added mostly through community contributions, if any.
+For all I know, it can be completely wrong. This project is for educational purposes.
+New features will probably be added mostly through community contributions.
 
 Supported platforms:
 
 - [X] Mac OS
 - [X] Linux
-- [ ] Windows (soon)
+- [X] Windows (via CMake)
 
 ---
 

@@ -179,10 +174,6 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 ## Limitations
 
-- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
-  - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
-  - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
-  In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,

@@ -192,16 +183,15 @@
 
 ### Contributing
 
-- There are 2 git branches: [master](https://github.com/ggerganov/llama.cpp/commits/master) and [dev](https://github.com/ggerganov/llama.cpp/commits/dev)
-- Contributors can open PRs to either one
-- Collaborators can push straight into `dev`, but need to open a PR to get stuff to `master`
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo
 - Collaborators will be invited based on contributions
-- `dev` branch is considered unstable
-- `master` branch is considered stable and approved. 3-rd party projects should use the `master` branch
 
-General principles to follow when writing code:
+### Coding guide-lines
 
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy looking modern STL constructs, use basic for loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any tailing whitespaces, use 4 spaces indentation, brackets on same line, `int * var`
+- Look at the [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks
ggml.c: 34 changes

@@ -2,7 +2,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 

@@ -1360,34 +1360,20 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 
         // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
-        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
+        // assume that vdotq_s32 is always available, if not, should check for __ARM_FEATURE_DOTPROD
+        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
+        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
+
+        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
+        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
 
         // scalar
 #if defined(__ARM_FEATURE_QRDMX)
-        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
-        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
+        sum0 += d0_0*d1_0*vaddvq_s32(p_0);
+        sum1 += d0_1*d1_1*vaddvq_s32(p_1);
 #else
-        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
+        sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
+        sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
     }
 
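For context on the ggml.c hunk above: it replaces the widening vmull_s8 / vaddq_s16 reduction with the ARMv8.2 dot-product instruction. The standalone sketch below is not from the repository; the names are illustrative, it assumes an AArch64 target, and the vdotq_s32 path additionally assumes the dot-product extension (for example, building with -march=armv8.2-a+dotprod):

// dot_sketch.c - illustrative sketch only, not part of the commit.
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

#if defined(__ARM_FEATURE_DOTPROD)
// New-style path: vdotq_s32 folds 16 int8*int8 products into 4 int32 lanes.
static int32_t dot16_new(int8x16_t a, int8x16_t b) {
    int32x4_t acc = vdotq_s32(vdupq_n_s32(0), a, b);
    return vaddvq_s32(acc); // horizontal sum of the 4 lanes
}
#endif

// Old-style path: widen to int16x8_t with vmull_s8, then reduce.
// The int16 reduction only stays exact while the products are small,
// which holds for the 4-bit quantized values used in ggml_vec_dot_q4_0.
static int32_t dot16_old(int8x16_t a, int8x16_t b) {
    const int16x8_t lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vaddvq_s16(vaddq_s16(lo, hi));
}

int main(void) {
    const int8_t va[16] = { 1, 2, 3, 4, 5, 6, 7, 8, -1, -2, -3, -4, -5, -6, -7, -8 };
    const int8_t vb[16] = { 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1 };
    const int8x16_t a = vld1q_s8(va);
    const int8x16_t b = vld1q_s8(vb);
    printf("old path: %d\n", (int) dot16_old(a, b));
#if defined(__ARM_FEATURE_DOTPROD)
    printf("new path: %d\n", (int) dot16_new(a, b)); // same result, fewer instructions
#endif
    return 0;
}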
main.cpp: 133 changes

@@ -86,9 +86,6 @@ struct llama_model {
    std::map<std::string, struct ggml_tensor *> tensors;
 };
 
-
-#define USE_MMAP 1
-
 #ifndef USE_MMAP
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #define USE_MMAP 1

@@ -207,9 +204,13 @@ using llama_istream = std::ifstream;
 
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
-    llama_istream fin{fname};
+    llama_istream fin{fname, std::ios::binary};
+#if !USE_MMAP
+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;

@@ -246,16 +247,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
-        printf("%s: n_head = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
-        printf("%s: n_ff = %d\n", __func__, n_ff);
-        printf("%s: n_parts = %d\n", __func__, n_parts);
+        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
+        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
     }
 
     // load vocab

@@ -280,7 +281,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             vocab.id_to_token[i] = word;
 
             //if (i < 30000) {
-            //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
             //}
         }
     }

@@ -339,7 +340,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
        ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context

@@ -426,7 +427,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
     const size_t file_offset = fin.tellg();

@@ -444,9 +445,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
            fname_part += "." + std::to_string(i);
        }
 
-        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
-        llama_istream fin{fname_part};
+        llama_istream fin{fname_part, std::ios::binary};
+#if !USE_MMAP
+        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
         fin.seekg(file_offset);
 
         // load weights

@@ -454,7 +458,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         int n_tensors = 0;
         size_t total_size = 0;
 
-        printf("%s: ", __func__);
+        fprintf(stderr, "%s: ", __func__);
 
         while (true) {
             int32_t n_dims;

@@ -554,7 +558,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
            if (0) {
                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
            }
 
            size_t bpe = 0;

@@ -617,16 +621,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                total_size += ggml_nbytes(tensor)/n_parts;
            }
 
-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
+                fprintf(stderr, ".");
+                fflush(stderr);
            }
        }
 
-        printf(" done\n");
+        fprintf(stderr, " done\n");
 
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
     fin.close();

@@ -670,7 +674,7 @@ bool llama_eval(
 
    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
        // reallocate
        buf_size = buf_size_new;

@@ -862,7 +866,7 @@
    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
 
    ggml_free(ctx0);
 

@@ -883,6 +887,26 @@ void sigint_handler(int signo) {
 }
 #endif
 
+const char * llama_print_system_info(void) {
+    static std::string s;
+
+    s = "";
+    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+
+    return s.c_str();
+}
+
 int main(int argc, char ** argv) {
    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();

@@ -898,7 +922,7 @@ int main(int argc, char ** argv) {
        params.seed = time(NULL);
    }
 
-    printf("%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
    std::mt19937 rng(params.seed);
    if (params.prompt.empty()) {

@@ -925,6 +949,13 @@
        t_load_us = ggml_time_us() - t_start_us;
    }
 
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
    int n_past = 0;
 
    int64_t t_sample_us = 0;

@@ -940,13 +971,13 @@
    // tokenize the reverse prompt
    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
-    printf("\n");
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(stderr, "\n");
+    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
    for (int i = 0; i < (int) embd_inp.size(); i++) {
-        printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
    }
-    printf("\n");
+    fprintf(stderr, "\n");
    if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;

@@ -956,19 +987,19 @@
        sigaction(SIGINT, &sigint_action, NULL);
 #endif
 
-        printf("%s: interactive mode on.\n", __func__);
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
 
        if(antiprompt_inp.size()) {
-            printf("%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            printf("%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
            for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                printf("%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
            }
-            printf("\n");
+            fprintf(stderr, "\n");
        }
    }
-    printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    printf("\n\n");
+    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "\n\n");
 
    std::vector<gpt_vocab::id> embd;
 

@@ -982,7 +1013,7 @@
 
 
    if (params.interactive) {
-        printf("== Running in interactive mode. ==\n"
+        fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
               " - Press Ctrl+C to interject at any time.\n"
 #endif

@@ -1010,7 +1041,7 @@
            const int64_t t_start_us = ggml_time_us();
 
            if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
+                fprintf(stderr, "Failed to predict\n");
                return 1;
            }
 

@@ -1123,7 +1154,7 @@
 
        // end of text token
        if (embd.back() == 2) {
-            printf(" [end of text]\n");
+            fprintf(stderr, " [end of text]\n");
            break;
        }
    }

@@ -1133,12 +1164,12 @@
    {
        const int64_t t_main_end_us = ggml_time_us();
 
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(stderr, "\n\n");
+        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }
 
    ggml_free(model.ctx);
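The bulk of the main.cpp changes above move diagnostics from printf to fprintf(stderr, ...). A minimal sketch of the resulting convention (illustrative only, not from the repository): logs and statistics go to stderr, generated text goes to stdout, so the two streams can be redirected independently, for example sending stdout to a file while the log stays on the terminal.

// log_sketch.c - illustrative only, not part of the commit.
#include <stdio.h>

int main(void) {
    fprintf(stderr, "loading model ...\n");  // progress / statistics on stderr
    fprintf(stderr, "system_info: ...\n");   // more diagnostics on stderr
    printf("the generated text itself\n");   // payload output, the only thing on stdout
    fflush(stderr);
    return 0;
}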
Seventh changed file: 2 changes

@@ -11,7 +11,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 