diff --git a/third_party/ggml/README.cosmo b/third_party/ggml/README.cosmo
index 5c04a7d4f..95a8d5380 100644
--- a/third_party/ggml/README.cosmo
+++ b/third_party/ggml/README.cosmo
@@ -16,6 +16,11 @@ ORIGIN
 
 LOCAL CHANGES
 
+  - Introduce -v and --verbose flags
+  - Don't print stats / diagnostics unless -v is passed
+  - Reduce --top_p default from 0.95 to 0.70
+  - Change --reverse-prompt to no longer imply --interactive
+  - Permit --reverse-prompt specifying custom EOS if non-interactive
   - Refactor headers per cosmo convention
   - Replace code like 'ggjt' with READ32BE("ggjt")
   - Remove C++ exceptions; use Die() function instead
diff --git a/third_party/ggml/common.cc b/third_party/ggml/common.cc
index b6ef67e8b..62aaa405e 100644
--- a/third_party/ggml/common.cc
+++ b/third_party/ggml/common.cc
@@ -91,6 +91,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.seed = std::stoi(argv[i]);
+        } else if (arg == "-v" || arg == "--verbose") {
+            ++params.verbose;
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
diff --git a/third_party/ggml/common.h b/third_party/ggml/common.h
index f655baed2..cb5992535 100644
--- a/third_party/ggml/common.h
+++ b/third_party/ggml/common.h
@@ -17,6 +17,7 @@
 
 struct gpt_params {
     int32_t seed          = -1; // RNG seed
+    int32_t verbose       = 0;  // Logging verbosity
     int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_predict     = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
@@ -27,7 +28,7 @@ struct gpt_params {
 
     // sampling parameters
     int32_t top_k = 40;
-    float   top_p = 0.95f;
+    float   top_p = 0.70f;
     float   temp  = 0.80f;
     float   repeat_penalty = 1.10f;
 
diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc
index c8fcb2fb1..8c1fb3a93 100644
--- a/third_party/ggml/llama.cc
+++ b/third_party/ggml/llama.cc
@@ -451,7 +451,7 @@ struct llama_file_loader {
 
     llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        // fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
@@ -561,7 +561,7 @@ struct llama_file_saver {
     llama_file_loader * any_file_loader;
     llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
         : file(fname, "wb"), any_file_loader(any_file_loader) {
-        fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
+        // fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
         write_magic();
         write_hparams(new_ftype);
         write_vocab();
@@ -919,7 +919,8 @@ static void llama_model_load_internal(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
+        void * progress_callback_user_data,
+        int verbose) {
 
     lctx.t_start_us = ggml_time_us();
 
@@ -943,7 +944,7 @@ static void llama_model_load_internal(
         hparams.n_ctx = n_ctx;
     }
 
-    {
+    if (verbose) {
         fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
@@ -966,7 +967,9 @@ static void llama_model_load_internal(
 
     size_t ctx_size, mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
+    if (verbose) {
+        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
+    }
 
     // print memory requirements
     {
@@ -984,8 +987,10 @@ static void llama_model_load_internal(
         const size_t mem_required_state =
             scale*MEM_REQ_KV_SELF().at(model.type);
 
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+        if (verbose) {
+            fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                    mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+        }
     }
 
     // create the ggml context
@@ -1068,10 +1073,12 @@ static bool llama_model_load(
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
+        void *progress_callback_user_data,
+        int verbose) {
     // try {
         llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+                                  vocab_only, progress_callback, progress_callback_user_data,
+                                  verbose);
         return true;
     // } catch (const std::string & err) {
     //     fprintf(stderr, "error loading model: %s\n", err.c_str());
@@ -1783,7 +1790,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
 struct llama_context * llama_init_from_file(
         const char * path_model,
-        struct llama_context_params params) {
+        struct llama_context_params params,
+        int verbose) {
     ggml_time_init();
 
     llama_context * ctx = new llama_context;
@@ -1793,7 +1801,7 @@ struct llama_context * llama_init_from_file(
     }
 
     unsigned cur_percentage = 0;
-    if (params.progress_callback == NULL) {
+    if (verbose && params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
         params.progress_callback = [](float progress, void * ctx) {
             unsigned * cur_percentage_p = (unsigned *) ctx;
@@ -1816,7 +1824,8 @@ struct llama_context * llama_init_from_file(
 
     if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
-                          params.progress_callback, params.progress_callback_user_data)) {
+                          params.progress_callback, params.progress_callback_user_data,
+                          verbose)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;
@@ -1830,7 +1839,7 @@ struct llama_context * llama_init_from_file(
         return nullptr;
     }
 
-    {
+    if (verbose) {
         const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
         fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
diff --git a/third_party/ggml/llama.h b/third_party/ggml/llama.h
index c8cbdaabc..0cbf6e111 100644
--- a/third_party/ggml/llama.h
+++ b/third_party/ggml/llama.h
@@ -87,7 +87,8 @@ extern "C" {
     // Return NULL on failure
     LLAMA_API struct llama_context * llama_init_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_context_params params,
+            int verbose);
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/third_party/ggml/main.cc b/third_party/ggml/main.cc
index af22ef5de..330109dbc 100644
--- a/third_party/ggml/main.cc
+++ b/third_party/ggml/main.cc
@@ -96,8 +96,7 @@ void sigint_handler(int signo) {
         if (!is_interacting) {
             is_interacting=true;
         } else {
-            llama_print_timings(*g_ctx);
-            _exit(130);
+            _exit(128 + signo);
         }
     }
 }
@@ -155,7 +154,9 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    if (params.verbose) {
+        fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    }
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
@@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
         lparams.use_mmap  = params.use_mmap;
         lparams.use_mlock = params.use_mlock;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        ctx = llama_init_from_file(params.model.c_str(), lparams, params.verbose);
 
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
@@ -199,7 +200,7 @@ int main(int argc, char ** argv) {
     }
 
     // print system information
-    {
+    if (params.verbose) {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
@@ -218,7 +219,9 @@ int main(int argc, char ** argv) {
             llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
         }
 
-        llama_print_timings(ctx);
+        if (params.verbose) {
+            llama_print_timings(ctx);
+        }
         llama_free(ctx);
 
         return 0;
@@ -252,8 +255,8 @@ int main(int argc, char ** argv) {
         params.antiprompt.push_back("### Instruction:\n\n");
     }
 
-    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_first) {
+    // enable interactive mode if interactive start is specified
+    if (params.interactive_first) {
         params.interactive = true;
     }
 
@@ -288,28 +291,33 @@ int main(int argc, char ** argv) {
         signal(SIGINT, sigint_handler);
 #endif
 
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        if (params.verbose) {
+            fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        }
 
-        if (params.antiprompt.size()) {
+        if (params.verbose && params.antiprompt.size()) {
             for (auto antiprompt : params.antiprompt) {
                 fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
            }
        }
 
-        if (!params.input_prefix.empty()) {
+        if (params.verbose && !params.input_prefix.empty()) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
     }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    fprintf(stderr, "\n\n");
+
+    if (params.verbose) {
+        fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
+                params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+        fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+        fprintf(stderr, "\n\n");
+    }
 
     // TODO: replace with ring-buffer
     std::vector<llama_token> last_n_tokens(n_ctx);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
-    if (params.interactive) {
+    if (params.verbose && params.interactive) {
         fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
@@ -320,7 +328,7 @@ int main(int argc, char ** argv) {
     }
 
     bool is_antiprompt = false;
-    bool input_noecho  = false;
+    bool input_noecho  = !params.verbose;
 
     int n_past     = 0;
     int n_remain   = params.n_predict;
@@ -427,6 +435,40 @@ int main(int argc, char ** argv) {
             }
         }
 
+        // checks for reverse prompt
+        //
+        // 1. in interactive mode, this lets us detect when the llm is
+        //    prompting the user, so we can pause for input, e.g.
+        //
+        //      --interactive
+        //      --prompt $'CompanionAI: How can I help you?\nHuman:'
+        //      --reverse-prompt 'Human:'
+        //
+        // 2. in normal mode, the reverse prompt can be used to specify
+        //    a custom EOS token, e.g.
+        //
+        //      --prompt 'Question: How old are you?\nAnswer: '
+        //      --reverse-prompt $'\n'
+        //
+        if (params.antiprompt.size()) {
+            std::string last_output;
+            for (auto id : last_n_tokens) {
+                last_output += llama_token_to_str(ctx, id);
+            }
+            is_antiprompt = false;
+            // Check if each of the reverse prompts appears at the end of the output.
+            for (std::string & antiprompt : params.antiprompt) {
+                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                    is_antiprompt = true;
+                    break;
+                }
+            }
+            if (is_antiprompt && !params.interactive) {
+                printf("\n");
+                break;
+            }
+        }
+
         // display text
         if (!input_noecho) {
             for (auto id : embd) {
@@ -435,34 +477,20 @@ int main(int argc, char ** argv) {
             fflush(stdout);
         }
 
         // reset color to default if we there is no pending user input
-        if (!input_noecho && (int)embd_inp.size() == n_consumed) {
+        if (params.verbose && !input_noecho && (int)embd_inp.size() == n_consumed) {
             set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
         }
 
+        if (is_antiprompt) {
+            is_interacting = true;
+            set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
+            fflush(stdout);
+        }
+
         // in interactive mode, and not currently processing queued inputs;
         // check if we should prompt the user for more
         if (params.interactive && (int) embd_inp.size() <= n_consumed) {
-            // check for reverse prompt
-            if (params.antiprompt.size()) {
-                std::string last_output;
-                for (auto id : last_n_tokens) {
-                    last_output += llama_token_to_str(ctx, id);
-                }
-
-                is_antiprompt = false;
-                // Check if each of the reverse prompts appears at the end of the output.
-                for (std::string & antiprompt : params.antiprompt) {
-                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                        is_interacting = true;
-                        is_antiprompt = true;
-                        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-                        fflush(stdout);
-                        break;
-                    }
-                }
-            }
-
             if (n_past > 0 && is_interacting) {
                 // potentially set color to indicate we are taking user input
                 set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
@@ -542,7 +570,7 @@ int main(int argc, char ** argv) {
         if (!embd.empty() && embd.back() == llama_token_eos()) {
             if (params.instruct) {
                 is_interacting = true;
-            } else {
+            } else if (params.verbose) {
                 fprintf(stderr, " [end of text]\n");
                 break;
             }
@@ -559,7 +587,9 @@ int main(int argc, char ** argv) {
     signal(SIGINT, SIG_DFL);
#endif
 
-    llama_print_timings(ctx);
+    if (params.verbose) {
+        llama_print_timings(ctx);
+    }
     llama_free(ctx);
 
     set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
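Note on the llama_init_from_file() signature change above: any caller outside this diff now has to pass a trailing verbosity argument. Below is a minimal sketch of an adapted caller, not part of this diff; it assumes llama_context_default_params() from upstream llama.h, and the load_quietly() helper name is hypothetical.

    // Hypothetical caller (not part of this diff), adapting to the new
    // llama_init_from_file(path_model, params, verbose) signature.
    #include "third_party/ggml/llama.h"
    #include <cstdio>

    static int load_quietly(const char * model_path) {
        // assumed helper from upstream llama.h; fills in default params
        struct llama_context_params lparams = llama_context_default_params();
        // pass 0 to suppress the loader's stderr diagnostics; pass the value
        // accumulated from -v/--verbose (params.verbose) to keep them
        struct llama_context * ctx = llama_init_from_file(model_path, lparams, 0);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model '%s'\n", model_path);
            return 1;
        }
        llama_free(ctx);
        return 0;
    }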