diff --git a/chat.cpp b/chat.cpp
index 885d1f69a..7bce57a2b 100644
--- a/chat.cpp
+++ b/chat.cpp
@@ -919,7 +919,8 @@ int main(int argc, char ** argv) {
                " - If you want to submit another line, end your input in '\\'.\n");
     }

-    int remaining_tokens = params.n_predict;
+    // we may want to slide the input window along with the context, but for now we restrict to the context length
+    int remaining_tokens = model.hparams.n_ctx - embd_inp.size();
     int input_consumed = 0;
     bool input_noecho = true;
@@ -935,7 +936,7 @@ int main(int argc, char ** argv) {

-    while (true) {
+    while (remaining_tokens > 0) {
         // predict
         if (embd.size() > 0) {
             const int64_t t_start_us = ggml_time_us();
@@ -980,7 +981,7 @@ int main(int argc, char ** argv) {
             input_noecho = false;

             // decrement remaining sampling budget
-            // --remaining_tokens;
+            --remaining_tokens;
         } else {
             // some user input remains from prompt or interaction, forward it to processing
             while (embd_inp.size() > input_consumed) {
@@ -1054,6 +1055,8 @@ int main(int argc, char ** argv) {
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                 embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end());

+                remaining_tokens -= prompt_inp.size() + line_inp.size() + response_inp.size();
+
                 input_noecho = true; // do not echo this again
             }
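
For reference, here is a minimal standalone sketch of the sampling-budget logic this patch introduces. All names (`n_ctx`, `embd_inp`, `sample_next_token`) are illustrative stand-ins for the variables in chat.cpp, not its actual API:

```cpp
#include <cstdio>
#include <vector>

// Placeholder for the model's sampling step (an assumption, not chat.cpp's API).
static int sample_next_token() { return 42; }

int main() {
    const int n_ctx = 512;              // context length, as in model.hparams.n_ctx
    std::vector<int> embd_inp(100, 0);  // pretend the prompt tokenized to 100 tokens

    // The prompt already occupies part of the context window, so the patch
    // only allows sampling until the window is full:
    int remaining_tokens = n_ctx - (int) embd_inp.size();

    while (remaining_tokens > 0) {
        (void) sample_next_token();
        --remaining_tokens;             // one unit of budget per sampled token
    }

    // In interactive mode, user input appended mid-run consumes context too,
    // hence the patch's extra decrement:
    //   remaining_tokens -= prompt_inp.size() + line_inp.size() + response_inp.size();
    printf("window full after %zu prompt + %d generated tokens\n",
           embd_inp.size(), n_ctx - (int) embd_inp.size());
}
```

The trade-off the new comment alludes to: once `remaining_tokens` reaches zero the loop simply stops, whereas a sliding window would evict the oldest tokens from the context and keep generating.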