diff --git a/.editorconfig b/.editorconfig
index fa84cb064..e092729bd 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -41,7 +41,7 @@ indent_style = tab
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
-[{tests/chat/templates/*.jinja,tests/chat/goldens/*.txt}]
+[tests/chat/templates/*.jinja]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
diff --git a/Makefile b/Makefile
index 50dc14fa6..e9a093cbb 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,6 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-antiprompts \
 	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
@@ -1475,11 +1474,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-antiprompts: tests/test-antiprompts.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-tool-call: tests/test-tool-call.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/common/common.h b/common/common.h
index 75a189de6..964ea0351 100644
--- a/common/common.h
+++ b/common/common.h
@@ -4,12 +4,9 @@
 
 #include "llama-cpp.h"
 
-#include 
-#include 
 #include 
 #include 
 #include 
-#include 
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 821eb0b03..b112bfd6f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -504,13 +504,12 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd;
 
     // single-token antiprompts
-    std::vector<llama_token> antiprompt_single_token;
+    std::vector<llama_token> antiprompt_token;
 
-    antiprompt_single_token.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
         auto ids = ::common_tokenize(ctx, antiprompt, false, true);
         if (ids.size() == 1) {
-            antiprompt_single_token.push_back(ids[0]);
+            antiprompt_token.push_back(ids[0]);
         }
     }
 
@@ -756,7 +755,7 @@ int main(int argc, char ** argv) {
 
             // check for reverse prompt using special tokens
             llama_token last_token = common_sampler_last(smpl);
-            if (std::find(antiprompt_single_token.begin(), antiprompt_single_token.end(), last_token) != antiprompt_single_token.end()) {
+            if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
                 if (params.interactive) {
                     is_interacting = true;
                 }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 10e8a1bdb..97430941e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -26,7 +26,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -168,6 +167,7 @@ struct slot_params {
             {"min_keep", sampling.min_keep},
             {"grammar", sampling.grammar},
             {"grammar_trigger_words", sampling.grammar_trigger_words},
+            {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -386,6 +386,14 @@ struct server_task {
             return out;
         };
 
+        {
+            params.antiprompt.clear();
+            const auto stop = data.find("stop");
+            if (stop != data.end()) {
+                params.antiprompt = to_string_vec(*stop);
+            }
+        }
+
         {
             const auto grammar_trigger_words = data.find("grammar_trigger_words");
             if (grammar_trigger_words != data.end()) {
@@ -401,13 +409,6 @@
             }
         }
 
-        {
-            const auto stop = data.find("stop");
-            if (stop != data.end()) {
-                params.antiprompt = to_string_vec(*stop);
-            }
-        }
-
         {
             const auto samplers = data.find("samplers");
             if (samplers != data.end()) {
@@ -730,7 +731,7 @@ struct server_task_result_cmpl_final : server_task_result {
 
         std::time_t t = std::time(0);
 
-        json res {
+        json res = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"model", oaicompat_model},
@@ -762,13 +763,13 @@
            finish_reason = "stop";
        }
 
-        json choice {
+        json choice = json {
            {"finish_reason", finish_reason},
            {"index", 0},
            {"delta", json::object()}
        };
 
-        json ret {
+        json ret = json {
            {"choices", json::array({choice})},
            {"created", t},
            {"id", oaicompat_cmpl_id},
@@ -804,12 +805,10 @@ struct server_task_result_cmpl_partial : server_task_result {
     result_timings timings;
 
     // OAI-compat fields
-    bool verbose = false;
-    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
-    std::string oaicompat_model;
-    std::string oaicompat_cmpl_id;
-    json oaicompat_tools;
-    llama_tool_call_style oaicompat_tool_call_style = llama_tool_call_style::None;
+    bool verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string oaicompat_model;
+    std::string oaicompat_cmpl_id;
 
     virtual int get_index() override {
         return index;
@@ -2048,9 +2047,6 @@ struct server_context {
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = result.text_to_send;
-        // TODO:
-        //  const std::string token_str = result.text_to_send;
-        //  const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special || (match.pos != std::string::npos && match.is_grammar_trigger));
         slot.sampled = result.tok;
 
         slot.generated_text += token_str;
@@ -2276,8 +2272,6 @@ struct server_context {
         res->oaicompat = slot.params.oaicompat;
         res->oaicompat_model = slot.params.oaicompat_model;
         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
-        // res->oaicompat_tools = slot.params.oaicompat_tools;
-        // res->oaicompat_tool_call_style = slot.params.oaicompat_tool_call_style;
 
         // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index e61d01b16..33fa8cc64 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
 
 # make sure we are in the right directory
-TESTS_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-cd $TESTS_DIR
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
 
 set -eu
 
 if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
     # Slow tests for tool calls need quite a few models ahead of time to avoid timing out.
-    python $TESTS_DIR/../../../scripts/fetch_server_test_models.py
+    python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py
 fi
 
 if [ $# -lt 1 ]
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 3c109109a..7641d3410 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -361,7 +361,6 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
         std::string role = json_value(curr_msg, "role", std::string(""));
 
         std::string content;
-
         if (curr_msg.contains("content")) {
             if (curr_msg["content"].is_string()) {
                 content = curr_msg["content"].get<std::string>();
@@ -611,29 +610,16 @@ static json oaicompat_completion_params_parse(
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
 
-    // Handle "response_format" field (https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format)
+    // Handle "response_format" field
     auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-            // Legacy llama.cpp, llama-cpp-python and Together.ai format.
             llama_params["json_schema"] = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-            // OpenAI JSON schema format.
             auto json_schema = json_value(response_format, "json_schema", json::object());
-            json schema = json_value(json_schema, "schema", json::object());
-            std::string description = json_value(json_schema, "description", std::string());
-            if (!description.empty()) {
-                if (schema.contains("description")) {
-                    throw std::runtime_error("Cannot have both a description in the json_schema object and inside its schema.");
-                }
-                schema["description"] = description;
-            }
-            bool strict = json_value(json_schema, "strict", false);
-            if (strict) {
-                llama_params["json_schema"] = schema;
-            }
+            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
diff --git a/include/llama.h b/include/llama.h
index f6217d98c..b58e33e3c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1266,8 +1266,6 @@
     // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
-    LLAMA_API bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl);
-
     // TODO: extend in the future
     //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
 
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 025e477f6..94de59d7e 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,4 +1,3 @@
--r ../examples/agent/requirements.txt
 -r ../examples/llava/requirements.txt
 -r ../examples/server/bench/requirements.txt
 -r ../examples/server/tests/requirements.txt
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index b02c4e3cc..3dc593a48 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1067,7 +1067,6 @@
     // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar {
         vocab,
-        std::move(vec_rules),
         std::move(stacks),
         /* .partial_utf8 = */ {},
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index d96a685e2..38e7aff96 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -3,7 +3,6 @@
 #include "llama.h"
 
 #include 
-#include 
 #include 
 #include 
 
@@ -116,6 +115,7 @@
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
 
+    // lazy grammars wait for trigger words or tokens before constraining the sampling.
     bool awaiting_trigger;
     std::string trigger_buffer;
     std::vector<llama_token> trigger_tokens;
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 387ec6567..129888915 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1536,10 +1536,10 @@ struct llama_sampler * llama_sampler_init_grammar(
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
         *ctx = {
-            /* .vocab = */ vocab,
-            /* .grammar_str = */ grammar_str,
-            /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .vocab        = */ vocab,
+            /* .grammar_str  = */ grammar_str,
+            /* .grammar_root = */ grammar_root,
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
@@ -2423,11 +2423,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
     return LLAMA_DEFAULT_SEED;
 }
 
-bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl) {
-    struct llama_sampler_grammar * ctx = (struct llama_sampler_grammar *) smpl->ctx;
-    return ctx->grammar == nullptr;
-}
-
 // perf
 
 struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
diff --git a/tests/.gitignore b/tests/.gitignore
index 6f6723930..620a48ee4 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,6 +1,4 @@
 *
-!chat/
-!chat/**
 !*.*
 *.o
 ggml-common.h