server.py: default tools work!
commit d5d9993679 (parent ffc74360e2)
15 changed files with 449 additions and 223 deletions
.gitmodules (vendored) | 3 changes
@@ -1,6 +1,3 @@
 [submodule "kompute"]
 	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
-[submodule "examples/agents/hermes_function_calling"]
-	path = examples/agents/hermes_function_calling
-	url = https://github.com/NousResearch/Hermes-Function-Calling
@@ -1,15 +0,0 @@
-
-Edit `examples/agents/hermes_function_calling/utils.py`:
-
-```py
-log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs"))
-```
-
-Then run:
-
-```bash
-REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \
-  examples/agents/run_sandboxed_tools.sh \
-    examples/agents/hermes_function_calling/functions.py \
-    -e LOG_FOLDER=/data/inference_logs
-```
@@ -1 +0,0 @@
-Subproject commit b4f757e27d87f4ab408f706f482c25a8e1508d59
@@ -1,3 +0,0 @@
-jsonargparse
-pydantic
-typer[all]
@@ -1,15 +1,45 @@
-# examples.openai: OpenAI API-compatible server
+# examples.openai: OpenAI API-compatible server + agent / tools examples
 
 A simple Python server that sits above the C++ [../server](examples/server) and offers improved OAI compatibility.
 
 ## Usage
 
+Run a simple test:
+
 ```bash
-python -m examples.openai -m some-model.gguf
+# Spawns a Python server (which spawns a C++ Server) then hits it w/ a tool-calling request
+examples/openai/test.sh
 ```
 
+To simply run the Python server (+ C++ server under the hood):
+
+```bash
+python -m examples.openai
+```
+
+## Tools usage (WIP)
+
+```bash
+git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling
+```
+
+Then edit `examples/agents/hermes_function_calling/utils.py`:
+
+```py
+log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs"))
+```
+
+Then run tools in a sandbox:
+
+```bash
+REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \
+  examples/agents/run_sandboxed_tools.sh \
+    examples/agents/hermes_function_calling/functions.py \
+    -e LOG_FOLDER=/data/inference_logs
+```
+
+TODO: reactor that reads OpenAPI definitions and does the tool calling
+
 ## Features
 
 The new examples/openai/server.py:
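As a quick illustration of the usage described in this README (a sketch, not part of the commit): assuming the default host/port above (localhost:8080) and the `/v1/chat/completions` route exercised by `examples/openai/test.sh` below, a minimal request from Python could look like this.

```py
# Minimal sketch (not part of this commit): hit the local OpenAI-compatible endpoint.
# Assumes the Python server is listening on localhost:8080 (the defaults above) and
# exposes /v1/chat/completions as test.sh does.
import httpx

response = httpx.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",  # placeholder model name, as in test.sh
        "messages": [
            {"role": "user", "content": "Say hello in one short sentence."},
        ],
    },
    timeout=None,
)
print(response.json())
```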
@@ -1,9 +1,14 @@
-from typing import Any, Optional
+from typing import Any, Dict, Optional
 from pydantic import BaseModel, Json
 
+class ToolCall(BaseModel):
+    name: str
+    arguments: Dict[str, Any]
+
 class Message(BaseModel):
     role: str
-    content: str
+    content: Optional[str]
+    tool_calls: Optional[list[ToolCall]] = None
 
 class ToolFunction(BaseModel):
     name: str
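To make the new api.py types concrete, here is a small self-contained sketch (not part of the commit) that mirrors the ToolCall and Message definitions above and shows a tool-call message being serialized; the example argument values are taken from the test request further down.

```py
# Self-contained sketch mirroring the Pydantic models added in the hunk above.
from typing import Any, Dict, Optional
from pydantic import BaseModel

class ToolCall(BaseModel):
    name: str
    arguments: Dict[str, Any]

class Message(BaseModel):
    role: str
    content: Optional[str]
    tool_calls: Optional[list[ToolCall]] = None

# A tool-call message, as produced by the parser in examples/openai/prompting.py:
msg = Message(
    role="assistant",
    content=None,
    tool_calls=[ToolCall(name="get_current_weather",
                         arguments={"location": "Glasgow", "format": "celsius"})],
)
print(msg.model_dump_json(indent=2))
```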
@@ -1,59 +0,0 @@
-from enum import StrEnum
-import jinja2
-
-from examples.openai.gguf_kvs import GGUFKeyValues, Keys
-
-def raise_exception(msg: str):
-    raise Exception(msg)
-
-class ToolStyle(StrEnum):
-    # https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models
-    DEFAULT="Default",
-    # https://github.com/MeetKai/functionary
-    # TODO: look at https://github.com/ggerganov/llama.cpp/pull/5695
-    # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py
-    FUNCTIONARY_V2="Functionary V2",
-    # https://github.com/NousResearch/Hermes-Function-Calling
-    NOUS_RESEARCH_HERMES="Nous-Research-Hermes-Function-Calling",
-
-class ChatFormat: #(BaseModel):
-    def __init__(self, template: str, eos_token: str, bos_token: str):
-        env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True)
-        self.template = env.from_string(template)
-        self.eos_token = eos_token
-        self.bos_token = bos_token
-
-        self.strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template
-
-        if "<|recipient|>' + tool_call['function']['name']" in template:
-            self.tool_style = ToolStyle.FUNCTIONARY_V2
-        else:
-            self.tool_style = ToolStyle.DEFAULT
-
-
-    def __str__(self):
-        return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})"
-
-
-    @staticmethod
-    def from_gguf(metadata: GGUFKeyValues):
-        return ChatFormat(
-            template = metadata[Keys.Tokenizer.CHAT_TEMPLATE],
-            bos_token = metadata[Keys.Tokenizer.BOS_ID],
-            eos_token = metadata[Keys.Tokenizer.EOS_ID])
-    # @staticmethod
-    # def from_gguf(model: Path):
-    #     reader = GGUFReader(model.as_posix())
-    #     return ChatFormat(
-    #         template = reader.fields[Keys.Tokenizer.CHAT_TEMPLATE].read(),
-    #         bos_token = reader.fields[Keys.Tokenizer.BOS_ID].read(),
-    #         eos_token = reader.fields[Keys.Tokenizer.EOS_ID].read())
-
-    def render(self, messages: list[dict], add_generation_prompt: bool, omit_bos: bool = False):
-        return self.template.render(
-            messages=messages,
-            eos_token=self.eos_token,
-            bos_token='' if omit_bos else self.bos_token,
-            raise_exception=raise_exception,
-            add_generation_prompt=add_generation_prompt,
-        )
@@ -8,8 +8,6 @@ from anyio import Path
 import fastapi, uvicorn
 import typer
 
-# from langchain_core.tools import BaseTool
-
 def load_source_as_module(source):
     i = 0
     while (module_name := f'mod_{i}') in sys.modules:
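For context, `load_source_as_module` (only its first lines are visible in this hunk) is the helper that turns an arbitrary tools script into an importable module. A generic sketch of that technique, assumed and not the commit's exact code, looks like this:

```py
# Generic sketch (assumption, not this commit's exact implementation):
# load a Python source file under a fresh, unused module name.
import importlib.util
import sys

def load_source_as_module(source: str):
    # Pick a module name that is not already registered.
    i = 0
    while (module_name := f'mod_{i}') in sys.modules:
        i += 1
    spec = importlib.util.spec_from_file_location(module_name, source)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module
```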
examples/openai/prompt1.txt (new file) | 43 lines
@@ -0,0 +1,43 @@
+<|im_start|>system
+Role:
+You are a function calling AI agent with self-recursion.
+You can call only one function at a time and analyse data you get from function response.
+You are provided with function signatures within <tools></tools> XML tags.
+The current date is: March 25, 2024.
+
+Objective:
+You may use agentic frameworks for reasoning and planning to help with user query.
+Please call a function and wait for function results to be provided to you in the next iteration.
+Don't make assumptions about what values to plug into function arguments.
+Once you have called a function, results will be fed back to you within <tool_response></tool_response> XML tags.
+Don't make assumptions about tool results if <tool_response> XML tags are not present since function hasn't been executed yet.
+Analyze the data once you get the results and call another function.
+At each iteration please continue adding your analysis to previous summary.
+Your final response should directly answer the user query with an analysis or summary of the results of function calls.
+
+Tools:
+Here are the available tools:
+<tools>
+{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"format":{"type":"string","enum":["celsius","fahrenheit"],"description":"The temperature unit to use. Infer this from the users location."}},"required":["location","format"]}}}
+{"type":"function","function":{"name":"get_n_day_weather_forecast","description":"Get an N-day weather forecast","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"format":{"type":"string","enum":["celsius","fahrenheit"],"description":"The temperature unit to use. Infer this from the users location."},"num_days":{"type":"integer","description":"The number of days to forecast"}},"required":["location","format","num_days"]}}}
+</tools>
+If the provided function signatures don't have the function you must call, you may write executable python code in markdown syntax and call code_interpreter() function as follows:
+<tool_call>
+{"arguments": {"code_markdown": <python-code>, "name": "code_interpreter"}}
+</tool_call>
+Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree.
+
+Instructions:
+At the very first turn you don't have <tool_results> so you shouldn't make up the results.
+Please keep a running summary with analysis of previous function results and summaries from previous iterations.
+Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10.
+Calling multiple functions at once can overload the system and increase cost so call one function at a time please.
+If you plan to continue with analysis, always call another function.
+For each function call return a valid json object (using double quotes) with function name and arguments within <tool_call></tool_call> XML tags as follows:
+<tool_call>
+{"arguments": <args-dict>, "name": <function-name>}
+</tool_call>
+<|im_end|>
+<|im_start|>user
+what is the weather going to be like in San Francisco and Glasgow over the next 4 days (temperature in celsius for both)<|im_end|>
+<|im_start|>assistant
examples/openai/prompting.py (new file) | 242 lines
@@ -0,0 +1,242 @@
+from enum import Enum
+import jinja2
+import json
+from pathlib import Path
+import sys
+from typing import Optional, Tuple, Callable
+from typeguard import typechecked
+
+from examples.json_schema_to_grammar import SchemaConverter
+from examples.openai.api import Tool, Message
+from examples.openai.gguf_kvs import GGUFKeyValues, Keys
+from examples.openai.ts_converter import SchemaToTypeScriptConverter
+
+@typechecked
+def raise_exception(msg: str):
+    raise Exception(msg)
+
+class ChatFormat:
+    def __init__(self, template: str, eos_token: str, bos_token: str):
+        env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True)
+        self.template = env.from_string(template)
+        self.eos_token = eos_token
+        self.bos_token = bos_token
+
+        self.strict_user_assistant_alternation = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template
+
+        if "<|recipient|>' + tool_call['function']['name']" in template:
+            self.tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2
+        else:
+            self.tool_style = ToolsPromptStyle.TOOLS_LONG
+
+    def __str__(self):
+        return f"ChatFormat(template={self.template}, eos_token={self.eos_token}, bos_token={self.bos_token})"
+
+    @staticmethod
+    def from_gguf(metadata: GGUFKeyValues):
+        return ChatFormat(
+            template = metadata[Keys.Tokenizer.CHAT_TEMPLATE],
+            bos_token = metadata[Keys.Tokenizer.BOS_ID],
+            eos_token = metadata[Keys.Tokenizer.EOS_ID])
+
+    def render(self, messages: list[dict], add_generation_prompt: bool, omit_bos: bool = False):
+        return self.template.render(
+            messages=messages,
+            eos_token=self.eos_token,
+            bos_token='' if omit_bos else self.bos_token,
+            raise_exception=raise_exception,
+            add_generation_prompt=add_generation_prompt,
+        )
+
+# While the API will be usable with a generic tools usage like OpenAI,
+# (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models),
+# each model may need specific prompting (and/or constrained output,
+# especially for models not fine-tuned for tool usage / function calling).
+class ToolsPromptStyle(Enum):
+    # Short prompt w/ <tools>schemas</tools>
+    TOOLS_SHORT = 1
+
+    # Longer prompt w/ <tools>schemas</tools>
+    TOOLS_LONG = 2
+
+    # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B
+    # Requires:
+    # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling
+    # - Set large context length as their prompts are super long
+    TOOLS_HERMES_2_PRO = 3
+
+    # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary
+    # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py
+    # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695
+    TYPESCRIPT_FUNCTIONARY_V2 = 4
+
+@typechecked
+def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> Message:
+
+    if chat_format.tool_style == ToolsPromptStyle.TOOLS_SHORT:
+        return Message(
+            role="system",
+            content='\n'.join([
+                'Here are the tools available:',
+                '<tools>',
+                *(json.dumps(tool.model_dump(), indent=indent) for tool in tools),
+                '</tools>',
+            ])
+        )
+
+    elif chat_format.tool_style == ToolsPromptStyle.TOOLS_LONG:
+        return Message(
+            role="system",
+            content='\n'.join([
+                '''You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.''',
+                '''You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:''',
+                '''<tools>''',
+                *(json.dumps(tool.model_dump(), indent=indent) for tool in tools),
+                '''</tools>''',
+                '',
+                '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''',
+                '',
+                '''For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:''',
+                '''<tool_call>''',
+                '''{"arguments": <args-dict>, "name": <function-name>}''',
+                '''</tool_call>''',
+            ])
+        )
+
+    elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2:
+        ts_converter = SchemaToTypeScriptConverter()
+
+        return Message(
+            role="system",
+            content='\n'.join([
+                '// Supported function definitions that should be called when necessary.'
+                'namespace functions {',
+                *[
+                    '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + ''
+                    'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n"
+                    for tool in tools
+                ],
+                '} // namespace functions',
+            ])
+        )
+
+    elif chat_format.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO:
+        # Hackily import https://github.com/NousResearch/Hermes-Function-Calling
+        path = str(Path(__file__).parent / "hermes_function_calling")
+        if path not in sys.path: sys.path.insert(0, path)
+        try:
+            from examples.openai.hermes_function_calling.prompter import PromptManager
+        except ImportError:
+            raise ImportError(f"Please `git clone https://github.com/NousResearch/Hermes-Function-Calling {path}`")
+
+        prompt = PromptManager().generate_prompt(user_prompt=[], tools=[json.dumps(tool) for tool in tools])
+        assert len(prompt) == 1 and prompt[0]["role"] == "system"
+        return Message(**prompt[0])
+
+    else:
+        raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}")
+
+@typechecked
+def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool:
+    return style in (
+        ToolsPromptStyle.TOOLS_SHORT,
+        ToolsPromptStyle.TOOLS_LONG,
+        ToolsPromptStyle.TOOLS_HERMES_2_PRO,
+    )
+
+@typechecked
+def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Optional[dict], indent=2) -> Tuple[Optional[str], Callable[[str], Optional[Message]]]:
+
+    converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False)
+
+    response_rule = converter.visit(response_schema, "response") if response_schema else None
+
+    delimiter = '<%$[SAMPLE]$%>'
+    empty_prompt = chat_format.render([], add_generation_prompt=True).strip()
+    planted_prompt = chat_format.render([{"role": "assistant", "content": delimiter}], add_generation_prompt=False).strip()
+    assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}"
+    [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter)
+
+    if tools:
+        if _outputs_tool_call_tags(chat_format.tool_style):
+            tool_rules = [
+                converter.visit(
+                    dict(
+                        type="object",
+                        properties=dict(
+                            name=dict(const=tool.function.name),
+                            arguments=tool.function.parameters,
+                        ),
+                        required=['name', 'arguments']
+                    ),
+                    f'{tool.function.name}-tool-call'
+                )
+                for tool in tools
+            ]
+
+            # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not)
+            # OR a tool-call message respecting the schema of any of the tools
+            converter._add_rule(
+                "root",
+                converter._format_literal(prefix) + " (" +
+                    (response_rule or converter.not_literal("<tool_call>")) + " | " +
+                    converter._format_literal("<tool_call>") + " (" +
+                    ' | '.join(tool_rules) +
+                    ") " + converter._format_literal("</tool_call>") +
+                ")") # + converter._format_literal(suffix))
+
+            @typechecked
+            def parse(s: str) -> Optional[Message]:
+                ls = s.lstrip()
+                if '<tool_call>'.startswith(ls) or ls.startswith('<tool_call>'):
+                    if ls.startswith('<tool_call>') and ls.endswith('</tool_call>' + suffix):
+                        tool_call = ls[len('<tool_call>'):-len('</tool_call>' + suffix)]
+                        return Message(role="assistant", content=None, tool_calls=[json.loads(tool_call)])
+                    return None
+                else:
+                    return Message(role="assistant", content=s)
+
+            return (converter.format_grammar(), parse)
+
+        elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2:
+            # Only allowing a single tool call at a time for now.
+            # Note that if there were more, they'd be separated by a '<|from|>assistant' literal
+            converter._add_rule(
+                "root",
+                converter._format_literal(prefix) + " (" +
+                    (response_rule or converter.not_literal("<|recipient|>")) + " | " +
+                    (' | '.join(
+                        converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " +
+                        converter.visit(tool.function.parameters, tool.function.name + '-args')
+                        for tool in tools
+                    )) +
+                    ") " +
+                ")") # + converter._format_literal(suffix))
+
+            @typechecked
+            def parse(s: str) -> Optional[Message]:
+                raise NotImplementedError(f'TODO: parse tool_style {chat_format.tool_style}: {s}')
+
+            return (converter.format_grammar(), parse)
+
+    elif response_schema:
+        converter._add_rule("root", response_rule + ' ' + converter._format_literal(suffix))
+
+        @typechecked
+        def parse(s: str) -> Optional[Message]:
+            if response_rule.endswith(suffix):
+                return Message(role="assistant", content=s[:-len(suffix)])
+
+        return (converter.format_grammar(), parse)
+
+    else:
+        converter._add_rule("root", converter._format_literal(prefix) + ' ' + converter._format_literal(suffix))
+
+        @typechecked
+        def parse(s: str) -> Optional[Message]:
+            if s.endswith(suffix):
+                return Message(role="assistant", content=s[:-len(suffix)])
+            return None
+
+        return (None, parse)
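As the comment block above explains, make_tools_prompt() produces a style-specific system message and make_grammar() returns a (grammar, parse) pair that first constrains and then decodes the model output. A rough sketch of the intended call pattern, mirroring how server.py below wires these together (the helper function here is illustrative, not part of the commit):

```py
# Illustrative sketch (not part of this commit): combining the helpers defined above,
# mirroring the call pattern used by examples/openai/server.py further down.
from typing import Callable, Optional, Tuple

from examples.openai.api import Message, Tool
from examples.openai.prompting import ChatFormat, make_grammar, make_tools_prompt

def build_constrained_request(
    chat_format: ChatFormat,
    messages: list[Message],
    tools: list[Tool],
) -> Tuple[str, Optional[str], Callable[[str], Optional[Message]]]:
    """Return (prompt, grammar, parse) for a tool-enabled chat completion."""
    if tools:
        # Prepend the style-specific tools system prompt (see make_tools_prompt above);
        # server.py merges it into any existing system message via _add_system_prompt.
        messages = [make_tools_prompt(chat_format, tools)] + messages
    grammar, parse = make_grammar(chat_format, tools, response_schema=None)
    prompt = chat_format.render([m.model_dump() for m in messages], add_generation_prompt=True)
    return prompt, grammar, parse
```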
@@ -1,7 +1,7 @@
 fastapi[all]
 gguf
 jinja2
-jsonargparse
 pydantic
 sse-starlette
 uvicorn[all]
+typer[all]
@@ -2,9 +2,9 @@
 #
 # Runs a Python script in a sandboxed environment and makes its functions available as a web service.
 #
-# git submodule add https://github.com/NousResearch/Hermes-Function-Calling examples/agents/hermes_function_calling
-# python examples/agents/fastify.py examples/agents/hermes_function_calling/functions.py
-# REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) examples/agents/run_sandboxed_tools.sh examples/agents/hermes_function_calling/functions.py -e LOG_FOLDER=/data/inference_logs
+# git submodule add https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling
+# python examples/openai/fastify.py examples/openai/hermes_function_calling/functions.py
+# REQUIREMENTS_FILE=<( cat examples/openai/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) examples/agents/run_sandboxed_tools.sh examples/agents/hermes_function_calling/functions.py -e LOG_FOLDER=/data/inference_logs
 set -euo pipefail
 
 script="$( realpath "$1" )"
@@ -1,35 +1,39 @@
+# https://gist.github.com/ochafik/a3d4a5b9e52390544b205f37fb5a0df3
+# pip install "fastapi[all]" "uvicorn[all]" sse-starlette jsonargparse jinja2 pydantic
+
 import json, sys, subprocess, atexit
 from pathlib import Path
 
-# sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from examples.openai.llama_cpp_server_api import LlamaCppServerCompletionRequest
-from examples.json_schema_to_grammar import SchemaConverter
-
-from typing import Optional
-import httpx
-from fastapi import Depends, FastAPI, Request, Response
-from starlette.responses import StreamingResponse
-from fastapi.responses import JSONResponse
-from jsonargparse import CLI
-
-from examples.openai.ts_converter import SchemaToTypeScriptConverter
 from examples.openai.gguf_kvs import GGUFKeyValues, Keys
-from examples.openai.api import Message, Tool, ToolFunction, ResponseFormat, ChatCompletionRequest
-from examples.openai.chat_format import ChatFormat, ToolStyle
+from examples.openai.api import Message, ChatCompletionRequest
+from examples.openai.prompting import ChatFormat, make_grammar, make_tools_prompt
 
-def _add_system_prompt(messages: list['Message'], system_prompt: str):
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+import httpx
+from starlette.responses import StreamingResponse
+from typing import Annotated, Optional
+import typer
+from typeguard import typechecked
+
+@typechecked
+def _add_system_prompt(messages: list[Message], system_prompt: Message) -> list[Message]:
+    assert system_prompt.role == "system"
     # TODO: add to last system message, or create a new one just before the last user message
     system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None)
     if system_message is not None:
         (i, m) = system_message
-        messages[i].content = m.content + '\n' + system_prompt
+        return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:]
     else:
-        messages.insert(0, Message(role="system", content=system_prompt))
-    return messages
+        return [Message(role="system", content=system_prompt)] + messages
 
 def main(
-    model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"),
+    model: Annotated[Optional[Path], typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf",
+    # model: Path = Path("/Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf"),
+    # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None,
     host: str = "localhost",
     port: int = 8080,
     main_server_endpoint: Optional[str] = None,
@@ -48,7 +52,7 @@ def main(
         "./server", "-m", model,
         "--host", main_server_host, "--port", f'{main_server_port}',
         '-ctk', 'q4_0', '-ctv', 'f16',
-        "-c", f"8192",
+        "-c", f"{2*8192}",
         # "-c", f"{context_length}",
     ])
     atexit.register(server_process.kill)
@@ -70,110 +74,10 @@ def main(
         response_schema = None
 
         messages = chat_request.messages
-        parser=None
-        grammar=None
-
-        converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False)
-
-        response_rule = converter.visit(response_schema, "response") if response_schema else None
-
-        delimiter = '<%$[SAMPLE]$%>'
-        empty_prompt = chat_format.render([], add_generation_prompt=True)
-        planted_prompt = chat_format.render([{"role": "assistant", "content": delimiter}], add_generation_prompt=False)
-        assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}"
-        [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter)
-
         if chat_request.tools:
-            if chat_format.tool_style in (ToolStyle.DEFAULT, ToolStyle.NOUS_RESEARCH_HERMES):
-                messages = _add_system_prompt(messages, '\n'.join([
-                    'Here are the tools available:',
-                    '<tools>',
-                    *(tool.model_dump_json() for tool in chat_request.tools),
-                    '</tools>',
-                ]))
-
-                tool_rules = [
-                    converter.visit(
-                        dict(
-                            type="object",
-                            properties=dict(
-                                name=dict(const=tool.function.name),
-                                arguments=tool.function.parameters,
-                            ),
-                            required=['name', 'arguments']
-                        ),
-                        f'{tool.function.name}-tool-call'
-                    )
-                    for tool in chat_request.tools
-                ]
-
-                # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not)
-                # OR a tool-call message respecting the schema of any of the tools
-                converter._add_rule(
-                    "root",
-                    converter._format_literal(prefix) + " (" +
-                        (response_rule or converter.not_literal("<tool_call>")) + " | " +
-                        converter._format_literal("<tool_call>") + " (" +
-                        ' | '.join(tool_rules) +
-                        ") " + converter._format_literal("</tool_call>") +
-                    ") " + converter._format_literal(suffix))
-                grammar = converter.format_grammar()
-
-                def parse(s: str):
-                    if '<tool_call>'.startswith(s):
-                        if s.startswith('<tool_call>') and s.endswith('</tool_call>' + suffix):
-                            s = s[len('<tool_call>'):-len('</tool_call>' + suffix)]
-                            return {"role": "assistant", "tool_calls": [json.loads(s)]}
-                        return None
-                    else:
-                        return {"role": "assistant", "content": s}
-
-                parser = parse
-
-            elif chat_format.tool_style == ToolStyle.FUNCTIONARY_V2:
-
-                ts_converter = SchemaToTypeScriptConverter()
-
-                messages = _add_system_prompt(messages, '\n'.join([
-                    '// Supported function definitions that should be called when necessary.'
-                    'namespace functions {',
-                    *[
-                        '// ' + tool.function.description.replace('\n', '\n// ') + '\n' + ''
-                        'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n"
-                        for tool in chat_request.tools
-                    ],
-                    '} // namespace functions',
-                ]))
-
-                # Only allowing a single tool call at a time for now.
-                # Note that if there were more, they'd be separated by a '<|from|>assistant' literal
-                converter._add_rule(
-                    "root",
-                    converter._format_literal(prefix) + " (" +
-                        (response_rule or converter.not_literal("<|recipient|>")) + " | " +
-                        (' | '.join(
-                            converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " +
-                            converter.visit(tool.function.parameters, tool.function.name + '-args')
-                            for tool in chat_request.tools
-                        )) +
-                        ") " +
-                    ") " + converter._format_literal(suffix))
-                grammar = converter.format_grammar()
-            else:
-                raise NotImplementedError(f'Unsupported tool_style: {chat_format.tool_style}')
-
-        elif response_schema:
-            converter._add_rule('root', response_rule)
-            grammar = converter.format_grammar()
-
-            def parse(s):
-                if s.endswith(suffix):
-                    s = s[:-len(suffix)]
-                    return {"role": "assistant", "content": s}
-                return None
-
-            parser = parse
+            messages = _add_system_prompt(messages, make_tools_prompt(chat_format, chat_request.tools))
+
+        (grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema)
 
         if chat_format.strict_user_assistant_alternation:
            print("TODO: merge system messages into user messages")
@@ -182,11 +86,9 @@ def main(
         # TODO: Test whether the template supports formatting tool_calls
 
         prompt = chat_format.render(messages, add_generation_prompt=True)
-        # print(prompt)
-        # print(grammar)
         print(json.dumps(dict(
-            prompt=prompt,
             stream=chat_request.stream,
+            prompt=prompt,
             grammar=grammar,
         ), indent=2))
         async with httpx.AsyncClient() as client:
@@ -195,14 +97,23 @@ def main(
                 json=LlamaCppServerCompletionRequest(
                     prompt=prompt,
                     stream=chat_request.stream,
-                    n_predict=100,
+                    n_predict=300,
                     grammar=grammar,
                 ).model_dump(),
                 headers=headers,
                 timeout=None)
 
-            return StreamingResponse(generate_chunks(response), media_type="text/event-stream") if chat_request.stream \
-                else JSONResponse(response.json())
+            if chat_request.stream:
+                # TODO: Remove suffix from streamed response using partial parser.
+                assert not chat_request.tools and not chat_request.response_format, "Streaming not supported yet with tools or response_format"
+                return StreamingResponse(generate_chunks(response), media_type="text/event-stream")
+            else:
+                result = response.json()
+                print(json.dumps(result, indent=2))
+                message = parser(result["content"])
+                assert message is not None, f"Failed to parse response: {response.text}"
+                return JSONResponse(message.model_dump())
+                # return JSONResponse(response.json())
 
     async def generate_chunks(response):
         async for chunk in response.aiter_bytes():
@@ -211,5 +122,4 @@ def main(
     uvicorn.run(app, host=host, port=port)
 
-
 if __name__ == "__main__":
-    CLI(main)
+    typer.run(main)
examples/openai/test.sh (new executable file) | 79 lines
@@ -0,0 +1,79 @@
+#!/bin/bash
+set -euo pipefail
+
+SERVER_PID=""
+function cleanup() {
+  if [ -n "$SERVER_PID" ]; then
+    echo "# Killing server"
+    kill $SERVER_PID
+    wait $SERVER_PID
+  fi
+}
+trap cleanup EXIT
+
+echo "# Starting the server"
+python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf &
+SERVER_PID=$!
+
+sleep 5
+
+echo "# Send a message to the chat API"
+
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -d '{
+    "model": "gpt-3.5-turbo",
+    "tools": [{
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The city and state, e.g. San Francisco, CA"
+            },
+            "format": {
+              "type": "string",
+              "enum": ["celsius", "fahrenheit"],
+              "description": "The temperature unit to use. Infer this from the users location."
+            }
+          },
+          "required": ["location", "format"]
+        }
+      }
+    }, {
+      "type": "function",
+      "function": {
+        "name": "get_n_day_weather_forecast",
+        "description": "Get an N-day weather forecast",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The city and state, e.g. San Francisco, CA"
+            },
+            "format": {
+              "type": "string",
+              "enum": ["celsius", "fahrenheit"],
+              "description": "The temperature unit to use. Infer this from the users location."
+            },
+            "num_days": {
+              "type": "integer",
+              "description": "The number of days to forecast"
+            }
+          },
+          "required": ["location", "format", "num_days"]
+        }
+      }
+    }],
+    "messages": [
+      {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."},
+      {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days. Give the temperature in celsius for both locations."}
+    ]
+  }'
+
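Given the parse() logic in prompting.py and the Message model in api.py, a successful tool-calling run of this script should return a body shaped roughly like the following (illustrative values only, not captured output):

```py
# Illustrative only: server.py's non-streaming branch returns
# parse(result["content"]).model_dump() as JSON, so a successful tool call
# should come back shaped roughly like this (values are made up):
expected_shape = {
    "role": "assistant",
    "content": None,
    "tool_calls": [
        {
            "name": "get_n_day_weather_forecast",
            "arguments": {"location": "San Francisco, CA", "format": "celsius", "num_days": 4},
        }
    ],
}
```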