diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py
index ea6572d7b..d8ecd5174 100644
--- a/examples/openai/prompting.py
+++ b/examples/openai/prompting.py
@@ -128,10 +128,12 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M
'',
'''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''',
'',
- '''For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:''',
+ # '''For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:''',
+ '''To call each function, give its name and arguments within <tool_call></tool_call> XML tags as follows:''',
'''<tool_call>''',
'''{"arguments": <args-dict>, "name": <function-name>}''',
'''</tool_call>''',
+ '''This is not hypothetical; you're not being asked what you would do. If you need a tool called, just call it.''',
])
)
@@ -201,17 +203,21 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op
if s.endswith(suffix):
return s[:-len(suffix)]
else:
- print(f"Expected suffix ({suffix}) not found: {s}")
+ sys.stderr.write(f"Expected suffix ({suffix}) not found: {s}\n")
return s
if tools:
if _outputs_tool_call_tags(chat_format.tool_style):
+
+ escapes_underscores = chat_format.tool_style != ToolsPromptStyle.TOOLS_HERMES_2_PRO
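+ # Mixtral 8x7B Instruct sometimes escapes underscores in tool names as "\_" (see parse below),
+ # so for styles other than Hermes 2 Pro we also accept an optional backslash before each "_".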
+
tool_rules = [
converter.visit(
dict(
type="object",
properties=dict(
- name=dict(const=tool.function.name),
+ name=dict(type="string", pattern='^' + tool.function.name.replace('_', f'\\?_') + '$') if escapes_underscores \
+ else dict(const=tool.function.name),
arguments=tool.function.parameters,
),
required=['name', 'arguments']
@@ -221,22 +227,45 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op
for tool in tools
]
- # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not)
- # OR a tool-call message respecting the schema of any of the tools
+ def format_literal(s: str) -> str:
+ if escapes_underscores:
+ return ' "\\\\"? "_" '.join((converter._format_literal(part) for part in s.split('_')))
+ else:
+ return converter._format_literal(s)
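+
+ # e.g. with escaping enabled, format_literal("tool_call") produces the GBNF sequence
+ # "tool" "\\"? "_" "call", which matches both `tool_call` and `tool\_call`.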
+
+ tool_call_rule = converter._add_rule(
+ 'tool_call',
+ format_literal("") + " (" +
+ ' | '.join(tool_rules) +
+ ") " + format_literal(""))
+
+ # Ideally we'd want a negative lookahead of /<tool\?_call>/, but it's just too hard to express in GBNF for now.
+ # So we just over-constrain the content rule to not contain literals dangerously getting close to <tool_call>
+ content_rule = converter._add_rule('content', '[^<] | "<" [^t<]? | "<t" [^o<]?')
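+ # (i.e. any char other than '<', or '<' not followed by 't', or '<t' not followed by 'o')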
converter._add_rule(
- "root",
- converter._format_literal(prefix) + " (" +
- (response_rule or converter.not_literal("<tool_call>")) + " | " +
- converter._format_literal("<tool_call>") + " (" +
- ' | '.join(tool_rules) +
- ") " + converter._format_literal("</tool_call>") +
- ")") # + converter._format_literal(suffix))
+ 'root',
+ f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?')
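+ # i.e. free-form text, then optionally one or more <tool_call> blocks followed by more text.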
+
+ # # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not)
+ # # OR a tool-call message respecting the schema of any of the tools
+ # converter._add_rule(
+ # "root",
+ # converter._format_literal(prefix) + " (" +
+ # (response_rule or converter.not_literal("<tool_call>")) + " | " +
+ # converter._format_literal("<tool_call>") + " (" +
+ # ' | '.join(tool_rules) +
+ # ") " + converter._format_literal("</tool_call>") +
+ # ")") # + converter._format_literal(suffix))
@typechecked
def parse(s: str) -> Optional[Message]:
s = strip_suffix(s)
- # ls = s.lstrip()
+ if r'<tool\_call>' in s:
+ # Some weird escaping of underscores is happening w/ Mixtral 8x7B Instruct
+ s = s.replace(r'\_', '_')
+
parts = _tool_call_re.split(s)
if len(parts) == 1:
return Message(role="assistant", content=s)
@@ -247,13 +276,17 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op
if i % 2 == 0:
content.append(part)
else:
+ try:
+ fc = json.loads(part)
+ except json.JSONDecodeError:
+ raise ValueError(f'Failed to parse tool call as JSON: {part}\nFull string: {s}')
tool_calls.append(
ToolCall(
id=gen_callid(),
- function=FunctionCall(**json.loads(part))))
+ function=FunctionCall(**fc)))
- content = ''.join(content).strip()
- return Message(role="assistant", content=None if content == '' else content, tool_calls=tool_calls)
+ content = '(...)'.join(content).strip()
+ return Message(role="assistant", content=content if content else None, tool_calls=tool_calls)
# if '<tool_call>'.startswith(ls) or ls.startswith('<tool_call>'):
# if ls.startswith('<tool_call>') and ls.endswith('</tool_call>' + suffix):
@@ -268,17 +301,54 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op
elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2:
# Only allowing a single tool call at a time for now.
# Note that if there were more, they'd be separated by a '<|from|>assistant' literal
+
+ tool_rules = [
+ converter._add_rule(
+ tool.function.name + '-call',
+ converter._format_literal(tool.function.name) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' +
+ converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' +
+ converter._format_literal('\n'))
+ # converter.visit(
+ # dict(
+ # type="object",
+ # properties=dict(
+ # name=dict(const=tool.function.name),
+ # arguments=tool.function.parameters,
+ # ),
+ # required=['name', 'arguments']
+ # ),
+ # f'{tool.function.name}-tool-call'
+ # )
+ for i, tool in enumerate(tools)
+ ]
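+
+ # Each tool rule matches what follows Functionary v2's `<|recipient|>` tag,
+ # e.g. `get_current_weather\n<|content|>\n{"location": "Glasgow"}\n`.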
+
+ not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>"))
+ content_without_start_rule = converter._add_rule('content_without_start', converter._format_literal("all\n<|content|>") + ' ' + not_from_rule + '*')
+ start_rule = converter._add_rule('start', converter._format_literal('<|from|>assistant\n<|recipient|>'))
+ content_rule = converter._add_rule('content', start_rule + ' ' + content_without_start_rule)
+ tool_call_without_start_rule = converter._add_rule(
+ 'tool_call_without_start',
+ ' | '.join(tool_rules))
+ # + ' ' +
+ # converter.not_literal("all", dotall=False) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + not_from_rule + '*')
+ tool_call_rule = converter._add_rule('tool_call', f'{start_rule} {tool_call_without_start_rule}')
+ # converter._add_rule('root', f'({content_without_start_rule} ({content_rule})* ({tool_call_rule}+ {content_rule}*)? | {tool_call_without_start_rule} (* {tool_call_rule}{content_rule}*')
converter._add_rule(
- "root",
- converter._format_literal(prefix) + " (" +
- (response_rule or converter.not_literal("<|recipient|>")) + " | " +
- (' | '.join(
- converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " +
- converter.visit(tool.function.parameters, tool.function.name + '-args')
- for tool in tools
- )) +
- ") " +
- ")") # + converter._format_literal(suffix))
+ 'root',
+ f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | '
+ f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*')
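+ # i.e. the message either starts with free text (recipient `all`), optionally followed by tool calls,
+ # or starts directly with a tool call; trailing content sections are allowed in both cases.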
+
+ # converter._add_rule(
+ # "root",
+ # converter._format_literal(prefix) + " (" +
+ # (response_rule or converter.not_literal("<|recipient|>")) + " | " +
+ # (' | '.join(
+ # converter._format_literal(f"<|recipient|>{tool.function.name}\n<|content|>") + " " +
+ # converter.visit(tool.function.parameters, tool.function.name + '-args')
+ # for tool in tools
+ # )) +
+ # ") " +
+ # ")") # + converter._format_literal(suffix))
@typechecked
def parse(s: str) -> Optional[Message]:
@@ -297,17 +367,25 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op
if recipient == 'all':
text_content.append(content)
else:
+ try:
+ arguments = json.loads(content)
+ except json.JSONDecodeError:
+ raise ValueError(f'Failed to parse tool call content as JSON: {content}')
tool_calls.append(
ToolCall(
id=gen_callid(),
- function=FunctionCall(name=recipient, arguments=json.loads(content))))
+ function=FunctionCall(name=recipient, arguments=arguments)))
+
- assert parts[-1].strip() == '', f'Unexpected content after tool calls: {parts[-1]}'
+ assert parts[-1].strip() in ('', '<|stop|>'), f'Unexpected content after tool calls: {parts[-1]}\nFull string: {s}'
content = '\n'.join(text_content).strip()
- return Message(role="assistant", content=None if content == '' else content, tool_calls=tool_calls if tool_calls else None)
+ return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None)
return (converter.format_grammar(), parse)
+
+ else:
+ raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}")
elif response_schema:
converter._add_rule("root", response_rule + ' ' + converter._format_literal(suffix))
diff --git a/examples/openai/server.py b/examples/openai/server.py
index 8635da9e5..349ad6c7d 100644
--- a/examples/openai/server.py
+++ b/examples/openai/server.py
@@ -30,27 +30,28 @@ def main(
# model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None,
host: str = "localhost",
port: int = 8080,
- main_server_endpoint: Optional[str] = None,
- main_server_host: str = "localhost",
- main_server_port: Optional[int] = 8081,
+ cpp_server_endpoint: Optional[str] = None,
+ cpp_server_host: str = "localhost",
+ cpp_server_port: Optional[int] = 8081,
):
import uvicorn
metadata = GGUFKeyValues(model)
context_length = metadata[Keys.LLM.CONTEXT_LENGTH]
chat_format = ChatFormat.from_gguf(metadata)
- print(chat_format)
+ # print(chat_format)
- if not main_server_endpoint:
+ if not cpp_server_endpoint:
+ sys.stderr.write(f"# Starting C++ server with model {model} on {cpp_server_host}:{cpp_server_port}\n")
server_process = subprocess.Popen([
"./server", "-m", model,
- "--host", main_server_host, "--port", f'{main_server_port}',
+ "--host", cpp_server_host, "--port", f'{cpp_server_port}',
'-ctk', 'q4_0', '-ctv', 'f16',
"-c", f"{2*8192}",
# "-c", f"{context_length}",
- ])
+ ], stdout=sys.stderr)
atexit.register(server_process.kill)
- main_server_endpoint = f"http://{main_server_host}:{main_server_port}"
+ cpp_server_endpoint = f"http://{cpp_server_host}:{cpp_server_port}"
app = FastAPI()
@@ -74,21 +75,17 @@ def main(
(grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema)
# TODO: Test whether the template supports formatting tool_calls
-
+ sys.stderr.write(f'\n{grammar}\n\n')
+
prompt = chat_format.render(messages, add_generation_prompt=True)
- print(json.dumps(dict(
- stream=chat_request.stream,
- prompt=prompt,
- # grammar=grammar,
- ), indent=2))
async with httpx.AsyncClient() as client:
response = await client.post(
- f"{main_server_endpoint}/completions",
+ f"{cpp_server_endpoint}/completions",
json=LlamaCppServerCompletionRequest(
prompt=prompt,
stream=chat_request.stream,
n_predict=300,
- # grammar=grammar,
+ grammar=grammar,
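+ # the GBNF grammar above constrains the C++ server's sampling to valid outputs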
).model_dump(),
headers=headers,
timeout=None)
@@ -103,7 +100,7 @@ def main(
# print(json.dumps(result, indent=2))
return JSONResponse(result)
- print(json.dumps(result, indent=2))
+ sys.stderr.write(json.dumps(result, indent=2) + "\n")
# print(json.dumps(result.get('content'), indent=2))
message = parser(result["content"])
assert message is not None, f"Failed to parse response:\n{response.text}\n\n"
@@ -118,7 +115,6 @@ def main(
choices=[Choice(
index=0,
message=message,
-
finish_reason="stop" if message.tool_calls is None else "tool_calls",
)],
usage=Usage(
diff --git a/examples/openai/test.sh b/examples/openai/test.sh
index 7dcc93e45..3f5d38cd1 100755
--- a/examples/openai/test.sh
+++ b/examples/openai/test.sh
@@ -4,23 +4,60 @@ set -euo pipefail
SERVER_PID=""
function cleanup() {
if [ -n "$SERVER_PID" ]; then
- echo "# Killing server"
+ echo "# Killing server" >&2
kill $SERVER_PID
wait $SERVER_PID
fi
}
trap cleanup EXIT
-echo "# Starting the server"
+echo "# Starting the server" >&2
-python -m examples.openai --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf &
-# python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf &
-# python -m examples.openai --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf &
+args=(
+ # --cpp_server_endpoint "http://localhost:8081"
+
+ # --model ~/AI/Models/functionary-medium-v2.2.q4_0.gguf
+
+ # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q8_0.gguf
+ # --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf
+
+ # --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf
+ --model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
+)
+python -m examples.openai "${args[@]}" &
SERVER_PID=$!
sleep 5
-echo "# Send a message to the chat API"
+echo "# Send a message to the chat API" >&2
+
+# curl http://localhost:8080/v1/chat/completions \
+# -H "Content-Type: application/json" \
+# -H "Authorization: Bearer $OPENAI_API_KEY" \
+# -d '{
+# "model": "gpt-3.5-turbo",
+# "tools": [{
+# "type": "function",
+# "function": {
+# "name": "get_current_weather",
+# "description": "Get the current weather",
+# "parameters": {
+# "type": "object",
+# "properties": {
+# "location": {
+# "type": "string",
+# "description": "The city and state, e.g. San Francisco, CA"
+# }
+# },
+# "required": ["location"]
+# }
+# }
+# }],
+# "messages": [
+# {"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."}
+# ]
+# }' | \
+# jq .
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
@@ -77,6 +114,7 @@ curl http://localhost:8080/v1/chat/completions \
"messages": [
{"role": "user", "content": "I live in the UK. what is the weather going to be like in San Francisco and Glasgow over the next 4 days."}
]
- }'
+ }' | \
+ jq .
# {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."},