server : refactor slot input data, move tokenizer to HTTP thread (#10023)
* server : refactor slot input data, move tokenizer to HTTP thread
* move prompt_tokens.empty() check
* fix incorrect if branch
* fix infinite generation loop
* bring back infill validation
* add infill test
* try fixing format_infill
* fix test
* remove redundant code
* rename completion to inference
* update docs
* use llama_tokens everywhere
This commit is contained in:
parent 40f2555797
commit 958367bf53
5 changed files with 468 additions and 348 deletions
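
The new test below drives the server's /infill endpoint. As a quick orientation, here is a minimal sketch of an equivalent request built from the same fields the test payload uses (prompt, input_prefix, input_suffix, optional input_extra, n_predict, seed, temperature); the requests library and the localhost:8080 address are illustrative assumptions, not part of this commit:

# Sketch only: mirrors the fields the new behave step sends to /infill.
# Assumes a llama.cpp server with a FIM-capable model listening on localhost:8080.
import requests

payload = {
    "prompt": "Complete this",
    "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
    "input_suffix": "}\n",
    # optional extra context; omit the key entirely when there is none
    "input_extra": [{"filename": "llama.h", "text": "LLAMA_API int32_t llama_n_threads();\n"}],
    "n_predict": 64,
    "seed": 42,
    "temperature": 0.0,
}

response = requests.post("http://localhost:8080/infill", json=payload, timeout=60)
response.raise_for_status()
print(response.json())  # the completion is returned as JSON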

examples/server/tests/features/infill.feature (new file, 36 lines)
@@ -0,0 +1,36 @@
@llama.cpp
@infill
Feature: llama.cpp server

  # The current model is made by adding FIM tokens to the existing stories260K
  # We may want to use a better model in the future, maybe something like SmolLM 360M

  Background: Server startup
    Given a server listening on localhost:8080
    And   a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
    And   a model file test-model-infill.gguf
    And   a model alias tinyllama-infill
    And   42 as server seed
    And   1024 as batch size
    And   1024 as ubatch size
    And   2048 KV cache size
    And   64 max tokens to predict
    And   0.0 temperature
    Then  the server is starting
    Then  the server is healthy

  Scenario: Infill without input_extra
    Given a prompt "Complete this"
    And   an infill input extra none none
    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
    And   an infill input suffix "}\n"
    And   an infill request with no api error
    Then  64 tokens are predicted matching One|day|she|saw|big|scary|bird

  Scenario: Infill with input_extra
    Given a prompt "Complete this"
    And   an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
    And   an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
    And   an infill input suffix "}\n"
    And   an infill request with no api error
    Then  64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"

@@ -80,6 +80,11 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.lora_file = None
    context.disable_ctx_shift = False

    # infill
    context.infill_input_extra = None
    context.infill_input_suffix = ''
    context.infill_input_prefix = ''

    context.tasks_result = []
    context.concurrent_tasks = []
    context.prompts = []

@@ -291,6 +296,28 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
    assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"


@step('an infill request with {api_error} api error')
@async_run_until_complete
async def step_request_completion(context, api_error: Literal['raised'] | str):
    if api_error != 'no':
        raise ValueError(f'api_error={api_error} is not yet implemented')
    payload = {
        "prompt": context.prompts[0],
        "input_suffix": context.infill_input_suffix,
        "input_prefix": context.infill_input_prefix,
        "n_predict": context.n_predict,
        "seed": context.seed,
        "temperature": context.temperature,
    }
    if context.infill_input_extra is not None:
        payload['input_extra'] = context.infill_input_extra
    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{context.base_url}/infill',
                                json=payload) as response:
            assert response.status == 200
            context.tasks_result = [await response.json()]


@step('{predicted_n:d} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
    context.completion = context.tasks_result.pop()

@@ -539,6 +566,25 @@ def step_a_prompt_prompt(context, prompt):
    context.n_prompts = len(context.prompts)


# TODO: allow this to be repeated
@step('an infill input extra {filename} {text}')
def step_infill_input_extra(context, filename, text):
    if filename == 'none':
        context.infill_input_extra = None
    else:
        context.infill_input_extra = [{'filename': filename, 'text': text}]


@step('an infill input suffix {text}')
def step_infill_input_suffix(context, text):
    context.infill_input_suffix = text


@step('an infill input prefix {text}')
def step_infill_input_prefix(context, text):
    context.infill_input_prefix = text


@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
def step_many_prompts(context, num_prompts, prompt, seed):
    if context.seed is None: