From 48baa61eccdca9205daf8d620ba28055c2347b64 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 2 Sep 2024 22:08:38 +0200 Subject: [PATCH 1/2] server : test script : add timeout for all requests (#9282) --- .../server/tests/features/parallel.feature | 4 +-- examples/server/tests/features/steps/steps.py | 36 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 96a96d6f8..6cd306a2b 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -52,8 +52,8 @@ Feature: Parallel Then all prompts are predicted with tokens Examples: | streaming | n_predict | - | disabled | 200 | - | enabled | 200 | + | disabled | 128 | + | enabled | 64 | Scenario Outline: Multi users OAI completions compatibility no v1 Given a system prompt You are a writer. diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 1864a694f..18daad476 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -23,6 +23,8 @@ from prometheus_client import parser # pyright: reportRedeclaration=false +DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600) + @step("a server listening on {server_fqdn}:{server_port}") def step_server_config(context, server_fqdn: str, server_port: str): context.server_fqdn = server_fqdn @@ -689,7 +691,7 @@ def step_tokenize_set_add_special(context): @async_run_until_complete async def step_tokenize(context): context.tokenized_text = context_text(context) - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: tokenize_args = { "content": context.tokenized_text, } @@ -706,7 +708,7 @@ async def step_tokenize(context): @async_run_until_complete async def step_detokenize(context): assert len(context.tokens) > 0 - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{context.base_url}/detokenize', json={ "tokens": context.tokens, @@ -735,7 +737,7 @@ def step_strings_for_tokenization(context): @step('an OPTIONS request is sent from {origin}') @async_run_until_complete async def step_options_request(context, origin): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin} async with session.options(f'{context.base_url}/v1/chat/completions', headers=headers) as response: @@ -751,7 +753,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value): @step('prometheus metrics are exposed') @async_run_until_complete async def step_prometheus_metrics_exported(context): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with await session.get(f'{context.base_url}/metrics') as metrics_response: assert metrics_response.status == 200 assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" @@ -824,7 +826,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs): @step('the slot {slot_id:d} is saved with filename "{filename}"') @async_run_until_complete async def step_save_slot(context, slot_id, filename): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{context.base_url}/slots/{slot_id}?action=save', json={"filename": filename}, headers={"Content-Type": "application/json"}) as response: @@ -834,7 +836,7 @@ async def step_save_slot(context, slot_id, filename): @step('the slot {slot_id:d} is restored with filename "{filename}"') @async_run_until_complete async def step_restore_slot(context, slot_id, filename): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore', json={"filename": filename}, headers={"Content-Type": "application/json"}) as response: @@ -844,7 +846,7 @@ async def step_restore_slot(context, slot_id, filename): @step('the slot {slot_id:d} is erased') @async_run_until_complete async def step_erase_slot(context, slot_id): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase', headers={"Content-Type": "application/json"}) as response: context.response = response @@ -853,7 +855,7 @@ async def step_erase_slot(context, slot_id): @step('switch {on_or_off} lora adapter {lora_id:d}') @async_run_until_complete async def toggle_lora_adapter(context, on_or_off: str, lora_id: int): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{context.base_url}/lora-adapters', json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}], headers={"Content-Type": "application/json"}) as response: @@ -889,7 +891,7 @@ async def request_completion(prompt, print(f"Set user_api_key: {user_api_key}") headers['Authorization'] = f'Bearer {user_api_key}' - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{base_url}/completion', json={ "input_prefix": prompt_prefix, @@ -902,8 +904,7 @@ async def request_completion(prompt, "temperature": temperature if temperature is not None else 0.8, "n_probs": 2, }, - headers=headers, - timeout=3600) as response: + headers=headers) as response: if expect_api_error is None or not expect_api_error: assert response.status == 200 assert response.headers['Access-Control-Allow-Origin'] == origin @@ -961,7 +962,7 @@ async def oai_chat_completions(user_prompt, if async_client: origin = 'llama.cpp' headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{base_url}{base_path}', json=payload, headers=headers) as response: @@ -1048,7 +1049,7 @@ async def oai_chat_completions(user_prompt, async def request_embedding(content, seed, base_url=None) -> list[list[float]]: - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{base_url}/embedding', json={ "content": content, @@ -1068,14 +1069,13 @@ async def request_oai_embeddings(input, seed, headers=[] if user_api_key is not None: headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with session.post(f'{base_url}/v1/embeddings', json={ "input": input, "model": model, }, - headers=headers, - timeout=3600) as response: + headers=headers) as response: assert response.status == 200, f"received status code not expected: {response.status}" assert response.headers['Access-Control-Allow-Origin'] == origin assert response.headers['Content-Type'] == "application/json; charset=utf-8" @@ -1194,7 +1194,7 @@ async def wait_for_slots_status(context, if 'GITHUB_ACTIONS' in os.environ: timeout *= 2 - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: while True: async with await session.get(f'{base_url}/slots', params=params) as slots_response: status_code = slots_response.status @@ -1237,7 +1237,7 @@ def assert_embeddings(embeddings): async def request_slots_status(context, expected_slots): - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: async with await session.get(f'{context.base_url}/slots') as slots_response: assert slots_response.status == 200 slots = await slots_response.json() From b69a480af4db8ffd6cbb2efe7ed8132bc7d39073 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Sep 2024 10:00:36 +0300 Subject: [PATCH 2/2] readme : refactor API section + remove old hot topics --- README.md | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index bb2b93a35..e30ab0c8c 100644 --- a/README.md +++ b/README.md @@ -10,32 +10,14 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ -> [!IMPORTANT] -[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809) - ## Recent API changes -- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006 -- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 -- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341 -- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 -- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 -- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 -- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 -- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 +- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) +- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) ## Hot topics -- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430 -- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021 -- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920 -- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387 -- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404 -- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225 -- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017 -- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 -- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 -- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 +- *add hot topics here* ----