From d9e29036aabcf216ed8117f06c2bce4890c54709 Mon Sep 17 00:00:00 2001 From: Joan Martinez Date: Tue, 7 May 2024 17:22:58 +0200 Subject: [PATCH] test: add a new step test --- examples/server/tests/features/embeddings.feature | 7 +++++++ examples/server/tests/features/server.feature | 2 +- examples/server/tests/features/steps/steps.py | 9 +++++---- unicode.cpp | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 6f163ce04..7057fc5aa 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -23,6 +23,13 @@ Feature: llama.cpp server """ Then embeddings are generated + Scenario: Tokenize / Detokenize complex + When tokenizing: + """ + España is a èspciâl café über naïve résumé cañón élite cañas Barça 例子 東京 こんにちは 你好 中国 + """ + Then tokens can be detokenize and is equivalent False + Scenario: OAI Embeddings compatibility Given a model bert-bge-small When an OAI compatible embeddings computation request for: diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 646a4e49d..04d3c839c 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -91,7 +91,7 @@ Feature: llama.cpp server """ What is the capital of France ? """ - Then tokens can be detokenize + Then tokens can be detokenize and is equivalent True Scenario: Models available Given available models diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index b8dbef21d..22ff7ebca 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -670,9 +670,10 @@ async def step_tokenize(context): context.tokens = tokenize_json['tokens'] -@step('tokens can be detokenize') +@step('tokens can be detokenize and is equivalent {equivalent}') @async_run_until_complete -async def step_detokenize(context): +async def step_detokenize(context, equivalent): + equivalent = equivalent == 'True' assert len(context.tokens) > 0 async with aiohttp.ClientSession() as session: async with session.post(f'{context.base_url}/detokenize', @@ -682,8 +683,8 @@ async def step_detokenize(context): assert response.status == 200 detokenize_json = await response.json() # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15 - assert context.tokenized_text == detokenize_json['content'].strip() - + if equivalent: + assert context.tokenized_text == detokenize_json['content'].strip() @step('an OPTIONS request is sent from {origin}') @async_run_until_complete diff --git a/unicode.cpp b/unicode.cpp index 19587b3b0..d141cc1cc 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -492,7 +492,7 @@ std::vector sort_by_canonical_class(std::vector & cpts) { std::vector canonical_decomposition_cpts(std::vector & cpts, uint32_t starting_offset) { std::vector result; for (auto i = starting_offset; i < cpts.size(); i++) { - auto it = unicode_map_nfd.equal_range(cpts[i]); + const auto& it = unicode_map_nfd.equal_range(cpts[i]); if (it.first != it.second) { uint offset = 0; for (auto jt = it.first; jt != it.second; jt++) {