test: add a new step test
parent eb5a0e162a
commit d9e29036aa
4 changed files with 14 additions and 6 deletions
@@ -23,6 +23,13 @@ Feature: llama.cpp server
       """
     Then embeddings are generated
 
+  Scenario: Tokenize / Detokenize complex
+    When tokenizing:
+      """
+        España is a èspciâl café über naïve résumé cañón élite cañas Barça 例子 東京 こんにちは 你好 中国
+      """
+    Then tokens can be detokenize and is equivalent False
+
   Scenario: OAI Embeddings compatibility
     Given a model bert-bge-small
     When an OAI compatible embeddings computation request for:
@@ -91,7 +91,7 @@ Feature: llama.cpp server
       """
         What is the capital of France ?
       """
-    Then tokens can be detokenize
+    Then tokens can be detokenize and is equivalent True
 
   Scenario: Models available
     Given available models
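A note on the renamed step: behave's default matcher (built on the parse package) captures `{equivalent}` from the step text as a plain string, never a bool, which is why the step implementation below compares it against `'True'`. A minimal, standalone sketch of that capture behaviour:

# Sketch only: shows how behave-style step patterns capture arguments.
# Requires the `parse` package, which behave's default step matcher builds on.
from parse import parse

result = parse('tokens can be detokenize and is equivalent {equivalent}',
               'tokens can be detokenize and is equivalent False')
assert result['equivalent'] == 'False'            # captured as a string, not a bool
assert (result['equivalent'] == 'True') is False  # the conversion done in the step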
@@ -670,9 +670,10 @@ async def step_tokenize(context):
     context.tokens = tokenize_json['tokens']
 
 
-@step('tokens can be detokenize')
+@step('tokens can be detokenize and is equivalent {equivalent}')
 @async_run_until_complete
-async def step_detokenize(context):
+async def step_detokenize(context, equivalent):
+    equivalent = equivalent == 'True'
     assert len(context.tokens) > 0
     async with aiohttp.ClientSession() as session:
         async with session.post(f'{context.base_url}/detokenize',
@@ -682,8 +683,8 @@ async def step_detokenize(context):
             assert response.status == 200
             detokenize_json = await response.json()
             # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
-            assert context.tokenized_text == detokenize_json['content'].strip()
+            if equivalent:
+                assert context.tokenized_text == detokenize_json['content'].strip()
 
 
 @step('an OPTIONS request is sent from {origin}')
 @async_run_until_complete
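Taken together, the two Python hunks make the round-trip assertion opt-in: the `equivalent` flag from the feature file decides whether the detokenized output must match the original text. An illustrative sketch of the same round trip outside behave, assuming a running llama.cpp server at `base_url` with the /tokenize and /detokenize endpoints these tests use:

# Illustrative sketch, not the test suite's code: tokenize then detokenize
# against a llama.cpp server and optionally assert the round trip is lossless.
import asyncio
import aiohttp

async def detokenize_round_trip(base_url: str, text: str, equivalent: bool) -> None:
    async with aiohttp.ClientSession() as session:
        # tokenize the input text
        async with session.post(f'{base_url}/tokenize', json={'content': text}) as response:
            assert response.status == 200
            tokens = (await response.json())['tokens']
        assert len(tokens) > 0
        # detokenize the tokens back into text
        async with session.post(f'{base_url}/detokenize', json={'tokens': tokens}) as response:
            assert response.status == 200
            content = (await response.json())['content']
    if equivalent:
        # SPM tokenizers add a whitespace prefix, hence the strip()
        assert text == content.strip()

# Example: asyncio.run(detokenize_round_trip('http://localhost:8080',
#                                            'What is the capital of France ?', True))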
@@ -492,7 +492,7 @@ std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts) {
 std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset) {
     std::vector<uint32_t> result;
     for (auto i = starting_offset; i < cpts.size(); i++) {
-        auto it = unicode_map_nfd.equal_range(cpts[i]);
+        const auto& it = unicode_map_nfd.equal_range(cpts[i]);
         if (it.first != it.second) {
             uint offset = 0;
             for (auto jt = it.first; jt != it.second; jt++) {
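On the C++ hunk: `unicode_map_nfd` is a multimap from a codepoint to the codepoints of its canonical (NFD) decomposition, and `equal_range` returns the iterator pair covering all entries for one key. A rough Python analogue of the loop, with a one-entry toy table standing in for the real NFD data (the else branch is an assumption, since the hunk is truncated):

# Toy stand-in for unicode_map_nfd: the multimap becomes {codepoint: [decomposed codepoints]}.
# Real NFD data is much larger; 0x00E9 ('é') -> 'e' + U+0301 is a genuine example.
UNICODE_MAP_NFD = {0x00E9: [0x0065, 0x0301]}

def canonical_decomposition_cpts(cpts, starting_offset=0):
    result = []
    for cpt in cpts[starting_offset:]:
        decomposition = UNICODE_MAP_NFD.get(cpt)
        if decomposition:                  # mirrors `it.first != it.second`
            result.extend(decomposition)   # mirrors the inner jt loop
        else:
            result.append(cpt)             # assumed: keep codepoints with no decomposition
    return result

assert canonical_decomposition_cpts([ord('c'), 0x00E9]) == [0x63, 0x65, 0x0301]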