Detokenizer fixes (#8039)
* Add llama_detokenize(): - Update header files location - UNKNOWN and CONTROL are 'special pieces' - Remove space after UNKNOWN and CONTROL - Refactor llama_token_to_piece() - Add flag: clean_up_tokenization_spaces - Symmetric params for llama_tokenize() and llama_detokenize() * Update and fix tokenizer tests: - Using llama_detokenize() - Unexpected vocab type as test fail instead of error - Useful when automating tests: - If you don't know in advance the vocab type - Differenciate other loading errors - Skip unicode surrogaes and undefined - Gracefully exit threads - Using exit() is throwing random exceptions - Clean old known problematic codepoints - Minor: confusing hexadecimal codepoint * Update bruteforce random tests - Add detokenizer checks - New generator: ascii_lr_strip - New generator: apostrophe - Add more vocabs files - Detokenize special tokens. - Replace errors with '\uFFFD' when detokenizing to 'utf-8' - More edge cases - Better detokenization results check * Fix add_space_prefix, set false by default * Better leading space removal * Do not remove space when decoding special tokens * Bugfix: custom regexs splits undefined unicode codepoints * 'viking' detokenizer clean spaces
This commit is contained in:
parent
be20e7f49d
commit
213701b51a
11 changed files with 499 additions and 265 deletions
|
@ -322,7 +322,7 @@ actor LlamaContext {
|
|||
defer {
|
||||
result.deallocate()
|
||||
}
|
||||
let nTokens = llama_token_to_piece(model, token, result, 8, false)
|
||||
let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
|
||||
|
||||
if nTokens < 0 {
|
||||
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
|
||||
|
@ -330,7 +330,7 @@ actor LlamaContext {
|
|||
defer {
|
||||
newResult.deallocate()
|
||||
}
|
||||
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
|
||||
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
|
||||
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
|
||||
return Array(bufferPointer)
|
||||
} else {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue