fix: fix alignment
This commit is contained in:
parent
0771b175aa
commit
22a0113299
5 changed files with 10 additions and 89 deletions
18
llama.cpp
18
llama.cpp
|
@ -8241,9 +8241,6 @@ struct llm_build_context {
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
|
||||||
|
|
||||||
// positions of the tokens in the KV cache
|
|
||||||
struct ggml_tensor * KQ_pos = build_inp_KQ_pos(false);
|
|
||||||
|
|
||||||
// iterate layers
|
// iterate layers
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * cur = inpL;
|
struct ggml_tensor * cur = inpL;
|
||||||
|
@ -8386,7 +8383,6 @@ struct llm_build_context {
|
||||||
// output layer norm
|
// output layer norm
|
||||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
|
||||||
|
|
||||||
|
|
||||||
// input for next layer
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
}
|
}
|
||||||
|
@ -11506,7 +11502,7 @@ static int llama_decode_internal(
|
||||||
}
|
}
|
||||||
|
|
||||||
// non-causal masks do not use the KV cache
|
// non-causal masks do not use the KV cache
|
||||||
if (hparams.causal_attn || model.arch == LLM_ARCH_JINA_BERT_V2) {
|
if (hparams.causal_attn) {
|
||||||
llama_kv_cache_update(&lctx);
|
llama_kv_cache_update(&lctx);
|
||||||
|
|
||||||
// if we have enough unused cells before the current head ->
|
// if we have enough unused cells before the current head ->
|
||||||
|
@ -12350,10 +12346,14 @@ struct llm_tokenizer_bpe {
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
||||||
//TODO: Apply GPT2 + lowercasing
|
//TODO: Apply GPT2 + lowercasing
|
||||||
word_collection = unicode_regex_split(text, {
|
{
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
std::string lowercase_text = text;
|
||||||
});
|
std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
|
||||||
//TODO: Apply lowercasing
|
word_collection = unicode_regex_split(lowercase_text, {
|
||||||
|
"",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
// default regex for BPE tokenization pre-processing
|
// default regex for BPE tokenization pre-processing
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
|
|
1
llama.h
1
llama.h
|
@ -71,7 +71,6 @@ extern "C" {
|
||||||
|
|
||||||
// pre-tokenization types
|
// pre-tokenization types
|
||||||
enum llama_vocab_pre_type {
|
enum llama_vocab_pre_type {
|
||||||
|
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||||
|
|
|
@ -4,7 +4,6 @@
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
||||||
|
@ -15,6 +14,4 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuati
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
||||||
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
||||||
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
||||||
extern const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map;
|
|
||||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class;
|
|
72
unicode.cpp
72
unicode.cpp
|
@ -14,8 +14,6 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
#include <codecvt>
|
#include <codecvt>
|
||||||
#include <unicode/unistr.h>
|
|
||||||
#include <unicode/unorm2.h>
|
|
||||||
|
|
||||||
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||||
std::string result;
|
std::string result;
|
||||||
|
@ -590,68 +588,6 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
|
||||||
throw std::invalid_argument("invalid codepoint");
|
throw std::invalid_argument("invalid codepoint");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function to recursively decompose a string
|
|
||||||
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts) {
|
|
||||||
std::vector<uint32_t> result;
|
|
||||||
for (const auto& cpt : cpts) {
|
|
||||||
auto it = unicode_decompose_map.find(cpt);
|
|
||||||
if (it != unicode_decompose_map.end()) {
|
|
||||||
for (const auto& decomp: it->second) {
|
|
||||||
const auto & inner_result = decompose_cpts({decomp});
|
|
||||||
result.insert(result.end(), inner_result.begin(), inner_result.end());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
result.push_back(cpt);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Function to sort subsequences based on canonical class
|
|
||||||
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts) {
|
|
||||||
std::vector<uint32_t> subsequence;
|
|
||||||
std::vector<uint32_t> result;
|
|
||||||
auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
|
|
||||||
auto cc_a_it = unicode_canonical_class.find(a);
|
|
||||||
if (cc_a_it != unicode_canonical_class.end()) {
|
|
||||||
auto cc_b_it = unicode_canonical_class.find(b);
|
|
||||||
if (cc_b_it != unicode_canonical_class.end()) {
|
|
||||||
return cc_a_it->second < cc_b_it->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
};
|
|
||||||
|
|
||||||
for (const auto& cpt : cpts) {
|
|
||||||
auto it = unicode_canonical_class.find(cpt);
|
|
||||||
if (it != unicode_canonical_class.end()) {
|
|
||||||
if (it->second > 0) {
|
|
||||||
subsequence.push_back(cpt);
|
|
||||||
} else {
|
|
||||||
if (!subsequence.empty()) {
|
|
||||||
sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
|
|
||||||
for (const auto& codepoint : subsequence) {
|
|
||||||
result.push_back(codepoint);
|
|
||||||
}
|
|
||||||
subsequence.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
result.push_back(cpt);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!subsequence.empty()) {
|
|
||||||
sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
|
|
||||||
for (const auto& codepoint : subsequence) {
|
|
||||||
result.push_back(codepoint);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
||||||
std::vector<uint32_t> result;
|
std::vector<uint32_t> result;
|
||||||
result.reserve(cpts.size());
|
result.reserve(cpts.size());
|
||||||
|
@ -666,14 +602,6 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts) {
|
|
||||||
const auto &decomposed_cpts = decompose_cpts(cpts);
|
|
||||||
const auto &sorted_sequence = sort_by_canonical_class(decomposed_cpts);
|
|
||||||
//TODO: Do canonical composition
|
|
||||||
return sorted_sequence;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||||
std::vector<uint32_t> result;
|
std::vector<uint32_t> result;
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
|
|
|
@ -17,9 +17,6 @@ std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||||
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts);
|
|
||||||
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts);
|
|
||||||
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts);
|
|
||||||
|
|
||||||
int unicode_cpt_type(uint32_t cp);
|
int unicode_cpt_type(uint32_t cp);
|
||||||
int unicode_cpt_type(const std::string & utf8);
|
int unicode_cpt_type(const std::string & utf8);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue