llama : add Command R Plus support (#6491)

* Add Command R Plus GGUF

* Add Command R Plus GGUF

* Loading works up to LayerNorm2D

* Export new tensors in 1D so they are not quantized.

* Fix embedding layer based on Noeda's example

* Whitespace

* Add line

* Fix unexpected tokens on MPS. Re-add F16 fix. ((Noeda)

* dranger003: Fix block index overflow in CUDA dequantizing.

* Reverted blocked multiplication code as it still has issues and could affect other Llama arches

* export norms as f32

* fix overflow issues during quant and other cleanup

* Type convention

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* dranger003: Fix more int overflow during quant.

---------

Co-authored-by: S <seast@Ss-Mac-Studio.local>
Co-authored-by: S <s@example.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Carolinabanana 2024-04-09 09:16:13 +01:00 committed by GitHub
parent e11a8999b5
commit 5dc9dd7152
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 358 additions and 318 deletions

View file

@ -926,6 +926,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
{
@ -5406,6 +5408,11 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
if (n_layer >= 64){
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
}
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@ -9454,6 +9461,31 @@ struct llm_build_context {
cb(Vcur, "Vcur", il);
}
if (model.layers[il].attn_q_norm) {
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
ggml_element_size(Qcur) * n_embd_head,
ggml_element_size(Qcur) * n_embd_head * n_head,
0);
cb(Qcur, "Qcur", il);
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
ggml_element_size(Kcur) * n_embd_head,
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
0);
cb(Kcur, "Kcur", il);
Qcur = llm_build_norm(ctx0, Qcur, hparams,
model.layers[il].attn_q_norm,
NULL,
LLM_NORM, cb, il);
cb(Qcur, "Qcur", il);
Kcur = llm_build_norm(ctx0, Kcur, hparams,
model.layers[il].attn_k_norm,
NULL,
LLM_NORM, cb, il);
cb(Kcur, "Kcur", il);
}
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@ -13323,9 +13355,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
return new_type;
}
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
std::mutex mutex;
int counter = 0;
int64_t counter = 0;
size_t new_size = 0;
if (nthread < 2) {
// single-thread
@ -13333,11 +13365,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
}
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
const int nrows_per_chunk = chunk_size / n_per_row;
const int64_t nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
int64_t first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
new_size += local_size;
@ -13345,7 +13377,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
}
};
@ -13539,6 +13571,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
// do not quantize norm tensors
quantize &= name.find("_norm.weight") == std::string::npos;
quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= !params->only_copy;
@ -13585,7 +13621,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const size_t nelements = ggml_nelements(tensor);
const int64_t nelements = ggml_nelements(tensor);
const float * imatrix = nullptr;
if (imatrix_data) {
@ -13637,20 +13673,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
if (work.size() < nelements * 4) {
if (work.size() < (size_t)nelements * 4) {
work.resize(nelements * 4); // upper bound on size
}
new_data = work.data();
const int n_per_row = tensor->ne[0];
const int nrows = tensor->ne[1];
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
static const int min_chunk_size = 32 * 512;
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
// quantize each expert separately since they have different importance matrices
new_size = 0;