diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index fdf7fcece..88bfcecec 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2811,80 +2811,6 @@ class DeepseekV2Model(Model):
         if len(experts) > 0:
             raise ValueError(f"Unprocessed experts: {experts}")
 
-@Model.register("JAISLMHeadModel")
-class JaisModel(Model):
-    model_arch = gguf.MODEL_ARCH.JAIS
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # SwigLU activation
-        assert self.hparams["activation_function"] == "swiglu"
-        # ALiBi position embedding
-        assert self.hparams["position_embedding_type"] == "alibi"
-
-        # Embeddings scale
-        self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
-        if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True # Hack (?)
-            self.embeddings_scale = self.hparams['mup_embeddings_scale']
-        elif 'embeddings_scale' in self.hparams:
-            self.embeddings_scale = self.hparams['embeddings_scale']
-        else:
-            assert False
-
-        self.width_scale = 1.0
-        if 'mup_output_alpha' in self.hparams:
-            assert 'mup_width_scale' in self.hparams
-            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
-        elif 'width_scale' in self.hparams:
-            self.width_scale = self.hparams['width_scale']
-        else:
-            assert False
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid # unused
-
-        tensors: list[tuple[str, Tensor]] = []
-
-        # we don't need these
-        if name.endswith((".attn.bias", "relative_pe.slopes")):
-            return tensors
-
-        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
-            data_torch = data_torch.transpose(1, 0)
-
-        new_name = self.map_tensor_name(name)
-
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
-        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
-            tensors.append((new_name, data_torch * self.width_scale))
-        else:
-            tensors.append((new_name, data_torch))
-
-        return tensors
-
-
-
 @Model.register("T5ForConditionalGeneration")
 @Model.register("T5WithLMHeadModel")
 class T5Model(Model):
@@ -3002,6 +2928,78 @@ class T5Model(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("JAISLMHeadModel")
+class JaisModel(Model):
+    model_arch = gguf.MODEL_ARCH.JAIS
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # ALiBi position embedding
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+        # Embeddings scale
+        self.embeddings_scale = 1.0
+        # note: For some JAIS flavors, output is tied to (same as) wte in original model
+        self.output_is_wte = False
+        if 'mup_embeddings_scale' in self.hparams:
+            self.output_is_wte = True # Hack (?)
+            self.embeddings_scale = self.hparams['mup_embeddings_scale']
+        elif 'embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['embeddings_scale']
+        else:
+            assert False
+
+        self.width_scale = 1.0
+        if 'mup_output_alpha' in self.hparams:
+            assert 'mup_width_scale' in self.hparams
+            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
+        elif 'width_scale' in self.hparams:
+            self.width_scale = self.hparams['width_scale']
+        else:
+            assert False
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias", "relative_pe.slopes")):
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((new_name, data_torch * self.embeddings_scale))
+            if self.output_is_wte:
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
+        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            assert not self.output_is_wte
+            tensors.append((new_name, data_torch * self.width_scale))
+        else:
+            tensors.append((new_name, data_torch))
+
+        return tensors
+
 
 ###### CONVERSION LOGIC ######
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7ab9d0369..cfaf6a6e8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -733,6 +733,7 @@ int main(int argc, char ** argv) {
 
                 // Console/Stream Output
                 fprintf(stdout, "%s", token_str.c_str());
+
                 // Record Displayed Tokens To Log
                 // Note: Generated tokens are created one by one hence this check
                 if (embd.size() > 1) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index dc6488103..f5502afbe 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -13516,13 +13516,13 @@ static void ggml_compute_forward_soft_max_f32(
             } else {
                 for (int i = 0; i < nc; ++i) {
                     wp[i] += slope*mp_f32[i];
-                }
                 }
             }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
             assert(!isnan(wp[i]));
         }
 #endif
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 5d0b7a29f..00f7f708e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6977,6 +6977,8 @@ static bool llm_load_tensors(
                 } break;
             case LLM_ARCH_JAIS:
                 {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
                     // Output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
@@ -7009,7 +7011,6 @@ static bool llm_load_tensors(
 
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
                     }
                 } break;
             default:
@@ -12384,23 +12385,13 @@ struct llm_build_context {
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
-        //struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
-        // // inp_pos - contains the positions
-        // struct ggml_tensor * inp_pos = build_inp_pos();
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
-        // cb(pos, "pos_embd", -1);
-
-        // inpL = ggml_add(ctx0, inpL, pos);
-        // cb(inpL, "inpL", -1);
-
        for (int il = 0; il < n_layer; ++il) {
            cur = llm_build_norm(ctx0, inpL, hparams,
                    model.layers[il].attn_norm,
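
Note on the converter change: the subtle part of JaisModel is the scale selection in __init__ and its application in modify_tensors. As a reading aid, here is a minimal standalone sketch of that selection, assuming only that hparams is the parsed HF config.json dict; pick_scales and the numeric values in the usage check are illustrative and not part of convert-hf-to-gguf.py.

    # Illustrative sketch (not part of convert-hf-to-gguf.py): mirrors how
    # JaisModel.__init__ derives embeddings_scale, width_scale, output_is_wte.
    def pick_scales(hparams: dict) -> tuple[float, float, bool]:
        output_is_wte = False
        if 'mup_embeddings_scale' in hparams:
            # muP checkpoints: the output head is tied to wte, so the converter
            # later emits a width-scaled copy of the embedding as OUTPUT
            output_is_wte = True
            embeddings_scale = hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in hparams:
            embeddings_scale = hparams['embeddings_scale']
        else:
            raise ValueError("expected mup_embeddings_scale or embeddings_scale")

        if 'mup_output_alpha' in hparams:
            # width scale is the product of the two muP output parameters
            width_scale = hparams['mup_output_alpha'] * hparams['mup_width_scale']
        elif 'width_scale' in hparams:
            width_scale = hparams['width_scale']
        else:
            raise ValueError("expected mup_output_alpha or width_scale")

        return embeddings_scale, width_scale, output_is_wte

    # Usage with made-up muP-style values (hypothetical numbers):
    emb_scale, w_scale, tied = pick_scales({
        'mup_embeddings_scale': 14.6,
        'mup_output_alpha': 2.22,
        'mup_width_scale': 0.03125,
    })
    assert tied and emb_scale == 14.6 and abs(w_scale - 2.22 * 0.03125) < 1e-12

modify_tensors then multiplies the token-embedding tensor by embeddings_scale and, for the tied-output (muP) flavors, emits a width-scaled copy of it as the OUTPUT tensor, which is why llm_load_tensors above can always create model.tok_embd for LLM_ARCH_JAIS.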