diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py
index fcdf86b3f..331e84e98 100755
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -103,9 +103,8 @@ print("gguf: get model metadata")
 block_count = hparams["n_layer"]

 gguf_writer.add_name("StarCoder")
-gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_context_length(hparams["n_positions"])
 gguf_writer.add_embedding_length(hparams["n_embd"])
-gguf_writer.add_max_position_embeddings(hparams["n_positions"])
 gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 0a9200bf4..e0e0dbcbb 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -36,13 +36,12 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
 KEY_GENERAL_FILE_TYPE = "general.file_type"

 # LLM
-KEY_CONTEXT_LENGTH           = "{arch}.context_length"
-KEY_EMBEDDING_LENGTH         = "{arch}.embedding_length"
-KEY_BLOCK_COUNT              = "{arch}.block_count"
-KEY_FEED_FORWARD_LENGTH      = "{arch}.feed_forward_length"
-KEY_USE_PARALLEL_RESIDUAL    = "{arch}.use_parallel_residual"
-KEY_TENSOR_DATA_LAYOUT       = "{arch}.tensor_data_layout"
-KEY_MAX_POSITION_EMBEDDINGS  = "{arch}.max_position_embeddings"
+KEY_CONTEXT_LENGTH        = "{arch}.context_length"
+KEY_EMBEDDING_LENGTH      = "{arch}.embedding_length"
+KEY_BLOCK_COUNT           = "{arch}.block_count"
+KEY_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"

 # attention
 KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
@@ -718,10 +717,6 @@ class GGUFWriter:
         self.add_uint32(
             KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

-    def add_max_position_embeddings(self, length: int):
-        self.add_uint32(
-            KEY_MAX_POSITION_EMBEDDINGS.format(arch=self.arch), length)
-
     def add_block_count(self, length: int):
         self.add_uint32(
             KEY_BLOCK_COUNT.format(arch=self.arch), length)
diff --git a/llama.cpp b/llama.cpp
index a4ced9e08..d75ceee78 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -193,7 +193,6 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
-    LLM_KV_MAX_POSITION_EMBEDDINGS,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -238,7 +237,6 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH,       "%s.feed_forward_length"       },
     { LLM_KV_USE_PARALLEL_RESIDUAL,     "%s.use_parallel_residual"     },
     { LLM_KV_TENSOR_DATA_LAYOUT,        "%s.tensor_data_layout"        },
-    { LLM_KV_MAX_POSITION_EMBEDDINGS,   "%s.max_position_embeddings"   },

     { LLM_KV_ATTENTION_HEAD_COUNT,      "%s.attention.head_count"      },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,   "%s.attention.head_count_kv"   },
@@ -940,7 +938,6 @@ struct llama_hparams {
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
     uint32_t n_ff    = 11008;
-    uint32_t n_positions = 0; // StarCoder

     float f_norm_eps     = 1e-5;
     float f_norm_rms_eps = 1e-5;
@@ -1668,7 +1665,6 @@ static void llm_load_hparams(
     GGUF_GET_KEY(ctx, hparams.n_ff,    gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
     GGUF_GET_KEY(ctx, hparams.n_head,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
     GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
-    GGUF_GET_KEY(ctx, hparams.n_positions, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_MAX_POSITION_EMBEDDINGS));

     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
@@ -2215,7 +2211,7 @@ static void llm_load_tensors(
         case LLM_ARCH_STARCODER:
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_positions}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);

                 // output
                 {
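
Not part of the patch itself, only an illustration of the converter flow after the change: the trained context window now comes straight from the checkpoint's `config.json` (`n_positions` in StarCoder's GPT-2-style config) and is written to the standard `{arch}.context_length` key, so no separate `max_position_embeddings` key is needed. A minimal sketch assuming the bundled `gguf-py` package; the file paths and the `GGUFWriter(path, arch)` construction here are placeholders for what the real script derives from its arguments:

```python
import json

import gguf  # gguf-py package shipped in the llama.cpp repo

# Placeholder paths -- the real script takes these from the command line.
config_path = "starcoder/config.json"
fname_out   = "starcoder-f16.gguf"

with open(config_path) as f:
    hparams = json.load(f)

# Assumed construction; the convert scripts build their writer the same way.
gguf_writer = gguf.GGUFWriter(fname_out, "starcoder")

block_count = hparams["n_layer"]

gguf_writer.add_name("StarCoder")
# Trained context window read from config.json; this lands in the standard
# "{arch}.context_length" key, which llama.cpp loads as hparams.n_ctx_train.
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
```

On the C++ side the same value sizes `model.pos_embeddings` via `hparams.n_ctx_train`, which is why the dedicated `n_positions` field and its GGUF key can be dropped.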