llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)

* phi2 implementation

* fix breaking change

* phi-2 : various fixes

* phi-2 : use layer norm eps

* py : whitespaces

* llama : fix meta KV override bug

* convert : don't add BOS token for phi

* convert : revert "added_tokens_decoder" change

* phi-2 : scale Q instead of KQ for better precision

* ggml : fix NeoX rope to rotate just first n_dims

* cuda : less diff in the rope_neox kernel

* ggml : add ggml_mul_mat_set_prec

ggml-ci

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: slaren <slarengh@gmail.com>

* cuda : ggml_cuda_op_mul_mat_cublas support F32 precision

* cuda : remove obsolete comment

---------

Co-authored-by: Ebey Abraham <ebeyabraham@microsoft.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
Ebey Abraham 2023-12-18 17:27:47 +00:00 committed by GitHub
parent 3c04bf6da8
commit b9e74f9bca
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 463 additions and 76 deletions

View file

@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum):
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
PHI2 = auto()
class MODEL_TENSOR(IntEnum):
@ -140,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.PHI2: "phi2",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -350,6 +352,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.GPT2: [
# TODO
],
MODEL_ARCH.PHI2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
]
# TODO
}

View file

@ -17,6 +17,7 @@ class TensorNameMap:
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert
"language_model.embedding.word_embeddings", # persimmon
"transformer.embd.wte", # phi2
),
# Token type embeddings
@ -41,6 +42,7 @@ class TensorNameMap:
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
"output", # llama-pth bloom
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
),
# Output norm
@ -53,6 +55,7 @@ class TensorNameMap:
"transformer.norm_f", # mpt
"ln_f", # refact bloom qwen
"language_model.encoder.final_layernorm", # persimmon
"lm_head.ln", # phi2
),
# Rope frequencies
@ -75,6 +78,7 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi
"transformer.h.{bid}.ln", # phi2
),
# Attention norm 2
@ -90,6 +94,7 @@ class TensorNameMap:
"transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"transformer.h.{bid}.mixer.Wqkv", # phi2
),
# Attention query
@ -128,6 +133,7 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"transformer.h.{bid}.mixer.out_proj", # phi2
),
# Rotary embeddings
@ -167,6 +173,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.fc_in", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"transformer.h.{bid}.mlp.w1", # qwen
"transformer.h.{bid}.mlp.fc1", # phi2
),
MODEL_TENSOR.FFN_UP_EXP: (
@ -198,6 +205,7 @@ class TensorNameMap:
"encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"transformer.h.{bid}.mlp.fc2", # phi2
),
MODEL_TENSOR.FFN_DOWN_EXP: (