Merge branch 'ggerganov:master' into master

WangHaoranRobin 2023-06-22 22:14:41 -07:00 committed by GitHub
commit 6c76c31184
3 changed files with 79 additions and 39 deletions


@@ -250,6 +250,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 else()
     message(WARNING "cuBLAS not found")
 endif()
@@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS)
     endif()
 endif()
-if (GGML_SOURCES_CUDA)
-    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    if (BUILD_SHARED_LIBS)
-        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
-        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    endif()
-    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
-endif()
 #
 # programs, examples and tests


@@ -340,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
 - #### CLBlast


@@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list()
 TENSORS_SET = set(TENSORS_LIST)
 
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
 @dataclass
 class Params:
     n_vocab: int
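
As a concrete check of what the new find_n_mult helper computes, a minimal standalone sketch follows; the function body mirrors the hunk above, and the 7B-sized dimensions (n_embd = 4096, n_ff = 11008) are illustrative values rather than anything taken from this commit.

    def find_n_mult(n_ff: int, n_embd: int) -> int:
        # Walk the hardcoded magic range from the diff: the first multiple that
        # rounds 8*n_embd/3 up to exactly n_ff is the value we want.
        for n_mult in range(256, 1, -1):
            calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
            if calc_ff == n_ff:
                return n_mult
        return 1

    # 8*4096 // 3 = 10922; rounded up to the next multiple of 256 this is 11008,
    # so the helper recovers n_mult = 256 for these dimensions.
    assert find_n_mult(11008, 4096) == 256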
@@ -137,21 +145,61 @@ class Params:
     n_mult: int
     n_head: int
     n_layer: int
-    file_type: GGMLFileType
 
     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head=n_embd // 128 # guessed
 
         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )
 
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+        n_vocab = config["vocab_size"];
+        n_embd = config["hidden_size"];
+        n_head = config["num_attention_heads"];
+        n_layer = config["num_hidden_layers"];
+        n_ff = config["intermediate_size"];
+
+        n_mult = find_n_mult(n_ff, n_embd);
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+
 
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
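
To illustrate the new loadHFTransformerJson path, here is a minimal sketch of the fields it reads from a Hugging Face style config.json and the params line it would print; the numbers are typical 7B-sized values chosen for illustration, not data from this commit.

    # Keys read by loadHFTransformerJson, filled with illustrative values.
    config = {
        "vocab_size": 32000,
        "hidden_size": 4096,
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "intermediate_size": 11008,
    }

    n_embd = config["hidden_size"]
    n_ff = config["intermediate_size"]

    # Same search as find_n_mult in the earlier hunk.
    n_mult = next(m for m in range(256, 1, -1)
                  if (((8 * n_embd) // 3 + m - 1) // m) * m == n_ff)

    print(f"params: n_vocab:{config['vocab_size']} n_embd:{n_embd} "
          f"n_mult:{n_mult} n_head:{config['num_attention_heads']} "
          f"n_layer:{config['num_hidden_layers']}")
    # -> params: n_vocab:32000 n_embd:4096 n_mult:256 n_head:32 n_layer:32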
@@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
 
-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]
 
-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
@@ -920,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")
 
-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1]) # magic
         values = [
             1, # file version
@@ -930,7 +977,7 @@ class OutputFile:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head, # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))
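
For context on what the reworked header writer emits, a small sketch of the ggjt header as write_file_header assembles it, with the file type now passed in explicitly. The hyperparameter values are illustrative, the file-type value of 1 is assumed to correspond to MostlyF16, and the fields elided between the two hunks are filled in with the usual ggjt ordering, which is an assumption on my part.

    import io
    import struct

    # Illustrative hyperparameters and an assumed GGMLFileType value.
    n_vocab, n_embd, n_mult, n_head, n_layer = 32000, 4096, 256, 32, 32
    file_type_value = 1

    fout = io.BytesIO()
    fout.write(b"ggjt"[::-1])   # magic, exactly as in the diff
    values = [
        1,                  # file version
        n_vocab,            # the fields between the two hunks are not shown in the
        n_embd,             # diff; this ordering is the assumed ggjt layout
        n_mult,
        n_head,
        n_layer,
        n_embd // n_head,   # rot (obsolete)
        file_type_value,    # now supplied as the separate file_type argument
    ]
    fout.write(struct.pack("i" * len(values), *values))
    assert len(fout.getvalue()) == 4 + 4 * len(values)   # magic plus 8 int32 fields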
@@ -958,10 +1005,10 @@ class OutputFile:
         of.fout.close()
 
     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)
     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)
     return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
         sys.stderr.write(
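
To show how the default output name now follows the chosen output type rather than params.file_type, a minimal sketch with a stand-in enum; the member values and the example path are assumptions for illustration only.

    from enum import Enum
    from pathlib import Path

    class GGMLFileType(Enum):      # stand-in with assumed member values
        AllF32 = 0
        MostlyF16 = 1

    def default_outfile(model_paths, file_type):
        namestr = {GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16"}[file_type]
        return model_paths[0].parent / f"ggml-model-{namestr}.bin"

    # Hypothetical input path, for illustration.
    print(default_outfile([Path("models/7B/consolidated.00.pth")], GGMLFileType.MostlyF16))
    # -> models/7B/ggml-model-f16.bin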
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir)
+    params = Params.load(model_plus)
     model = model_plus.model
-    model = do_necessary_conversions(model)
+    model = do_necessary_conversions(model, params)
     output_type = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, output_type)
-    params = Params.guessed(model, output_type)
-    outfile = args.outfile or default_outfile(model_plus.paths, params)
-    OutputFile.write_all(outfile, params, model, vocab)
+    outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+    OutputFile.write_all(outfile, params, output_type, model, vocab)
     print(f"Wrote {outfile}")