From aeefac4ff760acea5afe66fbfe8d7eca1937b79c Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:49:24 -0600 Subject: [PATCH 01/18] scripts: Use local gguf package when running from repo (#2927) * scripts: Use local gguf when running from repo --- convert-falcon-hf-to-gguf.py | 5 ++++- convert-gptneox-hf-to-gguf.py | 5 ++++- convert-llama-ggmlv3-to-gguf.py | 6 +++++- convert.py | 6 +++++- .../convert-train-checkpoint-to-gguf.py | 5 ++++- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index ec786ff67..271e58972 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import Any -import gguf import numpy as np import torch from transformers import AutoTokenizer # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + def bytes_to_unicode(): # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index 852123d99..b9c8b4607 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import Any -import gguf import numpy as np import torch from transformers import AutoTokenizer # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index 3f39bc39e..08ba0c490 100755 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -7,9 +7,13 @@ import struct import sys from pathlib import Path -import gguf import numpy as np +import os +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) diff --git a/convert.py b/convert.py index 9a39edb99..5cc3f6e66 100755 --- a/convert.py +++ b/convert.py @@ -25,10 +25,14 @@ from dataclasses import dataclass from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar -import gguf import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] +import os +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + if TYPE_CHECKING: from typing import TypeAlias diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 01b3ee92a..a527d6153 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -2,13 +2,16 @@ # train-text-from-scratch checkpoint --> gguf conversion import argparse -import gguf import os import struct import sys import numpy as np from pathlib import Path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf')) +import gguf + # gguf constants LLM_KV_OPTIMIZER_TYPE = "optimizer.type" LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" From 528134dd0267838d9c0250cf1d9621631dff09b2 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 1 Sep 2023 01:32:09 +0200 Subject: [PATCH 02/18] remove convert-llama-7b-pth-to-gguf.py and convert-llama-hf-to-gguf.py (#2906) --- convert-llama-7b-pth-to-gguf.py | 261 ----------------------------- convert-llama-hf-to-gguf.py | 280 -------------------------------- 2 files changed, 541 deletions(-) delete mode 100755 convert-llama-7b-pth-to-gguf.py delete mode 100755 convert-llama-hf-to-gguf.py diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py deleted file mode 100755 index 6574c11dd..000000000 --- a/convert-llama-7b-pth-to-gguf.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python3 -# 7b pth llama --> gguf conversion -# Only models with a single datafile are supported, like 7B -# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import gguf -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("consolidated."): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "LlamaForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -if num_parts > 1: - print("gguf: Only models with a single datafile are supported.") - - sys.exit() - -ARCH=gguf.MODEL_ARCH.LLAMA -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab and scores") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - -for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts)) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name == "rope.freqs": - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py deleted file mode 100755 index c453c83c3..000000000 --- a/convert-llama-hf-to-gguf.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python3 -# HF llama --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import gguf -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "LlamaForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.LLAMA -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - -for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # reverse permute these - if name.endswith(".q_proj.weight"): - data = reverse_hf_permute(data, head_count) - if name.endswith(".k_proj.weight"): - data = reverse_hf_permute(data, head_count, head_count_kv) - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") From bce1fef328941499dc0acb76cc7fd7ac90449c2f Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 31 Aug 2023 22:13:51 -0400 Subject: [PATCH 03/18] convert : fix another python 3.8 issue (#2949) --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 5cc3f6e66..6c89b5ecc 100755 --- a/convert.py +++ b/convert.py @@ -530,7 +530,7 @@ class LazyTensor: raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') -LazyModel = dict[str, LazyTensor] +LazyModel: TypeAlias = 'dict[str, LazyTensor]' @dataclass From e8d91589258f9204397a7ac5f9b3c857835c98f8 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 1 Sep 2023 11:15:57 +0300 Subject: [PATCH 04/18] metal: somewhat faster f16 x f32 matrix multiply kernel (#2951) * Somewhat faster f16 x f32 matrix multiply kernel * Better use 32 thread groups for f16 x f32 --------- Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 2 +- ggml-metal.metal | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index e929c4b07..8c3c64f53 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -840,7 +840,7 @@ void ggml_metal_graph_compute( switch (src0t) { case GGML_TYPE_F16: { - nth0 = 64; + nth0 = 32; nth1 = 1; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 82e1a0c7a..02db5323e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -528,24 +528,42 @@ kernel void kernel_mul_mat_f16_f32( device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); - sum[tpitg.x] = 0.0f; + uint ith = tpitg.x; + uint nth = tptg.x; - for (int i = tpitg.x; i < ne00; i += tptg.x) { - sum[tpitg.x] += (float) x[i] * (float) y[i]; + sum[ith] = 0.0f; + + for (int i = ith; i < ne00; i += nth) { + sum[ith] += (float) x[i] * (float) y[i]; } // accumulate the sum from all threads in the threadgroup threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = tptg.x/2; i > 0; i /= 2) { - if (tpitg.x < i) { - sum[tpitg.x] += sum[tpitg.x + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%4 == 0) { + for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; } - - if (tpitg.x == 0) { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%16 == 0) { + for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith == 0) { + for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; } + + // Original implementation. Left behind commented out for now + //threadgroup_barrier(mem_flags::mem_threadgroup); + //for (uint i = tptg.x/2; i > 0; i /= 2) { + // if (tpitg.x < i) { + // sum[tpitg.x] += sum[tpitg.x + i]; + // } + // threadgroup_barrier(mem_flags::mem_threadgroup); + //} + // + //if (tpitg.x == 0) { + // dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; + //} } kernel void kernel_alibi_f32( From 18705a30ef3d6a89e1d7c6cb8cfe8633f760cb53 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 1 Sep 2023 05:03:49 -0400 Subject: [PATCH 05/18] llama2c : fix segfault and alloc-dealloc-mismatch (#2913) * llama2c : fix segfault if vocab is not found * llama2c : fix mismatch between new[] and delete * llama2c : fix basename on Windows * llama2c : use a destructor to prevent memory leaks --- .../convert-llama2c-to-ggml.cpp | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index e9e070b1f..0b03c9d2b 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -75,7 +75,7 @@ typedef struct { int seq_len; // max sequence length } Config; -typedef struct { +struct TransformerWeights { // token embedding table float* token_embedding_table; // (vocab_size, dim) // weights for rmsnorms @@ -97,7 +97,22 @@ typedef struct { // float* freq_cis_imag; // (seq_len, dim/2) // (optional) classifier weights for the logits, on the last layer float* wcls; -} TransformerWeights; + + ~TransformerWeights() { + delete[] token_embedding_table; + delete[] rms_att_weight; + delete[] rms_ffn_weight; + delete[] wq; + delete[] wk; + delete[] wv; + delete[] wo; + delete[] w1; + delete[] w2; + delete[] w3; + delete[] rms_final_weight; + delete[] wcls; + } +}; void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { // we calloc instead of malloc to keep valgrind happy @@ -173,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar return 0; } -void free_weights(TransformerWeights* w) { - delete w->token_embedding_table; - delete w->rms_att_weight; - delete w->rms_ffn_weight; - delete w->wq; - delete w->wk; - delete w->wv; - delete w->wo; - delete w->w1; - delete w->w2; - delete w->w3; - delete w->rms_final_weight; - if (w->wcls) delete w->wcls; -} - void print_sample_weights(TransformerWeights *w){ printf("----- Quick print of first of the weight vales of all the variables\n"); printf("%f\n", w->token_embedding_table[0]); @@ -596,6 +596,10 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) // assume llama2.c vocabulary printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); + if (!file.fp) { + fprintf(stderr, "error: %s: %s\n", strerror(errno), filename); + exit(1); + } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); @@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { } std::string basename(const std::string &path) { - size_t pos = path.find_last_of("/"); + size_t pos = path.find_last_of("/\\"); if (pos == std::string::npos) { return path; } @@ -911,7 +915,7 @@ int main(int argc, char ** argv) { return 1; } Config config; - TransformerWeights weights; + TransformerWeights weights = {}; { FILE *file = fopen(params.fn_llama2c_model, "rb"); if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } @@ -953,6 +957,5 @@ int main(int argc, char ** argv) { printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); - free_weights(&weights); return 0; } From 4dcd47d71df8ca4edcc31302744bd93f0c31298e Mon Sep 17 00:00:00 2001 From: staviq Date: Fri, 1 Sep 2023 11:07:06 +0200 Subject: [PATCH 06/18] logs : fix mingw-like builds (fixes #2898) (#2911) * fix mingw-like builds * formatting * make LOG_COMPAT easier to override and extend * simplify win detection * fix for #2940 --- Makefile | 10 +++++----- common/log.h | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index b750540fe..b56df3d8a 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,11 @@ ifdef LLAMA_SERVER_VERBOSE CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif +ifdef LLAMA_DISABLE_LOGS + CFLAGS += -DLOG_DISABLE_LOGS + CXXFLAGS += -DLOG_DISABLE_LOGS +endif # LLAMA_DISABLE_LOGS + # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ -Wmissing-prototypes -Werror=implicit-int @@ -343,11 +348,6 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifdef LLAMA_DISABLE_LOGS - CFLAGS += -DLOG_DISABLE_LOGS - CXXFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS - # # Print build information # diff --git a/common/log.h b/common/log.h index c1364187d..bf9fafd68 100644 --- a/common/log.h +++ b/common/log.h @@ -154,7 +154,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base // #include "log.h" // #ifndef LOG_NO_TIMESTAMPS - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() #else @@ -167,7 +167,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #endif #ifdef LOG_TEE_TIMESTAMPS - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() #else @@ -187,7 +187,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base // #include "log.h" // #ifndef LOG_NO_FILE_LINE_FUNCTION - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_FLF_FMT "[%24s:%5d][%24s] " #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #else @@ -200,7 +200,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #endif #ifdef LOG_TEE_FILE_LINE_FUNCTION - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #else @@ -224,7 +224,7 @@ enum LogTriState // INTERNAL, DO NOT USE // USE LOG() INSTEAD // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG_IMPL(str, ...) \ { \ if (LOG_TARGET != nullptr) \ @@ -247,7 +247,7 @@ enum LogTriState // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG_TEE_IMPL(str, ...) \ { \ if (LOG_TARGET != nullptr) \ @@ -284,7 +284,7 @@ enum LogTriState // Main LOG macro. // behaves like printf, and supports arguments the exact same way. // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG(...) LOG_IMPL(__VA_ARGS__, "") #else #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "") @@ -298,14 +298,14 @@ enum LogTriState // Secondary target can be changed just like LOG_TARGET // by defining LOG_TEE_TARGET // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #else #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "") #endif // LOG macro variants with auto endline. -#ifndef _WIN32 +#ifndef _MSC_VER #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #else @@ -461,7 +461,7 @@ inline void log_test() LOG("13 Hello World this time in yet new file?\n") log_set_target(log_filename_generator("llama_autonamed", "log")); LOG("14 Hello World in log with generated filename!\n") -#ifdef _WIN32 +#ifdef _MSC_VER LOG_TEE("15 Hello msvc TEE without arguments\n") LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") LOG_TEELN("17 Hello msvc TEELN without arguments\n") From 13268c533177a4dc76bce0b465645d74f0d51d55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Sep 2023 13:42:41 +0300 Subject: [PATCH 07/18] metal : slight speed-up for add and mul kernels (#2917) --- ggml-metal.m | 20 ++++++++++++++++---- ggml-metal.metal | 32 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8c3c64f53..4267db9be 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -680,6 +680,12 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ADD: { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_add_row]; @@ -689,14 +695,20 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_MUL: { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_mul_row]; @@ -706,9 +718,9 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 02db5323e..8cdf0b9d2 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -25,9 +25,9 @@ typedef struct { } block_q8_0; kernel void kernel_add( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig]; } @@ -35,18 +35,18 @@ kernel void kernel_add( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_add_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig % ne00]; + dst[tpig] = src0[tpig] + src1[tpig % nb]; } kernel void kernel_mul( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig]; } @@ -54,12 +54,12 @@ kernel void kernel_mul( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_mul_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % ne00]; + dst[tpig] = src0[tpig] * src1[tpig % nb]; } kernel void kernel_scale( From 5aec2cfaac386eb09aebb75b805860828f00de91 Mon Sep 17 00:00:00 2001 From: Tameem <113388789+AhmadTameem@users.noreply.github.com> Date: Fri, 1 Sep 2023 18:27:40 +0500 Subject: [PATCH 08/18] ggml : add RISC-V vector intrinsics support (#2929) * added support for RISCV CFLAGS & native compile + cross compile options * Add RISC-V Vector Intrinsics Support Added RVV intrinsics for following ggml_vec_dot_q4_0_q8_0 ggml_vec_dot_q4_1_q8_1 ggml_vec_dot_q5_0_q8_0 ggml_vec_dot_q5_1_q8_1 ggml_vec_dot_q8_0_q8_0 Co-authored-by: Sharafat Signed-off-by: Ahmad Tameem --------- Signed-off-by: Ahmad Tameem Co-authored-by: moiz.hussain Co-authored-by: Sharafat --- Makefile | 13 ++++ ggml.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) diff --git a/Makefile b/Makefile index b56df3d8a..8f73297f4 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,11 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif +ifdef RISCV_CROSS_COMPILE +CC := riscv64-unknown-linux-gnu-gcc +CXX := riscv64-unknown-linux-gnu-g++ +endif + CCV := $(shell $(CC) --version | head -n 1) CXXV := $(shell $(CXX) --version | head -n 1) @@ -150,6 +155,9 @@ endif # Architecture specific # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue + +ifndef RISCV + ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: CFLAGS += -march=native -mtune=native @@ -198,6 +206,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif endif +else + CFLAGS += -march=rv64gcv -mabi=lp64d + CXXFLAGS += -march=rv64gcv -mabi=lp64d +endif + ifndef LLAMA_NO_K_QUANTS CFLAGS += -DGGML_USE_K_QUANTS CXXFLAGS += -DGGML_USE_K_QUANTS diff --git a/ggml.c b/ggml.c index 46ce4a581..cf3955f7f 100644 --- a/ggml.c +++ b/ggml.c @@ -301,6 +301,10 @@ typedef double ggml_float; #endif #endif +#ifdef __riscv_v_intrinsic +#include +#endif + #ifdef __F16C__ #ifdef _MSC_VER @@ -2677,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2803,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3037,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for masking and shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl); + vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl); + vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl); + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl); + vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3293,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl); + + // load qh + vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3404,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; #else // scalar float sumf = 0.0; From d8d6977f48f1fa402ade38ad32c5b5fb1358d059 Mon Sep 17 00:00:00 2001 From: Ben Siraphob Date: Fri, 1 Sep 2023 09:32:14 -0400 Subject: [PATCH 09/18] examples : add C grammar (#2357) --- grammars/c.gbnf | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 grammars/c.gbnf diff --git a/grammars/c.gbnf b/grammars/c.gbnf new file mode 100644 index 000000000..4a0331dd2 --- /dev/null +++ b/grammars/c.gbnf @@ -0,0 +1,42 @@ +root ::= (declaration)* + +declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}" + +dataType ::= "int" ws | "float" ws | "char" ws +identifier ::= [a-zA-Z_] [a-zA-Z_0-9]* + +parameter ::= dataType identifier + +statement ::= + ( dataType identifier ws "=" ws expression ";" ) | + ( identifier ws "=" ws expression ";" ) | + ( identifier ws "(" argList? ")" ";" ) | + ( "return" ws expression ";" ) | + ( "while" "(" condition ")" "{" statement* "}" ) | + ( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) | + ( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) | + ( singleLineComment ) | + ( multiLineComment ) + +forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression +forUpdate ::= identifier ws "=" ws expression + +condition ::= expression relationOperator expression +relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">") + +expression ::= term (("+" | "-") term)* +term ::= factor(("*" | "/") factor)* + +factor ::= identifier | number | unaryTerm | funcCall | parenExpression +unaryTerm ::= "-" factor +funcCall ::= identifier "(" argList? ")" +parenExpression ::= "(" ws expression ws ")" + +argList ::= expression ("," ws expression)* + +number ::= [0-9]+ + +singleLineComment ::= "//" [^\n]* "\n" +multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/" + +ws ::= ([ \t\n]+) From ef156499721c67748cde01a5436cb6f0648bb4b4 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 1 Sep 2023 09:34:50 -0400 Subject: [PATCH 10/18] build : fix most gcc and clang warnings (#2861) * fix most gcc and clang warnings * baby-llama : remove commented opt_params_adam * fix some MinGW warnings * fix more MinGW warnings --- CMakeLists.txt | 5 +++++ Makefile | 7 ++++++- common/common.cpp | 6 ++++-- common/console.cpp | 1 + examples/baby-llama/baby-llama.cpp | 5 ----- examples/beam-search/beam-search.cpp | 8 +++++--- examples/server/server.cpp | 8 +++++--- k_quants.c | 8 ++------ llama.cpp | 4 ++-- 9 files changed, 30 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6c1b3b33..1b7cce9f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -403,6 +403,7 @@ if (LLAMA_ALL_WARNINGS) -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int + -Wno-unused-function ) set(cxx_flags -Wall @@ -412,6 +413,10 @@ if (LLAMA_ALL_WARNINGS) -Wno-unused-function -Wno-multichar ) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # g++ only + set(cxx_flags ${cxx_flags} -Wno-format-truncation) + endif() else() # todo : msvc endif() diff --git a/Makefile b/Makefile index 8f73297f4..ef1eef6ac 100644 --- a/Makefile +++ b/Makefile @@ -91,9 +91,14 @@ endif # LLAMA_DISABLE_LOGS # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ - -Wmissing-prototypes -Werror=implicit-int + -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar +ifeq '' '$(findstring clang++,$(CXX))' + # g++ only + CXXFLAGS += -Wno-format-truncation +endif + # OS specific # TODO: support Windows ifeq ($(UNAME_S),Linux) diff --git a/common/common.cpp b/common/common.cpp index ed09fc27d..41fc59ced 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -24,7 +24,9 @@ #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN -#define NOMINMAX +#ifndef NOMINMAX +# define NOMINMAX +#endif #include #include #include @@ -1027,7 +1029,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); - fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; diff --git a/common/console.cpp b/common/console.cpp index 8efa2a674..23545e5be 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -235,6 +235,7 @@ namespace console { int estimateWidth(char32_t codepoint) { #if defined(_WIN32) + (void)codepoint; return 1; #else return wcwidth(codepoint); diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 6fa55b319..a99ece9a6 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_adam.adam.n_iter = 16; opt_params_lbfgs.lbfgs.n_iter = 16; - // ggml_opt(ctx0, opt_params_adam, e); ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 42c7c7254..4d021434b 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -22,7 +22,9 @@ #include #elif defined (_WIN32) #define WIN32_LEAN_AND_MEAN -#define NOMINMAX +#ifndef NOMINMAX +# define NOMINMAX +#endif #include #include #endif @@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat assert(0u < beams_state.n_beams); const llama_token * tokens = beams_state.beam_views[0].tokens; std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%lu", n); + printf("%zu", n); } fflush(stdout); #if 1 // DEBUG: print current beams for this iteration @@ -145,7 +147,7 @@ int main(int argc, char ** argv) if (tokens_list.size() > max_tokens_list_size) { - fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , + fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , __func__ , tokens_list.size() , max_tokens_list_size ); return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b485a5ead..09eac2ec2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -17,6 +17,8 @@ #include "completion.js.hpp" #include "json-schema-to-grammar.mjs.hpp" +#include + #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif @@ -1038,7 +1040,7 @@ static json format_timings(llama_server_context &llama) { const auto timings = llama_get_timings(llama.ctx); - assert(timings.n_eval == llama.num_tokens_predicted); + assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted)); return json{ {"prompt_n", timings.n_p_eval}, @@ -1239,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { const llama_token * tokens = beams_state.beam_views[0].tokens; const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); - printf("%lu", n); + printf("%zu", n); } fflush(stdout); #if 0 // DEBUG: print current beams for this iteration @@ -1548,7 +1550,7 @@ int main(int argc, char **argv) svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - const auto * fmt = "500 Internal Server Error\n%s"; + const char fmt[] = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try { std::rethrow_exception(std::move(ep)); diff --git a/k_quants.c b/k_quants.c index 3a9b1dafd..3deeaedf7 100644 --- a/k_quants.c +++ b/k_quants.c @@ -183,13 +183,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t int ntry, float alpha) { float min = x[0]; float max = x[0]; - float sum_x = 0; - float sum_x2 = 0; for (int i = 1; i < n; ++i) { if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; - sum_x += x[i]; - sum_x2 += x[i]*x[i]; } if (max == min) { for (int i = 0; i < n; ++i) L[i] = 0; @@ -2060,7 +2056,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); - uint32_t *aux; + const uint32_t *aux; for (int i = 0; i < nb; ++i) { @@ -2070,7 +2066,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * restrict q8 = y[i].qs; // Set up scales - aux = (uint32_t *)x[i].scales; + aux = (const uint32_t *)x[i].scales; __m128i scales128 = _mm_set_epi32( ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), diff --git a/llama.cpp b/llama.cpp index 98a5da963..5ca119238 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3600,7 +3600,7 @@ static void llama_grammar_advance_stack( std::vector> & new_stacks) { if (stack.empty()) { - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); return; } @@ -3637,7 +3637,7 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); break; default: // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range From 49bb9cbe0f598bc43be539b0df8eafb2130cfad3 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Fri, 1 Sep 2023 15:36:14 +0200 Subject: [PATCH 11/18] docs : add java-llama.cpp to README.md (#2935) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d727b0554..4e6a0957d 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ as the main playground for developing new features for the [ggml](https://github - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) +- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) **UI:** From ee8654bcd0146708988a703e54406d5b553712ea Mon Sep 17 00:00:00 2001 From: m3ndax Date: Fri, 1 Sep 2023 15:47:27 +0200 Subject: [PATCH 12/18] minor : add const qualifiers (#2853) * made the methods const # Conflicts: # examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp * made method const * Update convert-llama2c-to-ggml.cpp removed write_raw and write_u32 * llama2c : remove misleading const --------- Co-authored-by: Georgi Gerganov --- examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 2 +- llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 0b03c9d2b..0ee7adc52 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) } } -void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ +void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; switch (gg_weights->n_dims){ case 1: diff --git a/llama.cpp b/llama.cpp index 5ca119238..23b251caf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4393,7 +4393,7 @@ struct llama_logit_info { } return min_heap; } - float probability_from_logit(float logit) { + float probability_from_logit(float logit) const { return normalizer * std::exp(logit - max_l); } }; From 6c9c23429bf4e4fcaaddbebadc4638558430a7f2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 1 Sep 2023 09:53:14 -0400 Subject: [PATCH 13/18] make : use unaligned vector moves on MinGW (#2945) Fixes #2922 --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index ef1eef6ac..23f050c0d 100644 --- a/Makefile +++ b/Makefile @@ -177,6 +177,14 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #CXXFLAGS += -mssse3 endif +# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 +# https://github.com/ggerganov/llama.cpp/issues/2922 +ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' + CFLAGS += -Xassembler -muse-unaligned-vector-move + CXXFLAGS += -Xassembler -muse-unaligned-vector-move +endif + ifneq ($(filter aarch64%,$(UNAME_M)),) # Apple M1, M2, etc. # Raspberry Pi 3, 4, Zero 2 (64-bit) From 0d5893668625456c94bbadfddc53fc69cd51c223 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Sep 2023 17:00:40 +0300 Subject: [PATCH 14/18] llama2c : rename function --- .../convert-llama2c-to-ggml.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 0ee7adc52..9e856c21a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) } } -void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { +void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; switch (gg_weights->n_dims){ case 1: @@ -674,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, const float } void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - // stuff AK weights into GG weights one by one. + // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor - stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); + convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table); + convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); + convert_weights_ak_to_gg(model->norm, w->rms_final_weight); //print_row(model->norm, 0); // for rms-att-weight @@ -690,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d - stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); - stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); + convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); + convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); // from 3d matrix layer x dim x dim to 2d matrix dim x dim - stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); - stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); - stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); + convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); + convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); + convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]); } struct gguf_context * ctx = gguf_init_empty(); From 5d6f19f16b2173afe2d5c6aee2f5c9fc31038eba Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Fri, 1 Sep 2023 08:02:48 -0600 Subject: [PATCH 15/18] Allow quantize to only copy tensors, some other improvements (#2931) * Allow quantize tool to only copy tensors to allow repackaging models. * Slightly better logic when requantizing. * Change help message to go to `stdout`. --- examples/quantize/quantize.cpp | 24 +++++++++++++++++++----- llama.cpp | 25 +++++++++++++++++-------- llama.h | 1 + 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index df9a214fc..c174be069 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -35,6 +35,8 @@ static const std::vector QUANT_OPTIONS = { { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; @@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // void usage(const char * executable) { - fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); - fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - fprintf(stderr, "\nAllowed quantization types:\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { - printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); + if (it.name != "COPY") { + printf(" %2d or ", it.ftype); + } else { + printf(" "); + } + printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); } exit(1); } @@ -121,6 +128,9 @@ int main(int argc, char ** argv) { // export as [inp path]/ggml-model-[ftype].gguf fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; arg_idx++; + if (ftype_str == "COPY") { + params.only_copy = true; + } } else { fname_out = argv[arg_idx]; @@ -133,6 +143,10 @@ int main(int argc, char ** argv) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); return 1; + } else { + if (ftype_str == "COPY") { + params.only_copy = true; + } } arg_idx++; } diff --git a/llama.cpp b/llama.cpp index 23b251caf..3114d3311 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4683,6 +4683,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(*ml, model); llm_load_hparams(*ml, model, 0, 0, 0); + if (params->only_copy) { + ftype = model.ftype; + } + const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -4769,18 +4773,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize only 2D tensors quantize &= (tensor->n_dims == 2); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= quantized_type != tensor->type; + quantize &= !params->only_copy; enum ggml_type new_type; void * new_data; size_t new_size; - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { + if (quantize) { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -4879,7 +4878,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } #endif - + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + quantize = tensor->type != new_type; + } + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { const size_t nelements = ggml_nelements(tensor); float * f32_data; @@ -5310,6 +5318,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, + /*.only_copy =*/ false, }; return result; diff --git a/llama.h b/llama.h index 6e5e1df63..422f28527 100644 --- a/llama.h +++ b/llama.h @@ -164,6 +164,7 @@ extern "C" { enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored } llama_model_quantize_params; // grammar types From 69fdbb9abc8907dd2a9ffdd840cba92d678a660a Mon Sep 17 00:00:00 2001 From: ZHAOKAI WANG Date: Fri, 1 Sep 2023 22:06:44 +0800 Subject: [PATCH 16/18] readme : quick start command fix (#2908) * quick start command fix * quick start win command fix --- examples/main/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index d555afdcc..2773fe976 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -34,7 +34,7 @@ For an interactive experience, try this command: #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ 'User: Hi AI: Hello. I am an AI chatbot. Would you like to talk? User: Sure! @@ -45,7 +45,7 @@ User:' #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" ``` The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): From f04d0028444bc9b3d4225fba47e19d4c3aeb3741 Mon Sep 17 00:00:00 2001 From: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Fri, 1 Sep 2023 15:33:19 -0600 Subject: [PATCH 17/18] cuda : vsubss4 for older versions of ROCm/clang (#2942) --- ggml-cuda.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5fd625630..8357f32f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,12 +81,29 @@ #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) } static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { From 571083f508266c4eb5cb5457d836df5dd3c173ce Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Sat, 2 Sep 2023 08:31:46 +0800 Subject: [PATCH 18/18] server : avoid aniprompt in probabilities of final response (#2849) --- examples/server/server.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 09eac2ec2..94def943b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1379,7 +1379,13 @@ int main(int argc, char **argv) } } - const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); + auto probs = llama.generated_token_probs; + if (llama.params.n_probs > 0 && llama.stopped_word) { + const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); + probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); + } + + const json data = format_final_response(llama, llama.generated_text, probs); llama_print_timings(llama.ctx); @@ -1456,7 +1462,11 @@ int main(int argc, char **argv) if (!llama.has_next_token) { // Generation is done, send extra information. - const json data = format_final_response(llama, "", llama.generated_token_probs); + const json data = format_final_response( + llama, + "", + std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index) + ); const std::string str = "data: " +