Merge branch 'ggerganov:master' into gguf-model-template
This commit is contained in:
commit
5481cec7c9
82 changed files with 13507 additions and 1470 deletions
|
@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
|
|||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
|||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
|||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
@ -25,7 +25,7 @@ ENV GGML_CUDA=1
|
|||
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
|
@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
|||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
|||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget libgomp1
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
@ -11,7 +11,7 @@ COPY . .
|
|||
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
|||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
|
|||
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
|
@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
|||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev curl
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
|||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git libcurl4-openssl-dev curl
|
||||
|
@ -13,7 +13,7 @@ ENV LLAMA_CURL=1
|
|||
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||
|
|
|
@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
|||
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
||||
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
||||
|
||||
#
|
||||
# build the library
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
- Test your changes:
|
||||
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
|
||||
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
|
||||
|
||||
|
|
8
Makefile
8
Makefile
|
@ -795,6 +795,14 @@ ifdef GGML_CUDA_FORCE_DMMV
|
|||
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # GGML_CUDA_FORCE_DMMV
|
||||
|
||||
ifdef GGML_CUDA_FORCE_MMQ
|
||||
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||
endif # GGML_CUDA_FORCE_MMQ
|
||||
|
||||
ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
|
||||
endif # GGML_CUDA_FORCE_CUBLAS
|
||||
|
||||
ifdef GGML_CUDA_NO_PEER_COPY
|
||||
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
||||
endif # GGML_CUDA_NO_PEER_COPY
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||

|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
|
||||
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
|
||||
[](https://conan.io/center/llama-cpp)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
|
|
|
@ -685,7 +685,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
if (arg == "--lora") {
|
||||
CHECK_ARG
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.use_mmap = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--lora-scaled") {
|
||||
|
@ -693,7 +692,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
const char* lora_adapter = argv[i];
|
||||
CHECK_ARG
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.use_mmap = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--lora-base") {
|
||||
|
@ -2089,19 +2087,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|||
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
|
||||
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
|
||||
float lora_scale = std::get<1>(params.lora_adapter[i]);
|
||||
int err = llama_model_apply_lora_from_file(model,
|
||||
lora_adapter.c_str(),
|
||||
lora_scale,
|
||||
((i > 0) || params.lora_base.empty())
|
||||
? NULL
|
||||
: params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
|
||||
if (adapter == nullptr) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
llama_lora_adapter_set(lctx, adapter, lora_scale);
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
|
|
|
@ -37,11 +37,18 @@ struct llama_ngram {
|
|||
}
|
||||
};
|
||||
|
||||
struct llama_token_hash_function {
|
||||
size_t operator()(const llama_token token) const {
|
||||
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||
return token * 11400714819323198485llu;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_ngram_hash_function {
|
||||
size_t operator()(const llama_ngram & ngram) const {
|
||||
size_t hash = 0;
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
|
||||
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
|
||||
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
|
|
@ -48,34 +48,38 @@ class Model:
|
|||
|
||||
dir_model: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
fname_out: Path | None
|
||||
is_big_endian: bool
|
||||
endianess: gguf.GGUFEndian
|
||||
use_temp_file: bool
|
||||
lazy: bool
|
||||
model_name: str | None
|
||||
part_names: list[str]
|
||||
is_safetensors: bool
|
||||
hparams: dict[str, Any]
|
||||
block_count: int
|
||||
tensor_map: gguf.TensorNameMap
|
||||
tensor_names: set[str] | None
|
||||
fname_out: Path
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
model_name: str | None
|
||||
metadata_override: Path | None
|
||||
|
||||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
|
||||
model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
if type(self) is Model:
|
||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||
|
||||
self.dir_model = dir_model
|
||||
self.ftype = ftype
|
||||
self.fname_out = fname_out
|
||||
self.is_big_endian = is_big_endian
|
||||
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||
self.use_temp_file = use_temp_file
|
||||
self.lazy = not eager
|
||||
self.model_name = model_name
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
|
||||
self.is_safetensors = len(self.part_names) > 0
|
||||
if not self.is_safetensors:
|
||||
|
@ -84,6 +88,10 @@ class Model:
|
|||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
self.tensor_names = None
|
||||
self.metadata_override = metadata_override
|
||||
self.model_name = model_name
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
_, first_tensor = next(self.get_tensors())
|
||||
|
@ -93,10 +101,8 @@ class Model:
|
|||
else:
|
||||
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
ftype_up: str = self.ftype.name.partition("_")[2].upper()
|
||||
ftype_lw: str = ftype_up.lower()
|
||||
# allow templating the file name with the output ftype, useful with the "auto" ftype
|
||||
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
|
||||
|
||||
# Configure GGUF Writer
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
||||
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
|
||||
|
||||
|
@ -148,7 +154,14 @@ class Model:
|
|||
tensor_names_from_parts.update(model_part.keys())
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
|
||||
if self.is_safetensors:
|
||||
if self.lazy:
|
||||
data = model_part.get_slice(name)
|
||||
data = LazyTorchTensor.from_safetensors_slice(data)
|
||||
else:
|
||||
data = model_part.get_tensor(name)
|
||||
else:
|
||||
data = model_part[name]
|
||||
if self.lazy:
|
||||
data = LazyTorchTensor.from_eager(data)
|
||||
yield name, data
|
||||
|
@ -186,7 +199,6 @@ class Model:
|
|||
return new_name
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
|
||||
|
@ -243,7 +255,7 @@ class Model:
|
|||
|
||||
return False
|
||||
|
||||
def write_tensors(self):
|
||||
def prepare_tensors(self):
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
|
@ -326,9 +338,67 @@ class Model:
|
|||
|
||||
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.MODEL)
|
||||
|
||||
def prepare_metadata(self, vocab_only: bool):
|
||||
|
||||
total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
|
||||
|
||||
self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
|
||||
|
||||
# Fallback to model directory name if metadata name is still missing
|
||||
if self.metadata.name is None:
|
||||
self.metadata.name = self.dir_model.name
|
||||
|
||||
# Generate parameter weight class (useful for leader boards) if not yet determined
|
||||
if self.metadata.size_label is None and total_params > 0:
|
||||
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
|
||||
|
||||
# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
||||
output_type: str = self.ftype.name.partition("_")[2]
|
||||
|
||||
# Filename Output
|
||||
# Note: `not is_dir()` is used because `.is_file()` will not detect
|
||||
# file template strings as it doesn't actually exist as a file
|
||||
if self.fname_out is not None and not self.fname_out.is_dir():
|
||||
# Output path is a custom defined templated filename
|
||||
|
||||
# Process templated file name with the output ftype, useful with the "auto" ftype
|
||||
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
|
||||
else:
|
||||
# Generate default filename based on model specification and available metadata
|
||||
if not vocab_only:
|
||||
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
|
||||
else:
|
||||
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
|
||||
|
||||
# Check if preferred output directory path was provided
|
||||
if self.fname_out is not None and self.fname_out.is_dir():
|
||||
# output path is a directory
|
||||
self.fname_out = self.fname_out / f"{fname_default}.gguf"
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
self.fname_out = self.dir_model / f"{fname_default}.gguf"
|
||||
|
||||
self.set_type()
|
||||
|
||||
logger.info("Set meta model")
|
||||
self.metadata.set_gguf_meta_model(self.gguf_writer)
|
||||
|
||||
logger.info("Set model parameters")
|
||||
self.set_gguf_parameters()
|
||||
|
||||
logger.info("Set model tokenizer")
|
||||
self.set_vocab()
|
||||
|
||||
logger.info("Set model quantization version")
|
||||
self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
||||
|
||||
def write(self):
|
||||
self.write_tensors()
|
||||
self.gguf_writer.write_header_to_file(self.fname_out)
|
||||
self.prepare_tensors()
|
||||
self.prepare_metadata(vocab_only=False)
|
||||
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.write_tensors_to_file(progress=True)
|
||||
self.gguf_writer.close()
|
||||
|
@ -336,7 +406,9 @@ class Model:
|
|||
def write_vocab(self):
|
||||
if len(self.gguf_writer.tensors) != 1:
|
||||
raise ValueError('Splitting the vocabulary is not supported')
|
||||
self.gguf_writer.write_header_to_file(self.fname_out)
|
||||
|
||||
self.prepare_metadata(vocab_only=True)
|
||||
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.close()
|
||||
|
||||
|
@ -773,7 +845,6 @@ class GPTNeoXModel(Model):
|
|||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -829,7 +900,6 @@ class BloomModel(Model):
|
|||
model_arch = gguf.MODEL_ARCH.BLOOM
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("Bloom")
|
||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
||||
|
@ -906,7 +976,6 @@ class MPTModel(Model):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layers"]
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -945,7 +1014,6 @@ class OrionModel(Model):
|
|||
block_count = self.hparams["num_hidden_layers"]
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||
hf_repo = self.hparams.get("_name_or_path", "")
|
||||
|
||||
ctx_length = 0
|
||||
if "max_sequence_length" in self.hparams:
|
||||
|
@ -958,8 +1026,6 @@ class OrionModel(Model):
|
|||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
|
@ -983,7 +1049,6 @@ class BaichuanModel(Model):
|
|||
block_count = self.hparams["num_hidden_layers"]
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||
hf_repo = self.hparams.get("_name_or_path", "")
|
||||
|
||||
ctx_length = 0
|
||||
if "max_sequence_length" in self.hparams:
|
||||
|
@ -995,8 +1060,6 @@ class BaichuanModel(Model):
|
|||
else:
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
|
@ -1110,7 +1173,6 @@ class XverseModel(Model):
|
|||
block_count = self.hparams["num_hidden_layers"]
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||
hf_repo = self.hparams.get("_name_or_path", "")
|
||||
|
||||
ctx_length = 0
|
||||
if "max_sequence_length" in self.hparams:
|
||||
|
@ -1122,8 +1184,6 @@ class XverseModel(Model):
|
|||
else:
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
|
@ -1182,7 +1242,6 @@ class FalconModel(Model):
|
|||
if n_head_kv is None:
|
||||
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
|
||||
|
||||
self.gguf_writer.add_name("Falcon")
|
||||
self.gguf_writer.add_context_length(2048) # not in config.json
|
||||
self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
|
@ -1227,7 +1286,6 @@ class StarCoderModel(Model):
|
|||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
||||
self.gguf_writer.add_name("StarCoder")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
|
@ -1262,7 +1320,6 @@ class RefactModel(Model):
|
|||
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
||||
self.gguf_writer.add_name("Refact")
|
||||
# refact uses Alibi. So this is from config.json which might be used by training.
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
|
@ -1317,7 +1374,6 @@ class StableLMModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -1379,8 +1435,8 @@ class StableLMModel(Model):
|
|||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._q_norms is not None or self._k_norms is not None:
|
||||
# flatten two `list[dict[str, Tensor]]` into a single `list[str]`
|
||||
|
@ -1496,8 +1552,8 @@ class LlamaModel(Model):
|
|||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
|
@ -1560,7 +1616,6 @@ class GrokModel(Model):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_name("Grok")
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
|
@ -1609,7 +1664,6 @@ class DbrxModel(Model):
|
|||
def set_gguf_parameters(self):
|
||||
ffn_config = self.hparams["ffn_config"]
|
||||
attn_config = self.hparams["attn_config"]
|
||||
self.gguf_writer.add_name(self.hparams["model_type"])
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layers"])
|
||||
|
||||
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
||||
|
@ -1678,7 +1732,6 @@ class MiniCPMModel(Model):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
self.gguf_writer.add_name("MiniCPM")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -1748,7 +1801,6 @@ class QwenModel(Model):
|
|||
self._set_vocab_qwen()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("Qwen")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
|
@ -1824,8 +1876,8 @@ class Qwen2MoeModel(Model):
|
|||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
|
@ -1839,7 +1891,6 @@ class GPT2Model(Model):
|
|||
model_arch = gguf.MODEL_ARCH.GPT2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
|
@ -1882,7 +1933,6 @@ class Phi2Model(Model):
|
|||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
|
||||
self.gguf_writer.add_name("Phi2")
|
||||
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
||||
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
|
@ -2004,7 +2054,6 @@ class Phi3MiniModel(Model):
|
|||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rope_dims = n_embd // n_head
|
||||
|
||||
self.gguf_writer.add_name("Phi3")
|
||||
self.gguf_writer.add_context_length(max_pos_embds)
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
|
@ -2061,7 +2110,6 @@ class PlamoModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name("PLaMo")
|
||||
self.gguf_writer.add_context_length(4096) # not in config.json
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
|
@ -2106,7 +2154,6 @@ class CodeShellModel(Model):
|
|||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
||||
self.gguf_writer.add_name("CodeShell")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
|
@ -2264,15 +2311,7 @@ class InternLM2Model(Model):
|
|||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("InternLM2")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
|
@ -2290,26 +2329,22 @@ class InternLM2Model(Model):
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
num_kv_heads = self.hparams["num_key_value_heads"]
|
||||
hidden_size = self.hparams["hidden_size"]
|
||||
n_embd = self.hparams["hidden_size"]
|
||||
q_per_kv = num_heads // num_kv_heads
|
||||
head_dim = hidden_size // num_heads
|
||||
head_dim = n_embd // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
|
||||
|
||||
if re.match(qkv_pattern, name):
|
||||
bid = re.findall(qkv_pattern, name)[0]
|
||||
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
|
||||
qkv = data_torch
|
||||
# qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
|
||||
qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
|
||||
q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
|
||||
|
||||
qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
|
||||
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
||||
|
||||
# The model weights of q and k equire additional reshape.
|
||||
# q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
|
||||
q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
|
||||
# k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
|
||||
k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
|
||||
# v = rearrange(v, " o g n i -> o (g n i)").T
|
||||
v = v.reshape((v.shape[0], -1)).T
|
||||
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||
v = v.reshape((-1, v.shape[-1]))
|
||||
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
|
||||
|
@ -2444,7 +2479,6 @@ class GemmaModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -2485,7 +2519,6 @@ class Gemma2Model(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -2560,7 +2593,6 @@ class MambaModel(Model):
|
|||
# Fail early for models which don't have a block expansion factor of 2
|
||||
assert d_inner == 2 * d_model
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
||||
self.gguf_writer.add_embedding_length(d_model)
|
||||
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
||||
|
@ -2739,7 +2771,6 @@ class OpenELMModel(Model):
|
|||
assert self.block_count == len(self._num_query_heads)
|
||||
assert self.block_count == len(self._ffn_dims)
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_context_length(self.hparams["max_context_length"])
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
|
@ -2913,8 +2944,8 @@ class ArcticModel(Model):
|
|||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
|
@ -2992,8 +3023,8 @@ class DeepseekV2Model(Model):
|
|||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
|
@ -3111,7 +3142,6 @@ class T5Model(Model):
|
|||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("T5")
|
||||
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
||||
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
||||
n_ctx = 512
|
||||
|
@ -3185,7 +3215,6 @@ class JaisModel(Model):
|
|||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
|
@ -3231,8 +3260,8 @@ class JaisModel(Model):
|
|||
|
||||
return tensors
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
||||
|
||||
|
||||
|
@ -3391,7 +3420,6 @@ class ChatGLMModel(Model):
|
|||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
|
||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||
n_head_kv = self.hparams.get("multi_query_group_num", n_head)
|
||||
|
@ -3435,19 +3463,46 @@ class LazyTorchTensor(gguf.LazyBase):
|
|||
torch.float32: np.float32,
|
||||
}
|
||||
|
||||
# used for safetensors slices
|
||||
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
||||
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
||||
_dtype_str_map: dict[str, torch.dtype] = {
|
||||
"F64": torch.float64,
|
||||
"F32": torch.float32,
|
||||
"BF16": torch.bfloat16,
|
||||
"F16": torch.float16,
|
||||
# "U64": torch.uint64,
|
||||
"I64": torch.int64,
|
||||
# "U32": torch.uint32,
|
||||
"I32": torch.int32,
|
||||
# "U16": torch.uint16,
|
||||
"I16": torch.int16,
|
||||
"U8": torch.uint8,
|
||||
"I8": torch.int8,
|
||||
"BOOL": torch.bool,
|
||||
"F8_E4M3": torch.float8_e4m3fn,
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
lazy=self._lazy,
|
||||
args=(self,),
|
||||
func=(lambda s: s[0].numpy())
|
||||
func=(lambda s: s.numpy())
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
||||
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||
|
||||
@classmethod
|
||||
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
|
||||
dtype = cls._dtype_str_map[st_slice.get_dtype()]
|
||||
shape: tuple[int, ...] = tuple(st_slice.get_shape())
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
@ -3458,7 +3513,7 @@ class LazyTorchTensor(gguf.LazyBase):
|
|||
if func is torch.Tensor.numpy:
|
||||
return args[0].numpy()
|
||||
|
||||
return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
|
||||
return cls._wrap_fn(func)(*args, **kwargs)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
|
@ -3516,6 +3571,10 @@ def parse_args() -> argparse.Namespace:
|
|||
"--no-tensor-first-split", action="store_true",
|
||||
help="do not add tensors to the first split (disabled by default)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--metadata", type=Path,
|
||||
help="Specify the path for an authorship metadata override file"
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
@ -3541,7 +3600,10 @@ def split_str_to_n_bytes(split_str: str) -> int:
|
|||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
dir_model = args.model
|
||||
|
||||
|
@ -3562,36 +3624,33 @@ def main() -> None:
|
|||
logger.error("Error: Cannot use temp file when splitting")
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = None
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / 'ggml-model-{ftype}.gguf'
|
||||
|
||||
logger.info(f"Loading model: {dir_model.name}")
|
||||
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
|
||||
with torch.inference_mode():
|
||||
output_type = ftype_map[args.outtype]
|
||||
model_architecture = hparams["architectures"][0]
|
||||
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_class = Model.from_model_architecture(model_architecture)
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
logger.error(f"Model {model_architecture} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
|
||||
args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
|
||||
model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
|
||||
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
|
||||
eager=args.no_lazy,
|
||||
metadata_override=args.metadata, model_name=args.model_name,
|
||||
split_max_tensors=args.split_max_tensors,
|
||||
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
||||
small_first_shard=args.no_tensor_first_split)
|
||||
|
||||
logger.info("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
|
||||
logger.info("Set model tokenizer")
|
||||
model_instance.set_vocab()
|
||||
|
||||
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
||||
|
||||
if args.vocab_only:
|
||||
logger.info("Exporting model vocab...")
|
||||
model_instance.write_vocab()
|
||||
|
@ -3599,6 +3658,7 @@ def main() -> None:
|
|||
else:
|
||||
logger.info("Exporting model...")
|
||||
model_instance.write()
|
||||
assert model_instance.fname_out is not None
|
||||
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
|
||||
logger.info(f"Model successfully exported to {out_path}")
|
||||
|
||||
|
|
383
convert_lora_to_gguf.py
Executable file
383
convert_lora_to_gguf.py
Executable file
|
@ -0,0 +1,383 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
# reuse model definitions from convert_hf_to_gguf.py
|
||||
from convert_hf_to_gguf import LazyTorchTensor, Model
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
|
||||
@dataclass
|
||||
class PartialLoraTensor:
|
||||
A: Tensor | None = None
|
||||
B: Tensor | None = None
|
||||
|
||||
|
||||
# magic to support tensor shape modifications and splitting
|
||||
class LoraTorchTensor:
|
||||
_lora_A: Tensor # (n_rank, row_size)
|
||||
_lora_B: Tensor # (col_size, n_rank)
|
||||
_rank: int
|
||||
|
||||
def __init__(self, A: Tensor, B: Tensor):
|
||||
assert len(A.shape) == len(B.shape)
|
||||
assert A.shape[-2] == B.shape[-1]
|
||||
if A.dtype != B.dtype:
|
||||
A = A.to(torch.float32)
|
||||
B = B.to(torch.float32)
|
||||
self._lora_A = A
|
||||
self._lora_B = B
|
||||
self._rank = B.shape[-1]
|
||||
|
||||
def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
|
||||
return (self._lora_A, self._lora_B)
|
||||
|
||||
def __getitem__(
|
||||
self,
|
||||
indices: (
|
||||
SupportsIndex
|
||||
| slice
|
||||
| tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
|
||||
),
|
||||
) -> LoraTorchTensor:
|
||||
shape = self.shape
|
||||
if isinstance(indices, SupportsIndex):
|
||||
if len(shape) > 2:
|
||||
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
|
||||
else:
|
||||
raise NotImplementedError # can't return a vector
|
||||
elif isinstance(indices, slice):
|
||||
if len(shape) > 2:
|
||||
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
|
||||
else:
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B[indices])
|
||||
elif isinstance(indices, tuple):
|
||||
assert len(indices) > 0
|
||||
if indices[-1] is Ellipsis:
|
||||
return self[indices[:-1]]
|
||||
# expand ellipsis
|
||||
indices = tuple(
|
||||
u
|
||||
for v in (
|
||||
(
|
||||
(slice(None, None) for _ in range(len(indices) - 1))
|
||||
if i is Ellipsis
|
||||
else (i,)
|
||||
)
|
||||
for i in indices
|
||||
)
|
||||
for u in v
|
||||
)
|
||||
|
||||
if len(indices) < len(shape):
|
||||
indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
|
||||
|
||||
# TODO: make sure this is correct
|
||||
indices_A = (
|
||||
*(
|
||||
(
|
||||
j.__index__() % self._lora_A.shape[i]
|
||||
if isinstance(j, SupportsIndex)
|
||||
else slice(None, None)
|
||||
)
|
||||
for i, j in enumerate(indices[:-2])
|
||||
),
|
||||
slice(None, None),
|
||||
indices[-1],
|
||||
)
|
||||
indices_B = indices[:-1]
|
||||
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
|
||||
else:
|
||||
raise NotImplementedError # unknown indice type
|
||||
|
||||
@property
|
||||
def dtype(self) -> torch.dtype:
|
||||
assert self._lora_A.dtype == self._lora_B.dtype
|
||||
return self._lora_A.dtype
|
||||
|
||||
@property
|
||||
def shape(self) -> tuple[int, ...]:
|
||||
assert len(self._lora_A.shape) == len(self._lora_B.shape)
|
||||
return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
|
||||
|
||||
def size(self, dim=None):
|
||||
assert dim is None
|
||||
return self.shape
|
||||
|
||||
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
|
||||
if isinstance(shape[0], tuple):
|
||||
new_shape: tuple[int, ...] = shape[0]
|
||||
else:
|
||||
new_shape = cast(tuple[int, ...], shape)
|
||||
orig_shape = self.shape
|
||||
if len(new_shape) < 2:
|
||||
raise NotImplementedError # can't become a vector
|
||||
|
||||
# expand -1 in the shape
|
||||
if any(dim == -1 for dim in new_shape):
|
||||
n_elems = prod(orig_shape)
|
||||
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
|
||||
assert n_elems % n_new_elems == 0
|
||||
new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
|
||||
|
||||
if new_shape[-1] != orig_shape[-1]:
|
||||
raise NotImplementedError # can't reshape the row size trivially
|
||||
|
||||
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
|
||||
shape_B = (*new_shape[:-1], self._rank)
|
||||
return LoraTorchTensor(
|
||||
self._lora_A.reshape(shape_A),
|
||||
self._lora_B.reshape(shape_B),
|
||||
)
|
||||
|
||||
def reshape_as(self, other: Tensor) -> LoraTorchTensor:
|
||||
return self.reshape(*other.shape)
|
||||
|
||||
def view(self, *size: int) -> LoraTorchTensor:
|
||||
return self.reshape(*size)
|
||||
|
||||
def permute(self, *dims: int) -> LoraTorchTensor:
|
||||
shape = self.shape
|
||||
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
|
||||
if dims[-1] == -1:
|
||||
# TODO: support higher dimensional A shapes bigger than 1
|
||||
assert all(dim == 1 for dim in self._lora_A.shape[:-2])
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
|
||||
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
|
||||
return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
|
||||
else:
|
||||
# TODO: compose the above two
|
||||
raise NotImplementedError
|
||||
|
||||
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
|
||||
shape = self.shape
|
||||
dims = [i for i in range(len(shape))]
|
||||
dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
|
||||
return self.permute(*dims)
|
||||
|
||||
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
|
||||
return self.transpose(axis0, axis1)
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
|
||||
if func is torch.permute:
|
||||
return type(args[0]).permute(*args, **kwargs)
|
||||
elif func is torch.reshape:
|
||||
return type(args[0]).reshape(*args, **kwargs)
|
||||
elif func is torch.stack:
|
||||
assert isinstance(args[0], Sequence)
|
||||
dim = kwargs.get("dim", 0)
|
||||
assert dim == 0
|
||||
return LoraTorchTensor(
|
||||
torch.stack([a._lora_A for a in args[0]], dim),
|
||||
torch.stack([b._lora_B for b in args[0]], dim),
|
||||
)
|
||||
elif func is torch.cat:
|
||||
assert isinstance(args[0], Sequence)
|
||||
dim = kwargs.get("dim", 0)
|
||||
assert dim == 0
|
||||
if len(args[0][0].shape) > 2:
|
||||
return LoraTorchTensor(
|
||||
torch.cat([a._lora_A for a in args[0]], dim),
|
||||
torch.cat([b._lora_B for b in args[0]], dim),
|
||||
)
|
||||
elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
|
||||
return LoraTorchTensor(
|
||||
args[0][0]._lora_A,
|
||||
torch.cat([b._lora_B for b in args[0]], dim),
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def get_base_tensor_name(lora_tensor_name: str) -> str:
|
||||
base_name = lora_tensor_name.replace("base_model.model.", "")
|
||||
base_name = base_name.replace(".lora_A.weight", ".weight")
|
||||
base_name = base_name.replace(".lora_B.weight", ".weight")
|
||||
return base_name
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
help="model is executed on big endian machine",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-lazy", action="store_true",
|
||||
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", action="store_true",
|
||||
help="increase output verbosity",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run", action="store_true",
|
||||
help="only print out what will be done, without writing any new files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base", type=Path, required=True,
|
||||
help="directory containing base model file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
help="directory containing LoRA adapter file",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
}
|
||||
|
||||
ftype = ftype_map[args.outtype]
|
||||
|
||||
dir_base_model: Path = args.base
|
||||
dir_lora: Path = args.lora_path
|
||||
lora_config = dir_lora / "adapter_config.json"
|
||||
input_model = dir_lora / "adapter_model.safetensors"
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
|
||||
|
||||
if os.path.exists(input_model):
|
||||
# lazy import load_file only if lora is in safetensors format.
|
||||
from safetensors.torch import load_file
|
||||
|
||||
lora_model = load_file(input_model, device="cpu")
|
||||
else:
|
||||
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
||||
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
||||
|
||||
# load base model
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
|
||||
with open(lora_config, "r") as f:
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
alpha: float = lparams["lora_alpha"]
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
class LoraModel(model_class):
|
||||
model_arch = model_class.model_arch
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
|
||||
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
|
||||
super().set_gguf_parameters()
|
||||
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
tensor_map: dict[str, PartialLoraTensor] = {}
|
||||
|
||||
for name, tensor in lora_model.items():
|
||||
if self.lazy:
|
||||
tensor = LazyTorchTensor.from_eager(tensor)
|
||||
base_name = get_base_tensor_name(name)
|
||||
is_lora_a = ".lora_A.weight" in name
|
||||
is_lora_b = ".lora_B.weight" in name
|
||||
if not is_lora_a and not is_lora_b:
|
||||
if ".base_layer.weight" in name:
|
||||
continue
|
||||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||
sys.exit(1)
|
||||
|
||||
if base_name in tensor_map:
|
||||
if is_lora_a:
|
||||
tensor_map[base_name].A = tensor
|
||||
else:
|
||||
tensor_map[base_name].B = tensor
|
||||
else:
|
||||
if is_lora_a:
|
||||
tensor_map[base_name] = PartialLoraTensor(A=tensor)
|
||||
else:
|
||||
tensor_map[base_name] = PartialLoraTensor(B=tensor)
|
||||
|
||||
for name, tensor in tensor_map.items():
|
||||
assert tensor.A is not None
|
||||
assert tensor.B is not None
|
||||
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = super().modify_tensors(data_torch, name, bid)
|
||||
for dest_name, dest_data in dest:
|
||||
assert isinstance(dest_data, LoraTorchTensor)
|
||||
lora_a, lora_b = dest_data.get_lora_A_B()
|
||||
|
||||
yield (dest_name + ".lora_a", lora_a)
|
||||
yield (dest_name + ".lora_b", lora_b)
|
||||
|
||||
model_instance = LoraModel(
|
||||
dir_base_model,
|
||||
ftype,
|
||||
fname_out,
|
||||
is_big_endian=args.bigendian,
|
||||
use_temp_file=False,
|
||||
eager=args.no_lazy,
|
||||
dry_run=args.dry_run,
|
||||
)
|
||||
|
||||
logger.info("Exporting model...")
|
||||
model_instance.write()
|
||||
logger.info(f"Model successfully exported to {model_instance.fname_out}")
|
|
@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
|
|||
int n_parallel = params.n_parallel;
|
||||
|
||||
// total length of the sequences including the prompt
|
||||
int n_predict = 32;
|
||||
int n_predict = params.n_predict;
|
||||
|
||||
// init LLM
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
|
|||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
|
||||
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
@ -346,42 +346,6 @@ class Params:
|
|||
return params
|
||||
|
||||
|
||||
@dataclass
|
||||
class Metadata:
|
||||
name: Optional[str] = None
|
||||
author: Optional[str] = None
|
||||
version: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
license: Optional[str] = None
|
||||
source_url: Optional[str] = None
|
||||
source_hf_repo: Optional[str] = None
|
||||
|
||||
@staticmethod
|
||||
def load(metadata_path: Path) -> Metadata:
|
||||
if metadata_path is None or not metadata_path.exists():
|
||||
return Metadata()
|
||||
|
||||
with open(metadata_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Create a new Metadata instance
|
||||
metadata = Metadata()
|
||||
|
||||
# Assigning values to Metadata attributes if they exist in the JSON file
|
||||
# This is based on LLM_KV_NAMES mapping in llama.cpp
|
||||
metadata.name = data.get("general.name")
|
||||
metadata.author = data.get("general.author")
|
||||
metadata.version = data.get("general.version")
|
||||
metadata.url = data.get("general.url")
|
||||
metadata.description = data.get("general.description")
|
||||
metadata.license = data.get("general.license")
|
||||
metadata.source_url = data.get("general.source.url")
|
||||
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
#
|
||||
# data loading
|
||||
# TODO: reuse (probably move to gguf.py?)
|
||||
|
@ -806,7 +770,7 @@ class OutputFile:
|
|||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
|
||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||
|
||||
def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
|
||||
def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
|
||||
# Metadata About The Model And Its Provenence
|
||||
name = "LLaMA"
|
||||
if metadata is not None and metadata.name is not None:
|
||||
|
@ -824,16 +788,73 @@ class OutputFile:
|
|||
self.gguf.add_author(metadata.author)
|
||||
if metadata.version is not None:
|
||||
self.gguf.add_version(metadata.version)
|
||||
if metadata.url is not None:
|
||||
self.gguf.add_url(metadata.url)
|
||||
if metadata.organization is not None:
|
||||
self.gguf.add_organization(metadata.organization)
|
||||
|
||||
if metadata.finetune is not None:
|
||||
self.gguf.add_finetune(metadata.finetune)
|
||||
if metadata.basename is not None:
|
||||
self.gguf.add_basename(metadata.basename)
|
||||
|
||||
if metadata.description is not None:
|
||||
self.gguf.add_description(metadata.description)
|
||||
if metadata.quantized_by is not None:
|
||||
self.gguf.add_quantized_by(metadata.quantized_by)
|
||||
|
||||
if metadata.size_label is not None:
|
||||
self.gguf.add_size_label(metadata.size_label)
|
||||
|
||||
if metadata.license is not None:
|
||||
self.gguf.add_licence(metadata.license)
|
||||
self.gguf.add_license(metadata.license)
|
||||
if metadata.license_name is not None:
|
||||
self.gguf.add_license_name(metadata.license_name)
|
||||
if metadata.license_link is not None:
|
||||
self.gguf.add_license_link(metadata.license_link)
|
||||
|
||||
if metadata.url is not None:
|
||||
self.gguf.add_url(metadata.url)
|
||||
if metadata.doi is not None:
|
||||
self.gguf.add_doi(metadata.doi)
|
||||
if metadata.uuid is not None:
|
||||
self.gguf.add_uuid(metadata.uuid)
|
||||
if metadata.repo_url is not None:
|
||||
self.gguf.add_repo_url(metadata.repo_url)
|
||||
|
||||
if metadata.source_url is not None:
|
||||
self.gguf.add_source_url(metadata.source_url)
|
||||
if metadata.source_hf_repo is not None:
|
||||
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
|
||||
if metadata.source_doi is not None:
|
||||
self.gguf.add_source_doi(metadata.source_doi)
|
||||
if metadata.source_uuid is not None:
|
||||
self.gguf.add_source_uuid(metadata.source_uuid)
|
||||
if metadata.source_repo_url is not None:
|
||||
self.gguf.add_source_repo_url(metadata.source_repo_url)
|
||||
|
||||
if metadata.base_models is not None:
|
||||
self.gguf.add_base_model_count(len(metadata.base_models))
|
||||
for key, base_model_entry in enumerate(metadata.base_models):
|
||||
if "name" in base_model_entry:
|
||||
self.gguf.add_base_model_name(key, base_model_entry["name"])
|
||||
if "author" in base_model_entry:
|
||||
self.gguf.add_base_model_author(key, base_model_entry["author"])
|
||||
if "version" in base_model_entry:
|
||||
self.gguf.add_base_model_version(key, base_model_entry["version"])
|
||||
if "organization" in base_model_entry:
|
||||
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
|
||||
if "url" in base_model_entry:
|
||||
self.gguf.add_base_model_url(key, base_model_entry["url"])
|
||||
if "doi" in base_model_entry:
|
||||
self.gguf.add_base_model_doi(key, base_model_entry["doi"])
|
||||
if "uuid" in base_model_entry:
|
||||
self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
|
||||
if "repo_url" in base_model_entry:
|
||||
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
|
||||
|
||||
if metadata.tags is not None:
|
||||
self.gguf.add_tags(metadata.tags)
|
||||
if metadata.languages is not None:
|
||||
self.gguf.add_languages(metadata.languages)
|
||||
if metadata.datasets is not None:
|
||||
self.gguf.add_datasets(metadata.datasets)
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
# Metadata About The Neural Architecture Itself
|
||||
|
@ -944,7 +965,7 @@ class OutputFile:
|
|||
@staticmethod
|
||||
def write_vocab_only(
|
||||
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||
|
||||
|
@ -978,7 +999,7 @@ class OutputFile:
|
|||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
|
||||
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
pad_vocab: bool = False,
|
||||
metadata: Metadata | None = None,
|
||||
metadata: gguf.Metadata | None = None,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||
|
||||
|
@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
|
|||
raise ValueError(f"Unexpected combination of types: {name_to_type}")
|
||||
|
||||
|
||||
def model_parameter_count(model: LazyModel) -> int:
|
||||
total_model_parameters = 0
|
||||
for i, (name, lazy_tensor) in enumerate(model.items()):
|
||||
sum_weights_in_tensor = 1
|
||||
def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
|
||||
total_params = 0
|
||||
shared_params = 0
|
||||
expert_params = 0
|
||||
|
||||
for name, lazy_tensor in tensors:
|
||||
# We don't need these
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
# Got A Tensor
|
||||
sum_weights_in_tensor: int = 1
|
||||
|
||||
# Tensor Volume
|
||||
for dim in lazy_tensor.shape:
|
||||
sum_weights_in_tensor *= dim
|
||||
total_model_parameters += sum_weights_in_tensor
|
||||
return total_model_parameters
|
||||
|
||||
|
||||
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
|
||||
if model_params_count > 1e12 :
|
||||
# Trillions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-12
|
||||
scale_suffix = "T"
|
||||
elif model_params_count > 1e9 :
|
||||
# Billions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-9
|
||||
scale_suffix = "B"
|
||||
elif model_params_count > 1e6 :
|
||||
# Millions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-6
|
||||
scale_suffix = "M"
|
||||
if ".experts." in name:
|
||||
if ".experts.0." in name:
|
||||
expert_params += sum_weights_in_tensor
|
||||
else:
|
||||
# Thousands Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-3
|
||||
scale_suffix = "K"
|
||||
shared_params += sum_weights_in_tensor
|
||||
|
||||
return f"{round(scaled_model_params)}{scale_suffix}"
|
||||
total_params += sum_weights_in_tensor
|
||||
|
||||
return total_params, shared_params, expert_params
|
||||
|
||||
|
||||
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
||||
|
@ -1231,34 +1249,24 @@ class VocabFactory:
|
|||
return vocab, special_vocab
|
||||
|
||||
|
||||
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
|
||||
quantization = {
|
||||
def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
|
||||
name = metadata.name if metadata.name is not None else None
|
||||
basename = metadata.basename if metadata.basename is not None else None
|
||||
finetune = metadata.finetune if metadata.finetune is not None else None
|
||||
version = metadata.version if metadata.version is not None else None
|
||||
size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
|
||||
|
||||
output_type = {
|
||||
GGMLFileType.AllF32: "F32",
|
||||
GGMLFileType.MostlyF16: "F16",
|
||||
GGMLFileType.MostlyQ8_0: "Q8_0",
|
||||
}[file_type]
|
||||
|
||||
parameters = model_parameter_count_rounded_notation(model_params_count)
|
||||
|
||||
expert_count = ""
|
||||
if params.n_experts is not None:
|
||||
expert_count = f"{params.n_experts}x"
|
||||
|
||||
version = ""
|
||||
if metadata is not None and metadata.version is not None:
|
||||
version = f"-{metadata.version}"
|
||||
|
||||
name = "ggml-model"
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = params.path_model.name
|
||||
|
||||
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
|
||||
return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
|
||||
|
||||
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
|
||||
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
|
||||
default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
|
||||
ret = model_paths[0].parent / f"{default_filename}.gguf"
|
||||
if ret in model_paths:
|
||||
logger.error(
|
||||
|
@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
|
||||
parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
|
||||
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
|
||||
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
|
||||
|
@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
metadata = Metadata.load(args.metadata)
|
||||
model_name = args.model_name
|
||||
dir_model = args.model
|
||||
|
||||
metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
|
||||
|
||||
if args.get_outfile:
|
||||
model_plus = load_some_model(args.model)
|
||||
model_plus = load_some_model(dir_model)
|
||||
params = Params.load(model_plus)
|
||||
model = convert_model_names(model_plus.model, params, args.skip_unknown)
|
||||
model_params_count = model_parameter_count(model_plus.model)
|
||||
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
|
||||
|
||||
if (metadata is None or metadata.name is None) and params.path_model is not None:
|
||||
metadata.name = params.path_model.name
|
||||
|
||||
print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
|
||||
return
|
||||
|
||||
if args.no_vocab and args.vocab_only:
|
||||
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
||||
|
||||
if args.dump_single:
|
||||
model_plus = lazy_load_file(args.model)
|
||||
model_plus = lazy_load_file(dir_model)
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
|
||||
if not args.vocab_only:
|
||||
model_plus = load_some_model(args.model)
|
||||
model_plus = load_some_model(dir_model)
|
||||
else:
|
||||
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
||||
|
||||
model_params_count = model_parameter_count(model_plus.model)
|
||||
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
|
||||
model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
|
||||
|
||||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
|
@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
logger.info(f"params = {params}")
|
||||
|
||||
model_parent_path = model_plus.paths[0].parent
|
||||
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
||||
vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
|
||||
vocab_factory = VocabFactory(vocab_path)
|
||||
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
|
||||
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
|
||||
|
@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
|
||||
assert params is not None
|
||||
|
||||
if metadata.name is None and params.path_model is not None:
|
||||
metadata.name = params.path_model.name
|
||||
|
||||
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
|
||||
logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
|
||||
|
||||
logger.info(f"Vocab info: {vocab}")
|
||||
logger.info(f"Special vocab info: {special_vocab}")
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params, args.skip_unknown)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
|
||||
|
||||
metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
|
||||
|
||||
params.ftype = ftype
|
||||
logger.info(f"Writing {outfile}, format {ftype}")
|
||||
|
|
|
@ -190,6 +190,9 @@ static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_
|
|||
if (params->n_threads <= 0) {
|
||||
params->n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(0);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
|
|
|
@ -201,6 +201,6 @@ Verification results for test.gguf.manifest - Success
|
|||
|
||||
These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
|
||||
|
||||
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
|
||||
- https://github.com/Cyan4973/xxHash
|
||||
- https://github.com/clibs/sha1/
|
||||
- https://github.com/jb55/sha256.c
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"name": "xxhash",
|
||||
"version": "0.8.2",
|
||||
"repo": "mofosyne/xxhash",
|
||||
"repo": "Cyan4973/xxhash",
|
||||
"description": "Extremely fast non-cryptographic hash algorithm",
|
||||
"keywords": ["xxhash", "hashing"],
|
||||
"license": "BSD-2-Clause",
|
||||
|
|
|
@ -23,6 +23,10 @@
|
|||
#include "ggml-cuda.h"
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
// utils
|
||||
static uint64_t get_time_ns() {
|
||||
using clock = std::chrono::high_resolution_clock;
|
||||
|
@ -120,6 +124,17 @@ static std::string get_gpu_info() {
|
|||
id += "/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
uint32_t count = ggml_backend_cann_get_device_count();
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
char buf[128];
|
||||
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
if (i < count - 1) {
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// TODO: other backends
|
||||
return id;
|
||||
|
|
|
@ -16,6 +16,10 @@
|
|||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
|
@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
new_clip->backend = ggml_backend_cann_init(0);
|
||||
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
||||
#endif
|
||||
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
|
|
|
@ -31,7 +31,6 @@ int main(int argc, char ** argv){
|
|||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
|
@ -65,7 +64,7 @@ int main(int argc, char ** argv){
|
|||
}
|
||||
|
||||
const int n_input = inp.size();
|
||||
const int n_ctx = params.n_ctx;
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
|
|
@ -39,7 +39,6 @@ int main(int argc, char ** argv){
|
|||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
|
|
|
@ -20,7 +20,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
|
||||
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
||||
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||
|
@ -28,7 +28,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
|
|
|
@ -21,7 +21,7 @@ let generation_settings = null;
|
|||
//
|
||||
export async function* llama(prompt, params = {}, config = {}) {
|
||||
let controller = config.controller;
|
||||
const api_url = config.api_url || "";
|
||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
||||
|
||||
if (!controller) {
|
||||
controller = new AbortController();
|
||||
|
@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => {
|
|||
// Get the model info from the server. This is useful for getting the context window and so on.
|
||||
export const llamaModelInfo = async (config = {}) => {
|
||||
if (!generation_settings) {
|
||||
const api_url = config.api_url || "";
|
||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
||||
const props = await fetch(`${api_url}/props`).then(r => r.json());
|
||||
generation_settings = props.default_generation_settings;
|
||||
}
|
||||
|
|
|
@ -14,10 +14,10 @@
|
|||
<script type="module">
|
||||
import {
|
||||
html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
|
||||
} from '/index.js';
|
||||
} from './index.js';
|
||||
|
||||
import { llama } from '/completion.js';
|
||||
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
|
||||
import { llama } from './completion.js';
|
||||
import { SchemaConverter } from './json-schema-to-grammar.mjs';
|
||||
import { promptFormats } from './prompt-formats.js';
|
||||
import { systemPrompts } from './system-prompts.js'; // multilingual is wip
|
||||
let selected_image = false;
|
||||
|
@ -225,7 +225,7 @@
|
|||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
|
||||
const data = chunk.data;
|
||||
if (data.stop) {
|
||||
while (
|
||||
|
|
|
@ -479,7 +479,7 @@
|
|||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
|
||||
const data = chunk.data;
|
||||
|
||||
if (data.stop) {
|
||||
|
|
|
@ -1182,7 +1182,7 @@ struct server_context {
|
|||
|
||||
bool process_token(completion_token_output & result, server_slot & slot) {
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
|
|
|
@ -194,13 +194,19 @@ endif ()
|
|||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
# all public headers
|
||||
set(GGML_PUBLIC_HEADERS
|
||||
include/ggml.h
|
||||
include/ggml-alloc.h
|
||||
include/ggml-backend.h
|
||||
"${GGML_HEADERS_CUDA}"
|
||||
"${GGML_HEADERS_METAL}"
|
||||
"${GGML_HEADERS_EXTRA}")
|
||||
include/ggml-blas.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml.h
|
||||
include/ggml-kompute.h
|
||||
include/ggml-metal.h
|
||||
include/ggml-rpc.h
|
||||
include/ggml-sycl.h
|
||||
include/ggml-vulkan.h)
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
#if (GGML_METAL)
|
||||
|
|
|
@ -29,6 +29,7 @@ extern "C" {
|
|||
enum ggml_backend_buffer_usage {
|
||||
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
|
||||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
||||
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
|
||||
};
|
||||
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
|
@ -42,6 +43,7 @@ extern "C" {
|
|||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
|
||||
|
|
125
ggml/include/ggml-cann.h
Normal file
125
ggml/include/ggml-cann.h
Normal file
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Maximum number of CANN devices supported.
|
||||
*/
|
||||
#define GGML_CANN_MAX_DEVICES 16
|
||||
|
||||
/**
|
||||
* @brief Initializes the CANN backend for a specified device.
|
||||
*
|
||||
* This function initializes the CANN backend for the given device.
|
||||
* It verifies the device index, allocates a context, and creates a backend
|
||||
* instance.
|
||||
*
|
||||
* @param device The index of the device to initialize.
|
||||
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||
|
||||
/**
|
||||
* @brief Checks if a given backend is a CANN backend.
|
||||
*
|
||||
* This function verifies if the provided backend is a CANN backend by comparing
|
||||
* its GUID with the CANN backend's GUID.
|
||||
*
|
||||
* @param backend The backend instance to check.
|
||||
* @return True if the backend is a CANN backend, false otherwise.
|
||||
*/
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the CANN buffer type for a specified device.
|
||||
*
|
||||
* This function initializes and returns the buffer type interface associated
|
||||
* with the given device. It ensures thread-safe access using a mutex.
|
||||
*
|
||||
* @param device The device index for which to retrieve the buffer type.
|
||||
* @return A pointer to the buffer type interface for the specified device, or
|
||||
* nullptr if the device index is out of range.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t
|
||||
ggml_backend_cann_buffer_type(int32_t device);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the number of CANN devices available.
|
||||
*
|
||||
* This function returns the number of CANN devices available based on
|
||||
* information obtained from `ggml_cann_info()`.
|
||||
*
|
||||
* @return The number of CANN devices available.
|
||||
*/
|
||||
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the description of a specific CANN device.
|
||||
*
|
||||
* This function sets the specified device, retrieves the SoC name,
|
||||
* and writes it into the provided description buffer.
|
||||
*
|
||||
* @param device The device index to retrieve the description for.
|
||||
* @param description Pointer to a buffer where the description will be written.
|
||||
* @param description_size Size of the description buffer.
|
||||
*/
|
||||
GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
|
||||
int32_t device, char* description, size_t description_size);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the memory information of a specific CANN device.
|
||||
*
|
||||
* This function sets the specified device, retrieves the free and total
|
||||
* memory information of the specified type (ACL_HBM_MEM), and stores them
|
||||
* in the provided pointers.
|
||||
*
|
||||
* @param device The device index to retrieve memory information for.
|
||||
* @param free Pointer to a variable where the free memory size will be stored.
|
||||
* @param total Pointer to a variable where the total memory size will be
|
||||
* stored.
|
||||
*/
|
||||
GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
|
||||
size_t* free,
|
||||
size_t* total);
|
||||
|
||||
/**
|
||||
* @brief Set the logging callback for GGML.
|
||||
*
|
||||
* This function sets the logging callback and user data for logging.
|
||||
*
|
||||
* @param log_callback The logging callback to set.
|
||||
* @param user_data User data to pass to the logging callback.
|
||||
*/
|
||||
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
|
||||
void* user_data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -753,6 +753,8 @@ extern "C" {
|
|||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
|
||||
|
@ -2397,6 +2399,7 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
GGML_API int ggml_cpu_has_cann (void);
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
|
|
@ -440,6 +440,10 @@ if (GGML_HIPBLAS)
|
|||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_PEER_COPY)
|
||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||
endif()
|
||||
|
@ -766,6 +770,74 @@ if (GGML_CPU_HBM)
|
|||
target_link_libraries(ggml PUBLIC memkind)
|
||||
endif()
|
||||
|
||||
if (GGML_CANN)
|
||||
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
|
||||
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
|
||||
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
|
||||
endif()
|
||||
|
||||
if (CANN_INSTALL_DIR)
|
||||
# Only Support Linux.
|
||||
if (GGML_CANN)
|
||||
if (NOT UNIX)
|
||||
set(GGML_CANN OFF)
|
||||
message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Supported platforms: x86-64, arm64
|
||||
if (GGML_CANN)
|
||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
|
||||
else()
|
||||
set(GGML_CANN OFF)
|
||||
message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Set header and libs
|
||||
if(GGML_CANN)
|
||||
set(CANN_INCLUDE_DIRS
|
||||
${CANN_INSTALL_DIR}/include
|
||||
${CANN_INSTALL_DIR}/include/aclnn
|
||||
${CANN_INSTALL_DIR}/acllib/include
|
||||
)
|
||||
|
||||
# TODO: find libs
|
||||
link_directories(
|
||||
${CANN_INSTALL_DIR}/lib64
|
||||
)
|
||||
|
||||
add_subdirectory(ggml-cann/kernels)
|
||||
list(APPEND CANN_LIBRARIES
|
||||
ascendcl
|
||||
nnopbase
|
||||
opapi
|
||||
acl_op_compiler
|
||||
ascendc_kernels
|
||||
)
|
||||
|
||||
set(GGML_HEADERS_CANN "../include/ggml-cann.h")
|
||||
file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
|
||||
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
|
||||
|
||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
|
||||
endif()
|
||||
else()
|
||||
set(GGML_CANN OFF)
|
||||
message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
|
||||
endif()
|
||||
|
||||
if(NOT GGML_CANN)
|
||||
message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
function(get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
|
@ -1180,6 +1252,7 @@ add_library(ggml
|
|||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
|
||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
|
||||
ggml-aarch64.c ggml-aarch64.h
|
||||
)
|
||||
|
||||
|
|
|
@ -776,6 +776,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -134,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
|
|||
}
|
||||
}
|
||||
|
||||
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
||||
return buffer->usage;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
||||
return buffer->buft;
|
||||
}
|
||||
|
@ -445,6 +449,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|||
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
|
||||
ggml_backend_kompute_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
extern GGML_CALL int ggml_backend_cann_reg_devices(void);
|
||||
ggml_backend_cann_reg_devices();
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||
|
|
2023
ggml/src/ggml-cann.cpp
Normal file
2023
ggml/src/ggml-cann.cpp
Normal file
File diff suppressed because it is too large
Load diff
168
ggml/src/ggml-cann/.clang-format
Normal file
168
ggml/src/ggml-cann/.clang-format
Normal file
|
@ -0,0 +1,168 @@
|
|||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Google
|
||||
AccessModifierOffset: -1
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveMacros: false
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllConstructorInitializersOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: WithoutElse
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
AlwaysBreakTemplateDeclarations: Yes
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakInheritanceList: BeforeColon
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DeriveLineEnding: true
|
||||
DerivePointerAlignment: true
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '^<ext/.*\.h>'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentCaseLabels: true
|
||||
IndentGotoLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Never
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
RawStringFormats:
|
||||
- Language: Cpp
|
||||
Delimiters:
|
||||
- cc
|
||||
- CC
|
||||
- cpp
|
||||
- Cpp
|
||||
- CPP
|
||||
- 'c++'
|
||||
- 'C++'
|
||||
CanonicalDelimiter: ''
|
||||
BasedOnStyle: google
|
||||
- Language: TextProto
|
||||
Delimiters:
|
||||
- pb
|
||||
- PB
|
||||
- proto
|
||||
- PROTO
|
||||
EnclosingFunctions:
|
||||
- EqualsProto
|
||||
- EquivToProto
|
||||
- PARSE_PARTIAL_TEXT_PROTO
|
||||
- PARSE_TEST_PROTO
|
||||
- PARSE_TEXT_PROTO
|
||||
- ParseTextOrDie
|
||||
- ParseTextProtoOrDie
|
||||
CanonicalDelimiter: ''
|
||||
BasedOnStyle: google
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCpp11BracedList: false
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyBlock: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: false
|
||||
SpacesInConditionalStatement: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpaceBeforeSquareBrackets: false
|
||||
Standard: Auto
|
||||
StatementMacros:
|
||||
- Q_UNUSED
|
||||
- QT_REQUIRE_VERSION
|
||||
TabWidth: 8
|
||||
UseCRLF: false
|
||||
UseTab: Never
|
||||
...
|
||||
|
2579
ggml/src/ggml-cann/Doxyfile
Normal file
2579
ggml/src/ggml-cann/Doxyfile
Normal file
File diff suppressed because it is too large
Load diff
198
ggml/src/ggml-cann/acl_tensor.cpp
Normal file
198
ggml/src/ggml-cann/acl_tensor.cpp
Normal file
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "acl_tensor.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
aclDataType ggml_cann_type_mapping(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return ACL_FLOAT;
|
||||
case GGML_TYPE_F16:
|
||||
return ACL_FLOAT16;
|
||||
case GGML_TYPE_I8:
|
||||
return ACL_INT8;
|
||||
case GGML_TYPE_I16:
|
||||
return ACL_INT16;
|
||||
case GGML_TYPE_I32:
|
||||
return ACL_INT32;
|
||||
default:
|
||||
return ACL_DT_UNDEFINED;
|
||||
}
|
||||
return ACL_DT_UNDEFINED;
|
||||
}
|
||||
|
||||
aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
size_t* nb, int64_t dims, aclFormat format,
|
||||
size_t offset) {
|
||||
// If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
|
||||
// added.
|
||||
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
if (ne == nullptr) {
|
||||
acl_storage_len = ggml_nbytes(tensor);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
acl_ne[i] = tensor->ne[i];
|
||||
// The step size of acl is in elements.
|
||||
acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
|
||||
}
|
||||
} else {
|
||||
// With bcast
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
acl_ne[i] = ne[i];
|
||||
acl_stride[i] = nb[i] / ggml_element_size(tensor);
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse ne and stride.
|
||||
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
|
||||
std::reverse(acl_ne, acl_ne + final_dims);
|
||||
std::reverse(acl_stride, acl_stride + final_dims);
|
||||
|
||||
aclTensor* acl_tensor = aclCreateTensor(
|
||||
acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
|
||||
offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
|
||||
tensor->data);
|
||||
|
||||
return acl_tensor;
|
||||
}
|
||||
|
||||
bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
|
||||
size_t type_size, int64_t* ne, size_t* nb,
|
||||
int64_t dims, aclFormat format,
|
||||
size_t offset) {
|
||||
int64_t tmp_ne[GGML_MAX_DIMS * 2];
|
||||
int64_t tmp_stride[GGML_MAX_DIMS * 2];
|
||||
|
||||
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
|
||||
for (int i = 0; i < dims; i++) {
|
||||
tmp_stride[i] = nb[i] / type_size;
|
||||
}
|
||||
|
||||
std::reverse(tmp_ne, tmp_ne + dims);
|
||||
std::reverse(tmp_stride, tmp_stride + dims);
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
|
||||
aclTensor* acl_tensor =
|
||||
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
|
||||
format, &acl_storage_len, 1, data_ptr);
|
||||
|
||||
return acl_tensor;
|
||||
}
|
||||
|
||||
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
|
||||
const ggml_tensor* src1,
|
||||
int64_t* bcast_src0_ne,
|
||||
int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
|
||||
size_t* bcast_src1_nb) {
|
||||
GGML_ASSERT(ggml_can_repeat(src1, src0));
|
||||
int bcast_dim_cnt = 0;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
int64_t nr = src0->ne[i] / src1->ne[i];
|
||||
bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
|
||||
bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
|
||||
bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
|
||||
bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
|
||||
bcast_dim_cnt++;
|
||||
if (nr != 1) {
|
||||
// Need to add an extra dim.
|
||||
bcast_src0_ne[bcast_dim_cnt] = nr;
|
||||
bcast_src1_ne[bcast_dim_cnt] = 1;
|
||||
bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
|
||||
bcast_src0_ne[bcast_dim_cnt - 1];
|
||||
bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
|
||||
bcast_src1_ne[bcast_dim_cnt - 1];
|
||||
bcast_dim_cnt++;
|
||||
}
|
||||
}
|
||||
return bcast_dim_cnt;
|
||||
}
|
||||
|
||||
int64_t ggml_cann_get_mulmat_bcast_shape(
|
||||
const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
|
||||
const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
|
||||
int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
|
||||
size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
|
||||
// input and dst shoule in same shape, except first two dims.
|
||||
GGML_ASSERT(input_ne[2] == dst_ne[2]);
|
||||
GGML_ASSERT(input_ne[3] == dst_ne[3]);
|
||||
|
||||
int bcast_dim_cnt = 0;
|
||||
|
||||
// For mul_mat, a dimension needs to be added before the dimension that
|
||||
// weight needs to be expanded to satisfy the bcast rule of matrix
|
||||
// multiplication.
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
int64_t nr = input_ne[i] / weight_ne[i];
|
||||
// Do not use bcast in the first two dimensions because we only support
|
||||
// the bcast batch dimension. Just copy them.
|
||||
if (i < 2 || nr == 1) {
|
||||
bcast_input_ne[bcast_dim_cnt] = input_ne[i];
|
||||
bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
|
||||
bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
|
||||
|
||||
bcast_input_nb[bcast_dim_cnt] = input_nb[i];
|
||||
bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
|
||||
bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
|
||||
bcast_dim_cnt++;
|
||||
} else {
|
||||
// Need to add an extra dim.
|
||||
bcast_input_ne[bcast_dim_cnt] = nr;
|
||||
bcast_dst_ne[bcast_dim_cnt] = nr;
|
||||
bcast_weight_ne[bcast_dim_cnt] = 1;
|
||||
bcast_input_nb[bcast_dim_cnt] = input_nb[i];
|
||||
bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
|
||||
bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
|
||||
bcast_dim_cnt++;
|
||||
|
||||
bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
|
||||
bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
|
||||
bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
|
||||
bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
|
||||
bcast_input_ne[bcast_dim_cnt - 1];
|
||||
bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
|
||||
bcast_dst_ne[bcast_dim_cnt - 1];
|
||||
bcast_weight_nb[bcast_dim_cnt] =
|
||||
bcast_weight_nb[bcast_dim_cnt - 1] *
|
||||
bcast_weight_ne[bcast_dim_cnt - 1];
|
||||
bcast_dim_cnt++;
|
||||
}
|
||||
}
|
||||
return bcast_dim_cnt;
|
||||
}
|
230
ggml/src/ggml-cann/acl_tensor.h
Normal file
230
ggml/src/ggml-cann/acl_tensor.h
Normal file
|
@ -0,0 +1,230 @@
|
|||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef CANN_ACL_TENSOR_H
|
||||
#define CANN_ACL_TENSOR_H
|
||||
|
||||
#include <aclnn/aclnn_base.h>
|
||||
#include "common.h"
|
||||
|
||||
/**
|
||||
* @brief Maps a ggml_type to its corresponding aclDataType.
|
||||
*
|
||||
* @details This function takes a ggml_type as input and returns the corresponding
|
||||
* aclDataType. It supports mapping for various ggml_types. If the input type
|
||||
* does not match any of the predefined ggml_types, the function returns
|
||||
* ACL_DT_UNDEFINED.
|
||||
*
|
||||
* @param type The ggml_type to be mapped.
|
||||
* @return The corresponding aclDataType. If the input type is not recognized,
|
||||
* ACL_DT_UNDEFINED is returned.
|
||||
*/
|
||||
aclDataType ggml_cann_type_mapping(ggml_type type);
|
||||
|
||||
/**
|
||||
* @brief Creates an ACL tensor from a ggml_tensor with optional shape.
|
||||
*
|
||||
* @details This function creates an ACL tensor based on the properties of the
|
||||
* provided ggml_tensor. It supports customer shape by adjusting dimensions
|
||||
* and strides accordingly. If customer shape is applied, additional
|
||||
* dimensions and strides are calculated based on the provided parameters.
|
||||
*
|
||||
* @param tensor Pointer to the ggml_tensor to be converted to ACL tensor.
|
||||
* @param ne Pointer to an array containing dimensions. Defaults to nullptr
|
||||
* if no customer shape is applied.
|
||||
* @param nb Pointer to an array containing strides. Defaults to nullptr
|
||||
* if no customer shape is applied.
|
||||
* @param dims Number of dimensions in the tensor. Defaults to 0 if no customer
|
||||
* shape is applied.
|
||||
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
|
||||
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
|
||||
* @return Pointer to the created ACL tensor.
|
||||
*/
|
||||
aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
|
||||
size_t* nb = nullptr, int64_t dims = 0,
|
||||
aclFormat format = ACL_FORMAT_ND,
|
||||
size_t offset = 0);
|
||||
|
||||
/**
|
||||
* @brief Creates an ACL tensor from provided parameters.
|
||||
*
|
||||
* @details This function creates an ACL tensor using the provided data pointer,
|
||||
* data type, dimensions, strides, format, offset, and additional parameters.
|
||||
* It calculates necessary dimensions and strides based on the provided ne and nb
|
||||
* arrays, adjusting them for the ACL tensor creation. The ACL storage length
|
||||
* is also calculated based on the provided dimensions and strides.
|
||||
*
|
||||
* @param data_ptr Pointer to the data buffer for the ACL tensor.
|
||||
* @param dtype ACL data type of the tensor.
|
||||
* @param type_size Size of each element in the tensor data buffer.
|
||||
* @param ne Pointer to an array containing tensor dimensions.
|
||||
* @param nb Pointer to an array containing tensor strides.
|
||||
* @param dims Number of dimensions of the tensor.
|
||||
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
|
||||
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
|
||||
* @return Pointer to the created ACL tensor.
|
||||
*/
|
||||
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
|
||||
size_t type_size, int64_t* ne, size_t* nb,
|
||||
int64_t dims, aclFormat format = ACL_FORMAT_ND,
|
||||
size_t offset = 0);
|
||||
|
||||
/**
|
||||
* @brief Checks if tensors require broadcasting based on their shapes.
|
||||
*
|
||||
* @details This function determines if two ggml_tensors need to be broadcasted for
|
||||
* element-wise operations. Broadcasting is necessary if the shapes of the
|
||||
* tensors are not identical and no dimension in either tensor equals 1.
|
||||
*
|
||||
* @param t0 Pointer to the first ggml_tensor.
|
||||
* @param t1 Pointer to the second ggml_tensor.
|
||||
* @return True if broadcasting is needed, False otherwise.
|
||||
*
|
||||
* @remarks This function iterates over the dimensions of t0 and t1. It checks if each
|
||||
* dimension in t1 differs from t0's corresponding dimension and is not equal
|
||||
* to 1. If such a dimension is found, broadcasting is required to align t1
|
||||
* with t0 for element-wise operations.
|
||||
*/
|
||||
bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
|
||||
|
||||
/**
|
||||
* @brief Computes broadcast shapes and strides for two ggml_tensors.
|
||||
*
|
||||
* @details This function calculates the broadcast shapes and strides for two ggml_tensors,
|
||||
* following the broadcasting rules similar to numpy. It adjusts dimensions and
|
||||
* strides to ensure compatibility for element-wise operations where one tensor
|
||||
* can be broadcasted to match the shape of another tensor.
|
||||
*
|
||||
* @param src0 Pointer to the first ggml_tensor.
|
||||
* @param src1 Pointer to the second ggml_tensor.
|
||||
* @param bcast_ne_src0 Output array to store broadcasted dimensions for src0.
|
||||
* @param bcast_ne_src1 Output array to store broadcasted dimensions for src1.
|
||||
* @param bcast_nb_src0 Output array to store broadcasted strides for src0.
|
||||
* @param bcast_nb_src1 Output array to store broadcasted strides for src1.
|
||||
* @return Number of dimensions in the broadcasted shape.
|
||||
*
|
||||
* @pre ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
|
||||
* to match src0.
|
||||
*
|
||||
* @remarks This function iterates over the dimensions of src0 and src1, calculating the
|
||||
* necessary broadcast dimensions and strides. If a dimension requires broadcasting
|
||||
* (i.e., its size in src1 is smaller than in src0), an additional dimension is
|
||||
* added with size calculated to match src0's dimension. This adjustment ensures
|
||||
* that src1 can be element-wise broadcasted to src0's shape.
|
||||
*
|
||||
* How it works:
|
||||
*
|
||||
* if dim0 has padding.
|
||||
* a -> (2, 2) padding = 2
|
||||
* a: [[1, 2, *, *]
|
||||
* [2, 3, *, *]]
|
||||
* nb = (8, 4, 2)
|
||||
*
|
||||
* if a should bcast with b -> (2, 4)
|
||||
* b' -> (2, 2, 2)
|
||||
* b : [[1, 2, 3, 4, *, *]
|
||||
* [5, 6, 7, 8, *, *]]
|
||||
* nb = (12, 6, 1)
|
||||
*
|
||||
* after bcast:
|
||||
* a' -> (2, 1, 2)
|
||||
* a': [[[1, 2], *, *]
|
||||
* [[2, 3], *, *]]
|
||||
* nb = (8, 4, 2, 1)
|
||||
*
|
||||
* b' : [[[1, 2], [3, 4], *, *]
|
||||
* [[5, 6], [7, 8], *, *]]
|
||||
* nb = (12, 6, 2, 1)
|
||||
* \endcode
|
||||
*
|
||||
* dim1 in a inserted dim, should add nb for dim1,
|
||||
* and all other nb moves to next in order.
|
||||
*/
|
||||
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
|
||||
int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
|
||||
size_t* bcast_nb_src0, size_t* bcast_nb_src1);
|
||||
|
||||
// Bcast macro to avoid duplicate code.
|
||||
#define BCAST_SHAPE(src0, src1) \
|
||||
int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2]; \
|
||||
int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2]; \
|
||||
size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2]; \
|
||||
size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2]; \
|
||||
int64_t bcast_dims = ggml_cann_get_bcast_shape( \
|
||||
src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
|
||||
bcast_##src1##_nb);
|
||||
|
||||
#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
|
||||
|
||||
/**
|
||||
* @brief Calculates broadcast shapes for matrix multiplication.
|
||||
*
|
||||
* @details This function computes the broadcast shapes required for matrix multiplication
|
||||
* based on the input, weight, and destination tensor shapes. It ensures that the
|
||||
* dimensions of weight tensors are expanded appropriately to satisfy matrix
|
||||
* multiplication broadcast rules.
|
||||
*
|
||||
* @param input_ne Array containing the dimensions of the input tensor.
|
||||
* @param weight_ne Array containing the dimensions of the weight tensor.
|
||||
* @param dst_ne Array containing the dimensions of the destination tensor.
|
||||
* @param input_nb Array containing the strides of the input tensor.
|
||||
* @param weight_nb Array containing the strides of the weight tensor.
|
||||
* @param dst_nb Array containing the strides of the destination tensor.
|
||||
* @param bcast_input_ne Output array for broadcasted input tensor dimensions.
|
||||
* @param bcast_weight_ne Output array for broadcasted weight tensor dimensions.
|
||||
* @param bcast_dst_ne Output array for broadcasted destination tensor dimensions.
|
||||
* @param bcast_input_nb Output array for broadcasted input tensor strides.
|
||||
* @param bcast_weight_nb Output array for broadcasted weight tensor strides.
|
||||
* @param bcast_dst_nb Output array for broadcasted destination tensor strides.
|
||||
* @return The number of dimensions in the broadcasted tensors.
|
||||
*
|
||||
* @remarks This function iterates over the tensor dimensions and calculates the broadcast
|
||||
* shapes needed for matrix multiplication. It ensures that dimensions where
|
||||
* weight tensor requires expansion are appropriately handled to conform with
|
||||
* broadcasting rules.
|
||||
* @note compare with ggml_cann_get_bcast_shape, mul_mat broadcast need add this new dim
|
||||
* before cast dim.
|
||||
* @sa ggml_cann_get_bcast_shape
|
||||
*/
|
||||
int64_t ggml_cann_get_mulmat_bcast_shape(
|
||||
const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
|
||||
const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
|
||||
int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
|
||||
size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
|
||||
|
||||
// Bcast macro to avoid duplicate code.
|
||||
#define BCAST_MUL_MAT_SHAPE(input, weight, dst) \
|
||||
int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2]; \
|
||||
int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2]; \
|
||||
int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2]; \
|
||||
size_t bcast_##input##_nb[GGML_MAX_DIMS * 2]; \
|
||||
size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2]; \
|
||||
size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2]; \
|
||||
int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape( \
|
||||
input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
|
||||
bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne, \
|
||||
bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
|
||||
|
||||
#define BCAST_MUL_MAT_PARAM(tensor) \
|
||||
bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
|
||||
|
||||
#endif // CANN_ACL_TENSOR_H
|
2944
ggml/src/ggml-cann/aclnn_ops.cpp
Normal file
2944
ggml/src/ggml-cann/aclnn_ops.cpp
Normal file
File diff suppressed because it is too large
Load diff
592
ggml/src/ggml-cann/aclnn_ops.h
Normal file
592
ggml/src/ggml-cann/aclnn_ops.h
Normal file
|
@ -0,0 +1,592 @@
|
|||
#ifndef CANN_ACLNN_OPS
|
||||
#define CANN_ACLNN_OPS
|
||||
|
||||
/**
|
||||
* @file acl_tensor
|
||||
* @brief This file contains related functions of ggml_tensor and acl_tensor.
|
||||
* Contains conversion from ggml_tensor to acl_tensor, broadcast and other
|
||||
* functions.
|
||||
* @author hipudding <huafengchun@gmail.com>
|
||||
* @author wangshuai09 <391746016@qq.com>
|
||||
* @date July 15, 2024
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <aclnnop/aclnn_add.h>
|
||||
#include <aclnnop/aclnn_arange.h>
|
||||
#include <aclnnop/aclnn_argsort.h>
|
||||
#include <aclnnop/aclnn_cat.h>
|
||||
#include <aclnnop/aclnn_clamp.h>
|
||||
#include <aclnnop/aclnn_div.h>
|
||||
#include <aclnnop/aclnn_gelu.h>
|
||||
#include <aclnnop/aclnn_hardsigmoid.h>
|
||||
#include <aclnnop/aclnn_hardswish.h>
|
||||
#include <aclnnop/aclnn_leaky_relu.h>
|
||||
#include <aclnnop/aclnn_mul.h>
|
||||
#include <aclnnop/aclnn_relu.h>
|
||||
#include <aclnnop/aclnn_silu.h>
|
||||
#include <aclnnop/aclnn_tanh.h>
|
||||
#include "acl_tensor.h"
|
||||
#include "common.h"
|
||||
|
||||
/**
|
||||
* @brief Repeats a ggml tensor along each dimension to match the dimensions
|
||||
* of another tensor.
|
||||
*
|
||||
* @details This function repeats the elements of a source ggml tensor along
|
||||
* each dimension to create a destination tensor with the specified
|
||||
* dimensions. The operation is performed using the ACL backend and
|
||||
* executed asynchronously on the device.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The ggml tensor representing the destination, which op is
|
||||
* GGML_OP_REPEAT and specifies the desired dimensions.
|
||||
*/
|
||||
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Adds two ggml tensors using the CANN backend.
|
||||
*
|
||||
* @details This function performs an element-wise addition of two tensors. In
|
||||
* case the tensors do not have the same shape, one or both tensors
|
||||
* will be broadcasted to match the shape of the other before the
|
||||
* addition is performed.The formula for the operation is given by:
|
||||
* \f[
|
||||
* \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The ggml tensor representing the destination, result of the
|
||||
* addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
|
||||
*/
|
||||
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies the Leaky ReLU activation function to a tensor using the CANN
|
||||
* backend.
|
||||
*
|
||||
* @details This function computes the Leaky ReLU activation for each element of
|
||||
* the input tensor. The Leaky ReLU function allows a small gradient
|
||||
* when the unit is not active (i.e., when the input is negative). The
|
||||
* Leaky ReLU function is defined as:
|
||||
* \f[
|
||||
* \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
|
||||
* src)
|
||||
* \f]
|
||||
* `negativeSlope` is in dst->params.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result of the Leaky ReLU
|
||||
* activation is stored, which op is `GGML_OP_LEAKY_RELU`
|
||||
*/
|
||||
void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Concatenates multiple tensors along a specified dimension using the
|
||||
* CANN backend.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param tensorList A pointer to the list of tensors to be concatenated.
|
||||
* @param dst The destination tensor where the result of the
|
||||
* concatenation is stored. dst->op is `GGML_OP_CONCAT`.
|
||||
* @param concat_dim The dimension along which the tensors are concatenated.
|
||||
*
|
||||
* @attention tensorList length should be 2 and the dimension using for concat
|
||||
* default to 1.
|
||||
*/
|
||||
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Generates a sequence of evenly spaced values within a specified
|
||||
* interval for a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function creates a sequence of numbers over a specified i
|
||||
* nterval, starting from `start`, ending before `stop`, and
|
||||
* incrementing by `step`. The sequence is stored in the destination
|
||||
* tensor `dst`.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the generated sequence will be stored.
|
||||
* `start`, 'stop' and 'step' are in dst->op_params and dst->op is
|
||||
* `GGML_OP_ARANGE`.
|
||||
*/
|
||||
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the square of the elements of a ggml tensor using the CANN
|
||||
* backend.
|
||||
* @details The function sets the second source tensor of the destination
|
||||
* tensor `dst` to be equal to the first source tensor. This is
|
||||
* effectively squaring the elements since the multiplication becomes
|
||||
* `element * element`.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the squared values will be stored,
|
||||
* which dst->op is `GGML_OP_SQR`.
|
||||
*/
|
||||
void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies a clamp operation to the elements of a ggml tensor using the
|
||||
* CANN backend.
|
||||
*
|
||||
* @details This function clamps the elements of the input tensor `src` to a
|
||||
* specified range defined by `min` and `max` values. The result is
|
||||
* stored in the destination tensor `dst`. The operation is defined as:
|
||||
* \f[
|
||||
* y = \max(\min(x, max\_value), min\_value)
|
||||
* \f]
|
||||
* where `x` is an element of the input tensor, and `y` is the
|
||||
* corresponding element in the output tensor.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the clamped values will be stored.
|
||||
* dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
|
||||
*/
|
||||
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Scales the elements of a ggml tensor by a constant factor using the
|
||||
* CANN backend.
|
||||
*
|
||||
* @details This function multiplies each element of the input tensor `src` by
|
||||
* a scaling factor `scale`, storing the result in the destination
|
||||
* tensor `dst`. The operation is defined as:
|
||||
* \f[
|
||||
* dst = src \times scale
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the scaled values will be stored.
|
||||
* dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
|
||||
*/
|
||||
void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Sorts the elements of a ggml tensor and returns the indices that
|
||||
* would sort the tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs an argsort operation on the input tensor
|
||||
* `src`. It sorts the elements of `src` in either ascending or
|
||||
* descending order, depending on the `GGML_SORT_ORDER_DESC`,
|
||||
* and returns the indices that would sort the original tensor.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the sorted indices will be stored.
|
||||
* dst->op is `GGML_OP_ARGSORT`.
|
||||
*/
|
||||
void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the Layer Normalization for a ggml tensor using the CANN
|
||||
* backend.
|
||||
*
|
||||
* @details This function applies the Layer Normalization operation on the
|
||||
* input tensor `src` and stores the result in the destination tensor
|
||||
* `dst`. Layer Normalization normalizes the features at each sample in
|
||||
* a mini-batch independently. It is commonly used in neural networks
|
||||
* to normalize the activations of a layer by adjusting and scaling
|
||||
* the outputs.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
|
||||
* \f]
|
||||
* `Var` defaults dst->ne[0]. `eps` is in dst->params.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the normalized values will be stored.
|
||||
* @attention `Var` defaults to dst->ne[0].
|
||||
*/
|
||||
void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the Group Normalization for a ggml tensor using the CANN
|
||||
* backend.
|
||||
*
|
||||
* @brief This function applies the Group Normalization operation on the input
|
||||
* tensor `src` and stores the result in the destination tensor `dst`.
|
||||
* Group Normalization divides the channels into groups and normalizes
|
||||
* the features within each group across spatial locations.
|
||||
* It is commonly used in convolutional neural networks to improve
|
||||
* training stability and performance.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the normalized values will be stored.
|
||||
* `n_groups` is in dst->params, which split C channel to `n_groups`.
|
||||
* dst->op is `GGML_OP_GROUP_NORM`.
|
||||
*
|
||||
* @attention eps defaults to 1e-6f.
|
||||
*/
|
||||
void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the accumulation of tensors using the CANN backend.
|
||||
*
|
||||
* @details This function performs an accumulation operation on two tensors.
|
||||
* Depending on the `inplace` flag, it either updates the destination
|
||||
* tensor `dst` in place by adding `alpha * src1` to it, or it creates
|
||||
* a new tensor as the result of `src0 + alpha * src1` and stores it in
|
||||
* `dst`.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* dst = src0 + alpha \times src1
|
||||
* \f]
|
||||
* if `inplace` is `true`, `src0` is equal to 'dst'.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the accumulated values will be stored.
|
||||
* `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
|
||||
*/
|
||||
void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the sum of elements along the last dimension of a ggml tensor
|
||||
* using the CANN backend.
|
||||
*
|
||||
* @details This function performs a reduction sum operation along the last
|
||||
* dimension of the input tensor `src`. The result of the sum is stored
|
||||
* in the destination tensor `dst`.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the reduced values will be stored。
|
||||
* dst->op is `GGML_OP_SUM_ROWS`.
|
||||
*
|
||||
* @attention `reduce_dims` defaults to 3, which means the last dimension.
|
||||
*/
|
||||
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Upsamples a ggml tensor using nearest neighbor interpolation using
|
||||
* the CANN backend.
|
||||
*
|
||||
* @details This function performs upsampling of the input tensor `src` using
|
||||
* nearest neighbor interpolation. The upsampling is applied to the
|
||||
* height and width dimensions (last two dimensions) of the tensor. The
|
||||
* result is stored in the destination tensor `dst`, which must have
|
||||
* the appropriate dimensions for the upsampled output.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the upsampled values will be stored.
|
||||
* dst->op is `GGML_OP_UPSCALE`.
|
||||
*/
|
||||
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
|
||||
ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Pads a ggml tensor to match the dimensions of the destination tensor
|
||||
* using the CANN backend.
|
||||
*
|
||||
* @details This function pads the input tensor `src` so that it matches the
|
||||
* dimensions of the destination tensor `dst`. The amount of padding
|
||||
* is calculated based on the difference in sizes between `src` and
|
||||
* `dst` along each dimension. The padded tensor is stored in `dst`.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor, which specifies the target dimensions for
|
||||
* padding. dst->op is `GGML_OP_PAD`.
|
||||
*/
|
||||
void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Executes a 2D pooling operation on a ggml tensor using the CANN
|
||||
* backend.
|
||||
*
|
||||
* @details This function dispatches the execution of a 2D pooling operation on
|
||||
* the input tensor `dst`. The type of pooling (average or max) is
|
||||
* determined by the `op` parameter, which is read from the operation
|
||||
* parameters of `dst`. The function supports average pooling
|
||||
* (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
|
||||
* invalid operation is encountered, the function asserts a failure.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor on which the pooling operation is to be
|
||||
* performed. dst->op is `GGML_OP_POOL_2D`.
|
||||
*/
|
||||
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Duplicates a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function duplicates the contents of the source tensor `src` to
|
||||
* the destination tensor `dst`. The function supports various tensor
|
||||
* types and configurations, including handling of extra data, type
|
||||
* conversions, and special cases for contiguous and non-contiguous
|
||||
* tensors.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the duplicated data will be stored.
|
||||
* dst->op is `GGML_OP_DUP`
|
||||
*
|
||||
* @attention Only support Fp16/FP32. Not support when src and dst have
|
||||
* different shape and dst is no-contiguous.
|
||||
* @note: This func need to simplify.
|
||||
*/
|
||||
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor
|
||||
* using the CANN backend.
|
||||
*
|
||||
* @details This function applies RMS normalization to the input tensor `src`
|
||||
* and stores the result in the destination tensor `dst`. RMS
|
||||
* normalization involves computing the root mean square of the input
|
||||
* tensor along a specified dimension and then dividing each element of
|
||||
* the tensor by this value, adjusted by a small epsilon value to
|
||||
* prevent division by zero.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
|
||||
* \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
|
||||
* \f]
|
||||
* `eps` is in dst->op_params.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the normalized values will be stored.
|
||||
* dst->op is `GGML_OP_RMS_NORM`.
|
||||
*/
|
||||
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies a diagonal mask to the tensor with a specified value.
|
||||
*
|
||||
* @details This function creates a mask tensor filled with ones, then applies
|
||||
* an upper triangular and lower triangular operation to it based on
|
||||
* the number of past elements specified. Afterward, it adds the masked
|
||||
* tensor to the destination tensor in-place.
|
||||
*
|
||||
* @param ctx The backend CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored. dst->op is
|
||||
* `GGML_OP_DIAG_MASK`
|
||||
* @param value The value to use for masking.
|
||||
*/
|
||||
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
|
||||
|
||||
/**
|
||||
* @brief Performs an image-to-column transformation on the input tensor.
|
||||
*
|
||||
* @details This function takes an input tensor and applies an image-to-column
|
||||
* operation, converting spatial dimensions into column-like
|
||||
* structures suitable for convolutional operations. It supports both
|
||||
* half-precision (F16) and single-precision (F32) floating-point data
|
||||
* types.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor that stores the result of the operation.
|
||||
* dst->op is `GGML_OP_IM2COL`.
|
||||
*/
|
||||
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes time step embeddings using sine and cosine functions.
|
||||
*
|
||||
* @details This function calculates time step embeddings by applying sine and
|
||||
* cosine transformations to a given input tensor, which is typically
|
||||
* used in temporal models like diffusion models or transformers to
|
||||
* encode time information effectively.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the result of the embedding operation
|
||||
* will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
|
||||
*/
|
||||
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
// @see ggml_cann_dup.
|
||||
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the softmax activation with optional masking.
|
||||
*
|
||||
* @details This function computes the softmax activation over the input tensor,
|
||||
* optionally applying a mask and scaling factor. It supports both FP16
|
||||
* and FP32 data types and can handle masking by broadcasting the mask
|
||||
* across rows if necessary.
|
||||
* The function performs the following steps:
|
||||
* 1. Multiplies the input tensor by a scale factor.
|
||||
* 2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
|
||||
* 3. Broadcasts the mask tensor if its dimensions do not match the
|
||||
* input tensor's dimensions.
|
||||
* 4. Adds the mask to the scaled input tensor.
|
||||
* 5. Applies the softmax activation function along the specified
|
||||
* dimension.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the result will be stored. dst->op is
|
||||
* `GGML_OP_SOFTMAX`.
|
||||
*/
|
||||
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Extracts specific rows from a tensor based on indices.
|
||||
*
|
||||
* @details This function retrieves rows from a source tensor src0 according to
|
||||
* the indices provided in another tensor src1 and stores the result in
|
||||
* a destination tensor (\p dst). It supports different data types
|
||||
* including F32, F16, Q4_0, and Q8_0.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the extracted rows will be stored.
|
||||
* dst->op is `GGML_OP_GET_ROWS`.
|
||||
*/
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Executes matrix multiplication for the given tensor.
|
||||
*
|
||||
* @details This function performs matrix multiplication on the source tensors
|
||||
* associated with the destination tensor. It supports matrix
|
||||
* multiplication F32, F16, and Q8_0.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor for storing the result of the matrix
|
||||
* multiplication. dst->op is `GGML_OP_MUL_MAT`.
|
||||
*/
|
||||
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
|
||||
*
|
||||
* @details This function implements the RoPE mechanism, which is a method to
|
||||
* encode positional information into sequence data, particularly
|
||||
* useful in transformer models. It supports both F32 and F16 data
|
||||
* types.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the RoPE-transformed data will be
|
||||
* stored. dst->op is `GGML_OP_ROPE`.
|
||||
*
|
||||
* @note The function currently does not support cases where the n_dims is less
|
||||
* than the input tensor's first dimension.
|
||||
* @note The function currently does not support cases where the freq_factors is
|
||||
* not NULL.
|
||||
* @note The function currently does not support cases where the ext_factor is
|
||||
* not equal 0.
|
||||
* @note The function currently does not support cases where the freq_scale is
|
||||
* not equal 1.
|
||||
*/
|
||||
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
|
||||
aclTensor*, uint64_t*, aclOpExecutor**),
|
||||
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
|
||||
void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
aclTensor* acl_src0;
|
||||
aclTensor* acl_src1;
|
||||
aclTensor* acl_dst;
|
||||
|
||||
// Need bcast
|
||||
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
|
||||
BCAST_SHAPE(src0, src1)
|
||||
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
|
||||
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
|
||||
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
|
||||
} else {
|
||||
acl_src0 = ggml_cann_create_tensor(src0);
|
||||
acl_src1 = ggml_cann_create_tensor(src1);
|
||||
acl_dst = ggml_cann_create_tensor(dst);
|
||||
}
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
|
||||
&executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
aclrtStream main_stream = ctx.stream();
|
||||
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src1));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
// Activation functions template.
|
||||
template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
|
||||
aclOpExecutor**),
|
||||
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
|
||||
const aclrtStream)>
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
aclrtStream main_stream = ctx.stream();
|
||||
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
// Activation functions template for const aclTensors.
|
||||
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
|
||||
uint64_t*, aclOpExecutor**),
|
||||
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
|
||||
const aclrtStream)>
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
aclrtStream main_stream = ctx.stream();
|
||||
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
#endif // CANN_ACLNN_OPS
|
282
ggml/src/ggml-cann/common.h
Normal file
282
ggml/src/ggml-cann/common.h
Normal file
|
@ -0,0 +1,282 @@
|
|||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal in the Software without restriction, including without limitation the
|
||||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
* sell copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef CANN_COMMON_H
|
||||
#define CANN_COMMON_H
|
||||
|
||||
#include <acl/acl.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "../include/ggml-cann.h"
|
||||
#include "../include/ggml.h"
|
||||
|
||||
#define MATRIX_ROW_PADDING 512
|
||||
#define GGML_CANN_MAX_STREAMS 8
|
||||
|
||||
/**
|
||||
* @brief Handles CANN-related errors by printing an error message and
|
||||
* terminating the program.
|
||||
* @param stmt The statement that caused the error.
|
||||
* @param func The function in which the error occurred.
|
||||
* @param file The file in which the error occurred.
|
||||
* @param line The line number at which the error occurred.
|
||||
* @param msg The error message.
|
||||
*/
|
||||
[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
|
||||
const char* file, int line, const char* msg);
|
||||
|
||||
/**
|
||||
* @brief Checks the result of a CANN function call and invokes the error
|
||||
* handler if the call fails.
|
||||
* @param stmt The CANN function call to check.
|
||||
* @param success The success code that indicates the call was successful.
|
||||
* @param error_fn The function to call to retrieve the error message.
|
||||
*/
|
||||
#define ACL_CHECK_GEN(stmt, success, error_fn) \
|
||||
do { \
|
||||
int err_code = (stmt); \
|
||||
if (err_code != (success)) { \
|
||||
ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
|
||||
|
||||
/**
|
||||
* @brief Contains information about CANN devices.
|
||||
*/
|
||||
struct ggml_cann_device_info {
|
||||
/**
|
||||
* @brief Number of CANN devices available.
|
||||
*/
|
||||
int32_t device_count;
|
||||
|
||||
/**
|
||||
* @brief Information about a single CANN device.
|
||||
*/
|
||||
struct cann_device_info {
|
||||
int cc; /**< Compute capability. */
|
||||
size_t smpb; /**< Maximum shared memory per block. */
|
||||
bool vmm; /**< Virtual memory support. */
|
||||
size_t vmm_granularity; /**< Granularity of virtual memory. */
|
||||
size_t total_vram; /**< Total video RAM available on the device. */
|
||||
};
|
||||
|
||||
cann_device_info devices[GGML_CANN_MAX_DEVICES] =
|
||||
{}; /**< Array of CANN device information. */
|
||||
};
|
||||
|
||||
const ggml_cann_device_info& ggml_cann_info();
|
||||
|
||||
void ggml_cann_set_device(int32_t device);
|
||||
int32_t ggml_cann_get_device();
|
||||
|
||||
/**
|
||||
* @brief Abstract base class for memory pools used by CANN.
|
||||
*/
|
||||
struct ggml_cann_pool {
|
||||
/**
|
||||
* @brief Virtual destructor for the memory pool.
|
||||
*/
|
||||
virtual ~ggml_cann_pool() = default;
|
||||
|
||||
/**
|
||||
* @brief Allocates memory from the pool.
|
||||
*
|
||||
* @param size The size of the memory block to allocate.
|
||||
* @param actual_size Pointer to a variable where the actual allocated size
|
||||
* will be stored.
|
||||
* @return Pointer to the allocated memory block.
|
||||
*/
|
||||
virtual void* alloc(size_t size, size_t* actual_size) = 0;
|
||||
|
||||
/**
|
||||
* @brief Frees a previously allocated memory block.
|
||||
*
|
||||
* @param ptr Pointer to the memory block to free.
|
||||
* @param size Size of the memory block to free.
|
||||
* @note Note that all CANN opertors are running async. Make sure memory is
|
||||
* still avaiable before this operator finished.
|
||||
*/
|
||||
virtual void free(void* ptr, size_t size) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief RAII wrapper for managing memory allocations from a CANN memory pool.
|
||||
*/
|
||||
struct ggml_cann_pool_alloc {
|
||||
ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
|
||||
void* ptr = nullptr; /**< Pointer to the allocated memory block. */
|
||||
size_t actual_size = 0; /**< Actual size of the allocated memory block. */
|
||||
|
||||
/**
|
||||
* @brief Default constructor.
|
||||
*/
|
||||
ggml_cann_pool_alloc() = default;
|
||||
|
||||
/**
|
||||
* @brief Constructor that initializes the memory pool.
|
||||
* @param pool Reference to the memory pool.
|
||||
*/
|
||||
explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
|
||||
|
||||
/**
|
||||
* @brief Constructor that initializes the memory pool and allocates memory.
|
||||
* @param pool Reference to the memory pool.
|
||||
* @param size Size of the memory block to allocate.
|
||||
*/
|
||||
ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
|
||||
alloc(size);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor that frees the allocated memory block.
|
||||
*/
|
||||
~ggml_cann_pool_alloc() {
|
||||
if (ptr != nullptr) {
|
||||
pool->free(ptr, actual_size);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates memory from the pool.
|
||||
* @param size Size of the memory block to allocate.
|
||||
* @return Pointer to the allocated memory block.
|
||||
*/
|
||||
void* alloc(size_t size) {
|
||||
GGML_ASSERT(pool != nullptr);
|
||||
GGML_ASSERT(ptr == nullptr);
|
||||
ptr = pool->alloc(size, &this->actual_size);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates memory from a specific memory pool.
|
||||
* @param pool Reference to the memory pool.
|
||||
* @param size Size of the memory block to allocate.
|
||||
* @return Pointer to the allocated memory block.
|
||||
*/
|
||||
void* alloc(ggml_cann_pool& pool, size_t size) {
|
||||
this->pool = &pool;
|
||||
return alloc(size);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets the pointer to the allocated memory block.
|
||||
* @return Pointer to the allocated memory block.
|
||||
*/
|
||||
void* get() { return ptr; }
|
||||
|
||||
// Deleted copy constructor
|
||||
ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
|
||||
|
||||
// Deleted move constructor
|
||||
ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
|
||||
|
||||
// Deleted copy assignment operator
|
||||
ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
|
||||
|
||||
// Deleted move assignment operator
|
||||
ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Context for managing CANN backend operations.
|
||||
*/
|
||||
struct ggml_backend_cann_context {
|
||||
int32_t device; /**< Device ID. */
|
||||
std::string name; /**< Name of the device. */
|
||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
|
||||
{nullptr}}; /**< Array of streams for the device. */
|
||||
|
||||
/**
|
||||
* @brief Constructor for initializing the context with a given device.
|
||||
* @param device Device ID.
|
||||
*/
|
||||
explicit ggml_backend_cann_context(int device)
|
||||
: device(device), name("CANN" + std::to_string(device)) {}
|
||||
|
||||
/**
|
||||
* @brief Destructor for cleaning up resources.
|
||||
*/
|
||||
~ggml_backend_cann_context() {
|
||||
if (copy_event != nullptr) {
|
||||
ACL_CHECK(aclrtDestroyEvent(copy_event));
|
||||
}
|
||||
for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
|
||||
if (streams[i] != nullptr) {
|
||||
ACL_CHECK(aclrtDestroyStream(streams[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get or create a stream for a given index.
|
||||
* @param stream Index of the stream.
|
||||
* @return The stream corresponding to the given index.
|
||||
*/
|
||||
aclrtStream stream(int stream) {
|
||||
if (streams[stream] == nullptr) {
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(aclrtCreateStream(&streams[stream]));
|
||||
}
|
||||
return streams[stream];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get or create the default stream (index 0).
|
||||
* @return The default stream.
|
||||
*/
|
||||
aclrtStream stream() { return stream(0); }
|
||||
|
||||
// TODO: each stream should have a memory pool.
|
||||
std::unique_ptr<ggml_cann_pool>
|
||||
mem_pool; /**< Memory pool for the device. */
|
||||
|
||||
/**
|
||||
* @brief Create a new memory pool for a given device.
|
||||
* @param device Device ID.
|
||||
* @return A unique pointer to the new memory pool.
|
||||
*/
|
||||
static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
|
||||
|
||||
/**
|
||||
* @brief Get or create the memory pool for the context.
|
||||
* @return Reference to the memory pool.
|
||||
*/
|
||||
ggml_cann_pool& pool() {
|
||||
if (mem_pool == nullptr) {
|
||||
mem_pool = new_pool_for_device(device);
|
||||
}
|
||||
return *mem_pool;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // CANN_COMMON_H
|
32
ggml/src/ggml-cann/kernels/CMakeLists.txt
Normal file
32
ggml/src/ggml-cann/kernels/CMakeLists.txt
Normal file
|
@ -0,0 +1,32 @@
|
|||
if (NOT SOC_TYPE)
|
||||
set (SOC_TYPE "Ascend910B3")
|
||||
endif()
|
||||
|
||||
file(GLOB SRC_FILES
|
||||
get_row_f32.cpp
|
||||
get_row_f16.cpp
|
||||
get_row_q4_0.cpp
|
||||
get_row_q8_0.cpp
|
||||
quantize_f32_q8_0.cpp
|
||||
quantize_f16_q8_0.cpp
|
||||
dup.cpp
|
||||
)
|
||||
|
||||
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
|
||||
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
|
||||
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
|
||||
|
||||
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
|
||||
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
|
||||
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
|
||||
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
|
||||
else()
|
||||
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
|
||||
endif()
|
||||
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
|
||||
|
||||
ascendc_library(ascendc_kernels STATIC
|
||||
${SRC_FILES}
|
||||
)
|
||||
|
||||
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
|
17
ggml/src/ggml-cann/kernels/ascendc_kernels.h
Normal file
17
ggml/src/ggml-cann/kernels/ascendc_kernels.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef ASCENDC_KERNELS_H
|
||||
#define ASCENDC_KERNELS_H
|
||||
|
||||
#include "aclrtlaunch_ascendc_get_row_f32.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_f16.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_q4_0.h"
|
||||
|
||||
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
|
||||
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
|
||||
|
||||
#endif // ASCENDC_KERNELS_H
|
223
ggml/src/ggml-cann/kernels/dup.cpp
Normal file
223
ggml/src/ggml-cann/kernels/dup.cpp
Normal file
|
@ -0,0 +1,223 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
template <typename SRC_T, typename DST_T>
|
||||
class DupByRows {
|
||||
public:
|
||||
__aicore__ inline DupByRows() {}
|
||||
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
|
||||
size_t *input_nb_ub) {
|
||||
/* Dup by rows when src is contigous on first dimension and dst is
|
||||
contiguous, each kernel process one row.
|
||||
*/
|
||||
|
||||
// Input has four dims.
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
// param
|
||||
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
|
||||
num_elem = input_ne_ub[0];
|
||||
|
||||
// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
|
||||
idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
|
||||
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
|
||||
/ (input_ne_ub[1]);
|
||||
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
|
||||
- idx_ne2 * input_ne_ub[1];
|
||||
|
||||
// src may not contiguous in dim [1,2,3], so stride decited by ne&nb
|
||||
src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
|
||||
+ input_nb_ub[1] * idx_ne1;
|
||||
|
||||
// dst is contiguous
|
||||
dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
|
||||
|
||||
src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
|
||||
src_stride));
|
||||
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
|
||||
dst_stride));
|
||||
|
||||
pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
|
||||
32 - 1) / 32 * 32);
|
||||
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
|
||||
32 - 1) / 32 * 32);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in() {
|
||||
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
|
||||
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
|
||||
DataCopyPadExtParams<SRC_T> padParams;
|
||||
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
|
||||
|
||||
src_queue.EnQue(src_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out() {
|
||||
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
|
||||
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
|
||||
DataCopyPad(dst_gm, dst_local, dataCopyParams);
|
||||
|
||||
dst_queue.FreeTensor(dst_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void dup() {
|
||||
// main process, copy one row data from src to dst.
|
||||
copy_in();
|
||||
|
||||
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
|
||||
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
|
||||
|
||||
int32_t BLOCK_NUM = 32 / sizeof(DST_T);
|
||||
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
|
||||
/ BLOCK_NUM * BLOCK_NUM);
|
||||
dst_queue.EnQue<DST_T>(dst_local);
|
||||
|
||||
src_queue.FreeTensor(src_local);
|
||||
copy_out();
|
||||
}
|
||||
|
||||
__aicore__ inline void dup_with_cast() {
|
||||
// main process, copy one row data from src to dst.
|
||||
// cast dtype from src to dst.
|
||||
copy_in();
|
||||
|
||||
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
|
||||
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
|
||||
|
||||
Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
|
||||
dst_queue.EnQue<DST_T>(dst_local);
|
||||
|
||||
src_queue.FreeTensor(src_local);
|
||||
copy_out();
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<SRC_T> src_gm;
|
||||
GlobalTensor<DST_T> dst_gm;
|
||||
|
||||
int64_t num_rows;
|
||||
int64_t num_elem;
|
||||
int64_t idx_ne3;
|
||||
int64_t idx_ne2;
|
||||
int64_t idx_ne1;
|
||||
int64_t src_stride;
|
||||
int64_t dst_stride;
|
||||
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<half, half> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<float_t, float_t> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<float_t, half> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup_with_cast();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
// copy params from gm to ub.
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<half, float_t> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup_with_cast();
|
||||
}
|
186
ggml/src/ggml-cann/kernels/get_row_f16.cpp
Normal file
186
ggml/src/ggml-cann/kernels/get_row_f16.cpp
Normal file
|
@ -0,0 +1,186 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
class GET_ROW_F16 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_F16() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
||||
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
||||
// TODO, use template for F16/f32
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ half *)input);
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
|
||||
& ~31);
|
||||
uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
|
||||
& ~31);
|
||||
|
||||
local_buffer_elems = input_local_buffer_size / sizeof(half);
|
||||
|
||||
// TODO, consider long row that can't put in UB.
|
||||
// All data should asign to 32. It's ok because all data is align to 32.
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
||||
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
||||
size_t tail = len % 32;
|
||||
len = len & ~31;
|
||||
DataCopy(input_local, input_gm[offset], len);
|
||||
if(tail != 0) {
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(half);
|
||||
DataCopyPadExtParams<half> padParams;
|
||||
DataCopyPad(input_local[len], input_gm[offset + len],
|
||||
dataCopyParams, padParams);
|
||||
}
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
size_t tail = len % 32;
|
||||
len = len & ~31;
|
||||
DataCopy(output_gm[offset], output_local, len);
|
||||
if(tail != 0) {
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPad(output_gm[offset + len], output_local[len],
|
||||
dataCopyParams);
|
||||
}
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_row(int64_t idx) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3];
|
||||
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3];
|
||||
|
||||
copy_in(input_offset, input_ne[0]);
|
||||
LocalTensor<half> input_local = input_queue.DeQue<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
Cast(output_local, input_local, RoundMode::CAST_NONE,
|
||||
local_buffer_elems);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset, input_ne[0]);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
calculate_row(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
size_t local_buffer_elems;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<half> input_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_f16(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
||||
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_F16 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
||||
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
180
ggml/src/ggml-cann/kernels/get_row_f32.cpp
Normal file
180
ggml/src/ggml-cann/kernels/get_row_f32.cpp
Normal file
|
@ -0,0 +1,180 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
class GET_ROW_F32 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_F32() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
||||
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
|
||||
local_buffer_elems = local_buffer_size / sizeof(float);
|
||||
|
||||
// TODO, consider long row that can't put in UB.
|
||||
// All data should asign to 32. It's ok because all data is align to 32.
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
||||
size_t tail = len % 32;
|
||||
len = len & ~31;
|
||||
DataCopy(input_local, input_gm[offset], len);
|
||||
if(tail != 0) {
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPadExtParams<float> padParams;
|
||||
DataCopyPad(input_local[len], input_gm[offset + len],
|
||||
dataCopyParams, padParams);
|
||||
}
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
size_t tail = len % 32;
|
||||
len = len & ~31;
|
||||
DataCopy(output_gm[offset], output_local, len);
|
||||
if(tail != 0) {
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPad(output_gm[offset + len], output_local[len],
|
||||
dataCopyParams);
|
||||
}
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_row(int64_t idx) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3];
|
||||
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3];
|
||||
|
||||
copy_in(input_offset, input_ne[0]);
|
||||
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
DataCopy(output_local, input_local, local_buffer_elems);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset, input_ne[0]);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
calculate_row(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
size_t local_buffer_elems;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<float> input_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_f32(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
||||
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_F32 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
||||
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
193
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
Normal file
193
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
Normal file
|
@ -0,0 +1,193 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
class GET_ROW_Q4_0 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_Q4_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
||||
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
||||
size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
scale_ne[i] = input_ne_ub[i];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// one scale for a group.
|
||||
scale_ne[0] /= QK4_0;
|
||||
|
||||
input_stride[0] = 1;
|
||||
scale_stride[0] = 1;
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
group_size_in_row = input_ne[0] / QK4_0;
|
||||
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
||||
input_ne[3] / 2;
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
|
||||
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
|
||||
// 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
|
||||
DataCopy(input_local, input_gm[offset], QK4_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
DataCopy(output_gm[offset], output_local, QK4_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3] +
|
||||
group * QK4_0;
|
||||
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
||||
indices_ne1_idx * scale_stride[2] +
|
||||
indices_ne2_idx * scale_stride[3] + group;
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3] +
|
||||
group * QK4_0;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
// TODO: cast more data to speed up.
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
|
||||
|
||||
// Only mul need compile by group.
|
||||
half scale = scale_gm.GetValue(scale_offset);
|
||||
|
||||
Muls(output_local, output_local, (float)scale, QK4_0);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
output_queue.EnQue(output_local);
|
||||
|
||||
copy_out(output_offset);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
calculate_group(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t scale_ne[4];
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<int4b_t> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_Q4_0 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
|
||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
191
ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
Normal file
191
ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
Normal file
|
@ -0,0 +1,191 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
// optimize me. Use template to avoid copy code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
#define QK8_0 32
|
||||
|
||||
class GET_ROW_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
||||
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
||||
size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
scale_ne[i] = input_ne_ub[i];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// one scale for a group.
|
||||
scale_ne[0] /= QK8_0;
|
||||
|
||||
input_stride[0] = 1;
|
||||
scale_stride[0] = 1;
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
group_size_in_row = input_ne[0] / QK8_0;
|
||||
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
||||
input_ne[3] * sizeof(int8_t);
|
||||
|
||||
// Indices has two dims. n_elements = all rows should get.
|
||||
// dr, all rows should this thread get.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3] +
|
||||
group * QK8_0;
|
||||
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
||||
indices_ne1_idx * scale_stride[2] +
|
||||
indices_ne2_idx * scale_stride[3] + group;
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3] +
|
||||
group * QK8_0;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
// TODO: cast more data to speed up.
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
|
||||
|
||||
// Only mul need compile by group.
|
||||
half scale = scale_gm.GetValue(scale_offset);
|
||||
Muls(output_local, output_local, (float)scale, QK8_0);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
output_queue.EnQue(output_local);
|
||||
|
||||
copy_out(output_offset);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
calculate_group(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t scale_ne[4];
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<int8_t> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_Q8_0 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
|
||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
208
ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
Normal file
208
ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
Normal file
|
@ -0,0 +1,208 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define QK8_0 32
|
||||
|
||||
class QUANTIZE_F16_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_F16_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / QK8_0;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t);
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ half *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
|
||||
group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(work_queue, 1, 32);
|
||||
pipe.InitBuffer(max_queue, 1, 32);
|
||||
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(scale_queue, 1, 32);
|
||||
pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + QK8_0 * group;
|
||||
|
||||
const int64_t output_offset = i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] + QK8_0 * group;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<half> input_local = input_queue.DeQue<half>();
|
||||
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
|
||||
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
||||
Abs(abs_local, cast_local, QK8_0);
|
||||
ReduceMax(max_local, abs_local, work_local, QK8_0);
|
||||
|
||||
pipe_barrier(PIPE_ALL);
|
||||
float d = max_local.GetValue(0);
|
||||
d = d / ((1 << 7) - 1);
|
||||
if (d != 0) {
|
||||
Muls(cast_local, cast_local, 1.0f / d, QK8_0);
|
||||
}
|
||||
|
||||
Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
abs_queue.FreeTensor(abs_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
if (scale_local_offset == 16) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<half> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, 1> work_queue;
|
||||
TQue<QuePosition::VECOUT, 1> max_queue;
|
||||
TQue<QuePosition::VECIN, 1> abs_queue;
|
||||
TQue<QuePosition::VECOUT, 1> scale_queue;
|
||||
TQue<QuePosition::VECOUT, 1> cast_queue;
|
||||
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_F16_Q8_0 op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
206
ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
Normal file
206
ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
Normal file
|
@ -0,0 +1,206 @@
|
|||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define QK8_0 32
|
||||
|
||||
class QUANTIZE_F32_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_F32_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / QK8_0;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t);
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
|
||||
ir * group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(work_queue, 1, 32);
|
||||
pipe.InitBuffer(max_queue, 1, 32);
|
||||
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(scale_queue, 1, 32);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + QK8_0 * group;
|
||||
|
||||
const int64_t output_offset = i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] + QK8_0 * group;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
||||
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
|
||||
Abs(abs_local, input_local, QK8_0);
|
||||
ReduceMax(max_local, abs_local, work_local, QK8_0);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
float d = max_local.GetValue(0);
|
||||
d = d / ((1 << 7) - 1);
|
||||
if (d != 0) {
|
||||
Muls(input_local, input_local, 1.0f / d, QK8_0);
|
||||
}
|
||||
|
||||
Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
abs_queue.FreeTensor(abs_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
if (scale_local_offset == 16) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<float> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, 1> work_queue;
|
||||
TQue<QuePosition::VECOUT, 1> max_queue;
|
||||
TQue<QuePosition::VECIN, 1> abs_queue;
|
||||
TQue<QuePosition::VECIN, 1> cast_queue;
|
||||
TQue<QuePosition::VECOUT, 1> scale_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_F32_Q8_0 op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
|
@ -464,12 +464,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
|||
return;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||
if (padded_size > original_size) {
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
||||
}
|
||||
|
@ -1485,6 +1485,13 @@ static void ggml_cuda_op_mul_mat(
|
|||
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
|
||||
}
|
||||
|
||||
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
|
||||
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
|
||||
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
|
||||
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
|
||||
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
|
||||
}
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
dev[id].src1_ddf = (float *) src1->data;
|
||||
} else {
|
||||
|
@ -1876,7 +1883,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|||
|
||||
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
|
||||
&& src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
|
|
|
@ -1786,10 +1786,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
}
|
||||
};
|
||||
|
||||
if (ggml_is_quantized(src0t)) {
|
||||
GGML_ASSERT(ne00 >= nth0*nth1);
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
|
|
|
@ -4757,7 +4757,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
|||
device const float4 * y4 = (device const float4 *)yb;
|
||||
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
|
||||
|
||||
device const block_iq4_nl & xb = x[row*nb + ib];
|
||||
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
|
||||
|
@ -4789,7 +4789,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
|||
yb += 16 * QK4_NL;
|
||||
}
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -3341,7 +3341,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
|
|||
}
|
||||
|
||||
// check if t1 can be represented as a repeatition of t0
|
||||
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||
bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return ggml_is_empty(t0) ? ggml_is_empty(t1) :
|
||||
|
@ -13699,6 +13699,7 @@ static void ggml_compute_forward_soft_max(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_soft_max_back
|
||||
|
||||
static void ggml_compute_forward_soft_max_back_f32(
|
||||
|
@ -19018,7 +19019,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|||
FILE * fout = ggml_fopen(fname, "wb");
|
||||
|
||||
if (!fout) {
|
||||
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
||||
fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -19155,7 +19156,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|||
{
|
||||
FILE * fin = ggml_fopen(fname, "rb");
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
||||
fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -19478,7 +19479,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|||
|
||||
fprintf(fp, "digraph G {\n");
|
||||
fprintf(fp, " newrank = true;\n");
|
||||
fprintf(fp, " rankdir = LR;\n");
|
||||
fprintf(fp, " rankdir = TB;\n");
|
||||
|
||||
for (int i = 0; i < gb->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gb->nodes[i];
|
||||
|
@ -19540,7 +19541,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|||
}
|
||||
|
||||
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
||||
if (ggml_nelements(node) < 5) {
|
||||
if (ggml_nelements(node) < 5 && node->data != NULL) {
|
||||
fprintf(fp, " | (");
|
||||
for (int j = 0; j < ggml_nelements(node); j++) {
|
||||
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
||||
|
@ -20829,6 +20830,7 @@ struct gguf_context * gguf_init_empty(void) {
|
|||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||
FILE * file = ggml_fopen(fname, "rb");
|
||||
if (!file) {
|
||||
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -21995,6 +21997,14 @@ int ggml_cpu_has_rpc(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_cann(void) {
|
||||
#if defined(GGML_USE_CANN)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
||||
}
|
||||
|
|
|
@ -78,5 +78,13 @@ python -m build
|
|||
python -m twine upload dist/*
|
||||
```
|
||||
|
||||
## Run Unit Tests
|
||||
|
||||
From root of this repository you can run this command to run all the unit tests
|
||||
|
||||
```bash
|
||||
python -m unittest discover ./gguf-py -v
|
||||
```
|
||||
|
||||
## TODO
|
||||
- [ ] Include conversion scripts as command line entry points in this package.
|
||||
|
|
|
@ -5,3 +5,5 @@ from .gguf_writer import *
|
|||
from .quants import *
|
||||
from .tensor_mapping import *
|
||||
from .vocab import *
|
||||
from .utility import *
|
||||
from .metadata import *
|
||||
|
|
|
@ -19,18 +19,60 @@ GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h
|
|||
|
||||
class Keys:
|
||||
class General:
|
||||
TYPE = "general.type"
|
||||
ARCHITECTURE = "general.architecture"
|
||||
QUANTIZATION_VERSION = "general.quantization_version"
|
||||
ALIGNMENT = "general.alignment"
|
||||
FILE_TYPE = "general.file_type"
|
||||
|
||||
# Authorship Metadata
|
||||
NAME = "general.name"
|
||||
AUTHOR = "general.author"
|
||||
VERSION = "general.version"
|
||||
URL = "general.url"
|
||||
ORGANIZATION = "general.organization"
|
||||
|
||||
FINETUNE = "general.finetune"
|
||||
BASENAME = "general.basename"
|
||||
|
||||
DESCRIPTION = "general.description"
|
||||
QUANTIZED_BY = "general.quantized_by"
|
||||
|
||||
SIZE_LABEL = "general.size_label"
|
||||
|
||||
# Licensing details
|
||||
LICENSE = "general.license"
|
||||
SOURCE_URL = "general.source.url"
|
||||
SOURCE_HF_REPO = "general.source.huggingface.repository"
|
||||
FILE_TYPE = "general.file_type"
|
||||
LICENSE_NAME = "general.license.name"
|
||||
LICENSE_LINK = "general.license.link"
|
||||
|
||||
# Typically represents the converted GGUF repo (Unless native)
|
||||
URL = "general.url" # Model Website/Paper
|
||||
DOI = "general.doi"
|
||||
UUID = "general.uuid"
|
||||
REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...)
|
||||
|
||||
# Model Source during conversion
|
||||
SOURCE_URL = "general.source.url" # Model Website/Paper
|
||||
SOURCE_DOI = "general.source.doi"
|
||||
SOURCE_UUID = "general.source.uuid"
|
||||
SOURCE_REPO_URL = "general.source.repo_url" # Model Source Repository (git/svn/etc...)
|
||||
|
||||
# Base Model Source. There can be more than one source if it's a merged
|
||||
# model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
|
||||
# tracing linage of models as it is finetuned or merged over time.
|
||||
BASE_MODEL_COUNT = "general.base_model.count"
|
||||
BASE_MODEL_NAME = "general.base_model.{id}.name"
|
||||
BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
|
||||
BASE_MODEL_VERSION = "general.base_model.{id}.version"
|
||||
BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
|
||||
BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
|
||||
BASE_MODEL_DOI = "general.base_model.{id}.doi"
|
||||
BASE_MODEL_UUID = "general.base_model.{id}.uuid"
|
||||
BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
|
||||
|
||||
# Array based KV stores
|
||||
TAGS = "general.tags"
|
||||
LANGUAGES = "general.languages"
|
||||
DATASETS = "general.datasets"
|
||||
|
||||
class LLM:
|
||||
VOCAB_SIZE = "{arch}.vocab_size"
|
||||
|
@ -120,11 +162,20 @@ class Keys:
|
|||
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
||||
EOT_ID = "tokenizer.ggml.eot_token_id"
|
||||
|
||||
class Adapter:
|
||||
TYPE = "adapter.type"
|
||||
LORA_ALPHA = "adapter.lora.alpha"
|
||||
|
||||
#
|
||||
# recommended mapping of model tensor names for storage in gguf
|
||||
#
|
||||
|
||||
|
||||
class GGUFType:
|
||||
MODEL = "model"
|
||||
ADAPTER = "adapter"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA = auto()
|
||||
FALCON = auto()
|
||||
|
@ -1223,7 +1274,6 @@ KEY_GENERAL_URL = Keys.General.URL
|
|||
KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
|
||||
KEY_GENERAL_LICENSE = Keys.General.LICENSE
|
||||
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
|
||||
KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
|
||||
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
|
||||
|
||||
# LLM
|
||||
|
|
|
@ -7,6 +7,7 @@ import struct
|
|||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from io import BufferedWriter
|
||||
from typing import IO, Any, Sequence, Mapping
|
||||
|
@ -106,6 +107,53 @@ class GGUFWriter:
|
|||
|
||||
self.add_architecture()
|
||||
|
||||
def get_total_parameter_count(self) -> tuple[int, int, int, int]:
|
||||
total_params = 0
|
||||
shared_params = 0
|
||||
expert_params = 0
|
||||
|
||||
expert_sum = 0
|
||||
n_expert_tensors = 0
|
||||
|
||||
last_lora_a: tuple[str, TensorInfo] | None = None
|
||||
|
||||
for tensors in self.tensors:
|
||||
for name, info in tensors.items():
|
||||
|
||||
shape = info.shape
|
||||
|
||||
if name.endswith(".lora_a"):
|
||||
last_lora_a = (name, info)
|
||||
continue
|
||||
elif name.endswith(".lora_b"):
|
||||
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
|
||||
# Bail when the LoRA pair can't be found trivially
|
||||
logger.warning("can't measure LoRA size correctly, tensor order is unusual")
|
||||
return 0, 0, 0, 0
|
||||
else:
|
||||
shape = (*shape[:-1], last_lora_a[1].shape[-1])
|
||||
|
||||
size = prod(shape)
|
||||
|
||||
if "_exps." in name:
|
||||
expert_params += (size // shape[-3])
|
||||
expert_sum += shape[-3]
|
||||
n_expert_tensors += 1
|
||||
else:
|
||||
shared_params += size
|
||||
|
||||
total_params += size
|
||||
|
||||
# Hopefully this should work even for variable-expert-count models
|
||||
expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
|
||||
|
||||
# Negate the total to signal it's likely not exact
|
||||
if last_lora_a is not None:
|
||||
total_params = -total_params
|
||||
|
||||
# NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
|
||||
return total_params, shared_params, expert_params, expert_count
|
||||
|
||||
def format_shard_names(self, path: Path) -> list[Path]:
|
||||
if len(self.tensors) == 1:
|
||||
return [path]
|
||||
|
@ -115,6 +163,7 @@ class GGUFWriter:
|
|||
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
|
||||
# allow calling this multiple times as long as the path is the same
|
||||
return
|
||||
|
||||
if self.state is not WriterState.NO_FILE:
|
||||
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
|
||||
|
||||
|
@ -136,6 +185,8 @@ class GGUFWriter:
|
|||
|
||||
if self.dry_run:
|
||||
logger.info("Dry run, not writing files")
|
||||
for name in filenames:
|
||||
print(name) # noqa: NP100
|
||||
exit()
|
||||
|
||||
return filenames
|
||||
|
@ -424,32 +475,18 @@ class GGUFWriter:
|
|||
fout.close()
|
||||
self.fout = None
|
||||
|
||||
def add_type(self, type_name: str) -> None:
|
||||
self.add_string(Keys.General.TYPE, type_name)
|
||||
|
||||
def add_architecture(self) -> None:
|
||||
self.add_string(Keys.General.ARCHITECTURE, self.arch)
|
||||
|
||||
def add_author(self, author: str) -> None:
|
||||
self.add_string(Keys.General.AUTHOR, author)
|
||||
def add_quantization_version(self, quantization_version: int) -> None:
|
||||
self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)
|
||||
|
||||
def add_version(self, version: str) -> None:
|
||||
self.add_string(Keys.General.VERSION, version)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str) -> None:
|
||||
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_url(self, url: str) -> None:
|
||||
self.add_string(Keys.General.URL, url)
|
||||
|
||||
def add_description(self, description: str) -> None:
|
||||
self.add_string(Keys.General.DESCRIPTION, description)
|
||||
|
||||
def add_licence(self, licence: str) -> None:
|
||||
self.add_string(Keys.General.LICENSE, licence)
|
||||
|
||||
def add_source_url(self, url: str) -> None:
|
||||
self.add_string(Keys.General.SOURCE_URL, url)
|
||||
|
||||
def add_source_hf_repo(self, repo: str) -> None:
|
||||
self.add_string(Keys.General.SOURCE_HF_REPO, repo)
|
||||
def add_custom_alignment(self, alignment: int) -> None:
|
||||
self.data_alignment = alignment
|
||||
self.add_uint32(Keys.General.ALIGNMENT, alignment)
|
||||
|
||||
def add_file_type(self, ftype: int) -> None:
|
||||
self.add_uint32(Keys.General.FILE_TYPE, ftype)
|
||||
|
@ -457,13 +494,101 @@ class GGUFWriter:
|
|||
def add_name(self, name: str) -> None:
|
||||
self.add_string(Keys.General.NAME, name)
|
||||
|
||||
def add_quantization_version(self, quantization_version: int) -> None:
|
||||
self.add_uint32(
|
||||
Keys.General.QUANTIZATION_VERSION, quantization_version)
|
||||
def add_author(self, author: str) -> None:
|
||||
self.add_string(Keys.General.AUTHOR, author)
|
||||
|
||||
def add_custom_alignment(self, alignment: int) -> None:
|
||||
self.data_alignment = alignment
|
||||
self.add_uint32(Keys.General.ALIGNMENT, alignment)
|
||||
def add_version(self, version: str) -> None:
|
||||
self.add_string(Keys.General.VERSION, version)
|
||||
|
||||
def add_organization(self, organization: str) -> None:
|
||||
self.add_string(Keys.General.ORGANIZATION, organization)
|
||||
|
||||
def add_finetune(self, finetune: str) -> None:
|
||||
self.add_string(Keys.General.FINETUNE, finetune)
|
||||
|
||||
def add_basename(self, basename: str) -> None:
|
||||
self.add_string(Keys.General.BASENAME, basename)
|
||||
|
||||
def add_description(self, description: str) -> None:
|
||||
self.add_string(Keys.General.DESCRIPTION, description)
|
||||
|
||||
def add_quantized_by(self, quantized: str) -> None:
|
||||
self.add_string(Keys.General.QUANTIZED_BY, quantized)
|
||||
|
||||
def add_size_label(self, size_label: str) -> None:
|
||||
self.add_string(Keys.General.SIZE_LABEL, size_label)
|
||||
|
||||
def add_license(self, license: str) -> None:
|
||||
self.add_string(Keys.General.LICENSE, license)
|
||||
|
||||
def add_license_name(self, license: str) -> None:
|
||||
self.add_string(Keys.General.LICENSE_NAME, license)
|
||||
|
||||
def add_license_link(self, license: str) -> None:
|
||||
self.add_string(Keys.General.LICENSE_LINK, license)
|
||||
|
||||
def add_url(self, url: str) -> None:
|
||||
self.add_string(Keys.General.URL, url)
|
||||
|
||||
def add_doi(self, doi: str) -> None:
|
||||
self.add_string(Keys.General.DOI, doi)
|
||||
|
||||
def add_uuid(self, uuid: str) -> None:
|
||||
self.add_string(Keys.General.UUID, uuid)
|
||||
|
||||
def add_repo_url(self, repo_url: str) -> None:
|
||||
self.add_string(Keys.General.REPO_URL, repo_url)
|
||||
|
||||
def add_source_url(self, url: str) -> None:
|
||||
self.add_string(Keys.General.SOURCE_URL, url)
|
||||
|
||||
def add_source_doi(self, doi: str) -> None:
|
||||
self.add_string(Keys.General.SOURCE_DOI, doi)
|
||||
|
||||
def add_source_uuid(self, uuid: str) -> None:
|
||||
self.add_string(Keys.General.SOURCE_UUID, uuid)
|
||||
|
||||
def add_source_repo_url(self, repo_url: str) -> None:
|
||||
self.add_string(Keys.General.SOURCE_REPO_URL, repo_url)
|
||||
|
||||
def add_base_model_count(self, source_count: int) -> None:
|
||||
self.add_uint32(Keys.General.BASE_MODEL_COUNT, source_count)
|
||||
|
||||
def add_base_model_name(self, source_id: int, name: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_NAME.format(id=source_id), name)
|
||||
|
||||
def add_base_model_author(self, source_id: int, author: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_AUTHOR.format(id=source_id), author)
|
||||
|
||||
def add_base_model_version(self, source_id: int, version: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version)
|
||||
|
||||
def add_base_model_organization(self, source_id: int, organization: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)
|
||||
|
||||
def add_base_model_url(self, source_id: int, url: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
|
||||
|
||||
def add_base_model_doi(self, source_id: int, doi: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_DOI.format(id=source_id), doi)
|
||||
|
||||
def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_UUID.format(id=source_id), uuid)
|
||||
|
||||
def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
|
||||
self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
|
||||
|
||||
def add_tags(self, tags: Sequence[str]) -> None:
|
||||
self.add_array(Keys.General.TAGS, tags)
|
||||
|
||||
def add_languages(self, languages: Sequence[str]) -> None:
|
||||
self.add_array(Keys.General.LANGUAGES, languages)
|
||||
|
||||
def add_datasets(self, datasets: Sequence[str]) -> None:
|
||||
self.add_array(Keys.General.DATASETS, datasets)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str) -> None:
|
||||
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_vocab_size(self, size: int) -> None:
|
||||
self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
|
||||
|
|
|
@ -3,7 +3,6 @@ from abc import ABC, ABCMeta, abstractmethod
|
|||
|
||||
import logging
|
||||
from typing import Any, Callable
|
||||
from collections import deque
|
||||
|
||||
import numpy as np
|
||||
from numpy.typing import DTypeLike
|
||||
|
@ -74,20 +73,18 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
|||
_tensor_type: type
|
||||
_meta: Any
|
||||
_data: Any | None
|
||||
_lazy: deque[LazyBase] # shared within a graph, to avoid deep recursion when making eager
|
||||
_args: tuple
|
||||
_func: Callable[[tuple], Any] | None
|
||||
_kwargs: dict[str, Any]
|
||||
_func: Callable[[Any], Any] | None
|
||||
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
|
||||
super().__init__()
|
||||
self._meta = meta
|
||||
self._data = data
|
||||
self._lazy = lazy if lazy is not None else deque()
|
||||
self._args = args
|
||||
self._kwargs = kwargs if kwargs is not None else {}
|
||||
self._func = func
|
||||
assert self._func is not None or self._data is not None
|
||||
if self._data is None:
|
||||
self._lazy.append(self)
|
||||
|
||||
def __init_subclass__(cls) -> None:
|
||||
if "_tensor_type" not in cls.__dict__:
|
||||
|
@ -117,6 +114,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
|||
args = ((use_self,) if use_self is not None else ()) + args
|
||||
|
||||
meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
|
||||
# TODO: maybe handle tensors in kwargs too
|
||||
|
||||
if isinstance(meta_noop, bool) and not meta_noop:
|
||||
try:
|
||||
|
@ -140,23 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
|||
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
|
||||
|
||||
if isinstance(res, cls._tensor_type):
|
||||
class CollectSharedLazy:
|
||||
# emulating a static variable
|
||||
shared_lazy: None | deque[LazyBase] = None
|
||||
|
||||
@staticmethod
|
||||
def collect_replace(t: LazyBase):
|
||||
if CollectSharedLazy.shared_lazy is None:
|
||||
CollectSharedLazy.shared_lazy = t._lazy
|
||||
else:
|
||||
CollectSharedLazy.shared_lazy.extend(t._lazy)
|
||||
t._lazy = CollectSharedLazy.shared_lazy
|
||||
|
||||
LazyBase._recurse_apply(args, CollectSharedLazy.collect_replace)
|
||||
|
||||
shared_lazy = CollectSharedLazy.shared_lazy
|
||||
|
||||
return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
|
||||
return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
|
||||
else:
|
||||
del res # not needed
|
||||
# non-tensor return likely relies on the contents of the args
|
||||
|
@ -168,26 +150,18 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
|||
@classmethod
|
||||
def to_eager(cls, t: Any) -> Any:
|
||||
def simple_to_eager(_t: LazyBase) -> Any:
|
||||
def already_eager_to_eager(_t: LazyBase) -> Any:
|
||||
assert _t._data is not None
|
||||
if _t._data is not None:
|
||||
return _t._data
|
||||
|
||||
while _t._data is None:
|
||||
lt = _t._lazy.popleft()
|
||||
if lt._data is not None:
|
||||
# Lazy tensor did not belong in the lazy queue.
|
||||
# Weirdly only happens with Bloom models...
|
||||
# likely because tensors aren't unique in the queue.
|
||||
# The final output is still the same as in eager mode,
|
||||
# so it's safe to ignore this.
|
||||
continue
|
||||
assert lt._func is not None
|
||||
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
|
||||
lt._data = lt._func(lt._args)
|
||||
# NOTE: there's a recursion limit in Python (usually 1000)
|
||||
|
||||
assert _t._func is not None
|
||||
_t._args = cls._recurse_apply(_t._args, simple_to_eager)
|
||||
_t._data = _t._func(*_t._args, **_t._kwargs)
|
||||
# sanity check
|
||||
assert lt._data is not None
|
||||
assert lt._data.dtype == lt._meta.dtype
|
||||
assert lt._data.shape == lt._meta.shape
|
||||
assert _t._data is not None
|
||||
assert _t._data.dtype == _t._meta.dtype
|
||||
assert _t._data.shape == _t._meta.shape
|
||||
|
||||
return _t._data
|
||||
|
||||
|
@ -206,7 +180,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
|||
@classmethod
|
||||
def from_eager(cls, t: Any) -> Any:
|
||||
if type(t) is cls:
|
||||
# already eager
|
||||
# already lazy
|
||||
return t
|
||||
elif isinstance(t, cls._tensor_type):
|
||||
return cls(meta=cls.eager_to_meta(t), data=t)
|
||||
|
@ -228,8 +202,7 @@ class LazyNumpyTensor(LazyBase):
|
|||
def astype(self, dtype, *args, **kwargs):
|
||||
meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
|
||||
full_args = (self, dtype,) + args
|
||||
# very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
|
||||
return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
|
||||
return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
|
||||
|
||||
def tofile(self, *args, **kwargs):
|
||||
eager = LazyNumpyTensor.to_eager(self)
|
||||
|
|
486
gguf-py/gguf/metadata.py
Normal file
486
gguf-py/gguf/metadata.py
Normal file
|
@ -0,0 +1,486 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import json
|
||||
import yaml
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .constants import Keys
|
||||
|
||||
import gguf
|
||||
|
||||
logger = logging.getLogger("metadata")
|
||||
|
||||
|
||||
@dataclass
|
||||
class Metadata:
|
||||
# Authorship Metadata to be written to GGUF KV Store
|
||||
name: Optional[str] = None
|
||||
author: Optional[str] = None
|
||||
version: Optional[str] = None
|
||||
organization: Optional[str] = None
|
||||
finetune: Optional[str] = None
|
||||
basename: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
quantized_by: Optional[str] = None
|
||||
size_label: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
doi: Optional[str] = None
|
||||
uuid: Optional[str] = None
|
||||
repo_url: Optional[str] = None
|
||||
source_url: Optional[str] = None
|
||||
source_doi: Optional[str] = None
|
||||
source_uuid: Optional[str] = None
|
||||
source_repo_url: Optional[str] = None
|
||||
license: Optional[str] = None
|
||||
license_name: Optional[str] = None
|
||||
license_link: Optional[str] = None
|
||||
base_models: Optional[list[dict]] = None
|
||||
tags: Optional[list[str]] = None
|
||||
languages: Optional[list[str]] = None
|
||||
datasets: Optional[list[str]] = None
|
||||
|
||||
@staticmethod
|
||||
def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata:
|
||||
# This grabs as many contextual authorship metadata as possible from the model repository
|
||||
# making any conversion as required to match the gguf kv store metadata format
|
||||
# as well as giving users the ability to override any authorship metadata that may be incorrect
|
||||
|
||||
# Create a new Metadata instance
|
||||
metadata = Metadata()
|
||||
|
||||
model_card = Metadata.load_model_card(model_path)
|
||||
hf_params = Metadata.load_hf_parameters(model_path)
|
||||
|
||||
# heuristics
|
||||
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
|
||||
|
||||
# Metadata Override File Provided
|
||||
# This is based on LLM_KV_NAMES mapping in llama.cpp
|
||||
metadata_override = Metadata.load_metadata_override(metadata_override_path)
|
||||
|
||||
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
|
||||
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
|
||||
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
|
||||
metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)
|
||||
|
||||
metadata.finetune = metadata_override.get(Keys.General.FINETUNE, metadata.finetune)
|
||||
metadata.basename = metadata_override.get(Keys.General.BASENAME, metadata.basename)
|
||||
|
||||
metadata.description = metadata_override.get(Keys.General.DESCRIPTION, metadata.description)
|
||||
metadata.quantized_by = metadata_override.get(Keys.General.QUANTIZED_BY, metadata.quantized_by)
|
||||
|
||||
metadata.size_label = metadata_override.get(Keys.General.SIZE_LABEL, metadata.size_label)
|
||||
metadata.license_name = metadata_override.get(Keys.General.LICENSE_NAME, metadata.license_name)
|
||||
metadata.license_link = metadata_override.get(Keys.General.LICENSE_LINK, metadata.license_link)
|
||||
|
||||
metadata.url = metadata_override.get(Keys.General.URL, metadata.url)
|
||||
metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi)
|
||||
metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid)
|
||||
metadata.repo_url = metadata_override.get(Keys.General.REPO_URL, metadata.repo_url)
|
||||
|
||||
metadata.source_url = metadata_override.get(Keys.General.SOURCE_URL, metadata.source_url)
|
||||
metadata.source_doi = metadata_override.get(Keys.General.SOURCE_DOI, metadata.source_doi)
|
||||
metadata.source_uuid = metadata_override.get(Keys.General.SOURCE_UUID, metadata.source_uuid)
|
||||
metadata.source_repo_url = metadata_override.get(Keys.General.SOURCE_REPO_URL, metadata.source_repo_url)
|
||||
|
||||
# Base Models is received here as an array of models
|
||||
metadata.base_models = metadata_override.get("general.base_models", metadata.base_models)
|
||||
|
||||
metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
|
||||
metadata.languages = metadata_override.get(Keys.General.LANGUAGES, metadata.languages)
|
||||
metadata.datasets = metadata_override.get(Keys.General.DATASETS, metadata.datasets)
|
||||
|
||||
# Direct Metadata Override (via direct cli argument)
|
||||
if model_name is not None:
|
||||
metadata.name = model_name
|
||||
|
||||
return metadata
|
||||
|
||||
@staticmethod
|
||||
def load_metadata_override(metadata_override_path: Optional[Path] = None) -> dict[str, Any]:
|
||||
if metadata_override_path is None or not metadata_override_path.is_file():
|
||||
return {}
|
||||
|
||||
with open(metadata_override_path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
|
||||
if model_path is None or not model_path.is_dir():
|
||||
return {}
|
||||
|
||||
model_card_path = model_path / "README.md"
|
||||
|
||||
if not model_card_path.is_file():
|
||||
return {}
|
||||
|
||||
# The model card metadata is assumed to always be in YAML
|
||||
# ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
|
||||
with open(model_card_path, "r", encoding="utf-8") as f:
|
||||
if f.readline() == "---\n":
|
||||
raw = f.read().partition("---\n")[0]
|
||||
data = yaml.safe_load(raw)
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
else:
|
||||
logger.error(f"while reading YAML model card frontmatter, data is {type(data)} instead of dict")
|
||||
return {}
|
||||
else:
|
||||
return {}
|
||||
|
||||
@staticmethod
|
||||
def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
|
||||
if model_path is None or not model_path.is_dir():
|
||||
return {}
|
||||
|
||||
config_path = model_path / "config.json"
|
||||
|
||||
if not config_path.is_file():
|
||||
return {}
|
||||
|
||||
with open(config_path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def id_to_title(string):
|
||||
# Convert capitalization into title form unless acronym or version number
|
||||
return ' '.join([w.title() if w.islower() and not re.match(r'^(v\d+(?:\.\d+)*|\d.*)$', w) else w for w in string.strip().replace('-', ' ').split()])
|
||||
|
||||
@staticmethod
|
||||
def get_model_id_components(model_id: Optional[str] = None, total_params: int = 0) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]:
|
||||
# Huggingface often store model id as '<org>/<model name>'
|
||||
# so let's parse it and apply some heuristics if possible for model name components
|
||||
|
||||
if model_id is None:
|
||||
# model ID missing
|
||||
return None, None, None, None, None, None
|
||||
|
||||
if ' ' in model_id:
|
||||
# model ID is actually a normal human sentence
|
||||
# which means its most likely a normal model name only
|
||||
# not part of the hugging face naming standard, but whatever
|
||||
return model_id, None, None, None, None, None
|
||||
|
||||
if '/' in model_id:
|
||||
# model ID (huggingface style)
|
||||
org_component, model_full_name_component = model_id.split('/', 1)
|
||||
else:
|
||||
# model ID but missing org components
|
||||
org_component, model_full_name_component = None, model_id
|
||||
|
||||
# Check if we erroneously matched against './' or '../' etc...
|
||||
if org_component is not None and org_component[0] == '.':
|
||||
org_component = None
|
||||
|
||||
name_parts: list[str] = model_full_name_component.split('-')
|
||||
name_types: list[
|
||||
set[Literal["basename", "size_label", "finetune", "version", "type"]]
|
||||
] = [set() for _ in name_parts]
|
||||
|
||||
# Annotate the name
|
||||
for i, part in enumerate(name_parts):
|
||||
# Version
|
||||
if re.fullmatch(r'(v|iter)?\d+([.]\d+)*', part, re.IGNORECASE):
|
||||
name_types[i].add("version")
|
||||
# Quant type (should not be there for base models, but still annotated)
|
||||
elif re.fullmatch(r'i?q\d(_\w)*|b?fp?(16|32)', part, re.IGNORECASE):
|
||||
name_types[i].add("type")
|
||||
name_parts[i] = part.upper()
|
||||
# Model size
|
||||
elif i > 0 and re.fullmatch(r'(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)', part, re.IGNORECASE):
|
||||
part = part.replace("_", ".")
|
||||
# Handle weird bloom-7b1 notation
|
||||
if part[-1].isdecimal():
|
||||
part = part[:-2] + "." + part[-1] + part[-2]
|
||||
# Normalize the size suffixes
|
||||
if len(part) > 1 and part[-2].isdecimal():
|
||||
if part[-1] in "kmbt":
|
||||
part = part[:-1] + part[-1].upper()
|
||||
if total_params != 0:
|
||||
try:
|
||||
label_params = float(part[:-1]) * pow(1000, " KMBT".find(part[-1]))
|
||||
# Only use it as a size label if it's close or bigger than the model size
|
||||
# Note that LoRA adapters don't necessarily include all layers,
|
||||
# so this is why bigger label sizes are accepted.
|
||||
# Do not use the size label when it's smaller than 1/8 of the model size
|
||||
if (total_params < 0 and label_params < abs(total_params) // 8) or (
|
||||
# Check both directions when the current model isn't a LoRA adapter
|
||||
total_params > 0 and abs(label_params - total_params) > 7 * total_params // 8
|
||||
):
|
||||
# Likely a context length
|
||||
name_types[i].add("finetune")
|
||||
# Lowercase the size when it's a context length
|
||||
part = part[:-1] + part[-1].lower()
|
||||
except ValueError:
|
||||
# Failed to convert the size label to float, use it anyway
|
||||
pass
|
||||
if len(name_types[i]) == 0:
|
||||
name_types[i].add("size_label")
|
||||
name_parts[i] = part
|
||||
# Some easy to recognize finetune names
|
||||
elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
|
||||
name_types[i].add("finetune")
|
||||
if part.lower() == "lora":
|
||||
name_parts[i] = "LoRA"
|
||||
|
||||
at_start = True
|
||||
# Find the basename through the annotated name
|
||||
for part, t in zip(name_parts, name_types):
|
||||
if at_start and ((len(t) == 0 and part[0].isalpha()) or "version" in t):
|
||||
t.add("basename")
|
||||
else:
|
||||
if at_start:
|
||||
at_start = False
|
||||
if len(t) == 0:
|
||||
t.add("finetune")
|
||||
|
||||
# Remove the basename annotation from trailing version
|
||||
for part, t in zip(reversed(name_parts), reversed(name_types)):
|
||||
if "basename" in t:
|
||||
if len(t) > 1:
|
||||
t.remove("basename")
|
||||
else:
|
||||
break
|
||||
|
||||
basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
|
||||
size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None
|
||||
finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
|
||||
# TODO: should the basename version always be excluded?
|
||||
# TODO: should multiple versions be joined together?
|
||||
version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1]
|
||||
|
||||
if size_label is None and finetune is None and version is None:
|
||||
# Too ambiguous, output nothing
|
||||
basename = None
|
||||
|
||||
return model_full_name_component, org_component, basename, finetune, version, size_label
|
||||
|
||||
@staticmethod
|
||||
def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = None, hf_params: Optional[dict] = None, model_path: Optional[Path] = None, total_params: int = 0) -> Metadata:
|
||||
# Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
|
||||
|
||||
# Model Card Heuristics
|
||||
########################
|
||||
if model_card is not None:
|
||||
|
||||
if "model_name" in model_card and metadata.name is None:
|
||||
# Not part of huggingface model card standard but notice some model creator using it
|
||||
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
|
||||
metadata.name = model_card.get("model_name")
|
||||
|
||||
if "model_creator" in model_card and metadata.author is None:
|
||||
# Not part of huggingface model card standard but notice some model creator using it
|
||||
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
|
||||
metadata.author = model_card.get("model_creator")
|
||||
|
||||
if "model_type" in model_card and metadata.basename is None:
|
||||
# Not part of huggingface model card standard but notice some model creator using it
|
||||
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
|
||||
metadata.basename = model_card.get("model_type")
|
||||
|
||||
if "base_model" in model_card:
|
||||
# This represents the parent models that this is based on
|
||||
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
|
||||
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
|
||||
metadata_base_models = []
|
||||
base_model_value = model_card.get("base_model", None)
|
||||
|
||||
if base_model_value is not None:
|
||||
if isinstance(base_model_value, str):
|
||||
metadata_base_models.append(base_model_value)
|
||||
elif isinstance(base_model_value, list):
|
||||
metadata_base_models.extend(base_model_value)
|
||||
|
||||
if metadata.base_models is None:
|
||||
metadata.base_models = []
|
||||
|
||||
for model_id in metadata_base_models:
|
||||
# NOTE: model size of base model is assumed to be similar to the size of the current model
|
||||
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
|
||||
base_model = {}
|
||||
if model_full_name_component is not None:
|
||||
base_model["name"] = Metadata.id_to_title(model_full_name_component)
|
||||
if org_component is not None:
|
||||
base_model["organization"] = Metadata.id_to_title(org_component)
|
||||
if version is not None:
|
||||
base_model["version"] = version
|
||||
if org_component is not None and model_full_name_component is not None:
|
||||
base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
|
||||
metadata.base_models.append(base_model)
|
||||
|
||||
if "license" in model_card and metadata.license is None:
|
||||
metadata.license = model_card.get("license")
|
||||
|
||||
if "license_name" in model_card and metadata.license_name is None:
|
||||
metadata.license_name = model_card.get("license_name")
|
||||
|
||||
if "license_link" in model_card and metadata.license_link is None:
|
||||
metadata.license_link = model_card.get("license_link")
|
||||
|
||||
tags_value = model_card.get("tags", None)
|
||||
if tags_value is not None:
|
||||
|
||||
if metadata.tags is None:
|
||||
metadata.tags = []
|
||||
|
||||
if isinstance(tags_value, str):
|
||||
metadata.tags.append(tags_value)
|
||||
elif isinstance(tags_value, list):
|
||||
metadata.tags.extend(tags_value)
|
||||
|
||||
pipeline_tags_value = model_card.get("pipeline_tag", None)
|
||||
if pipeline_tags_value is not None:
|
||||
|
||||
if metadata.tags is None:
|
||||
metadata.tags = []
|
||||
|
||||
if isinstance(pipeline_tags_value, str):
|
||||
metadata.tags.append(pipeline_tags_value)
|
||||
elif isinstance(pipeline_tags_value, list):
|
||||
metadata.tags.extend(pipeline_tags_value)
|
||||
|
||||
language_value = model_card.get("languages", model_card.get("language", None))
|
||||
if language_value is not None:
|
||||
|
||||
if metadata.languages is None:
|
||||
metadata.languages = []
|
||||
|
||||
if isinstance(language_value, str):
|
||||
metadata.languages.append(language_value)
|
||||
elif isinstance(language_value, list):
|
||||
metadata.languages.extend(language_value)
|
||||
|
||||
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
|
||||
if dataset_value is not None:
|
||||
|
||||
if metadata.datasets is None:
|
||||
metadata.datasets = []
|
||||
|
||||
if isinstance(dataset_value, str):
|
||||
metadata.datasets.append(dataset_value)
|
||||
elif isinstance(dataset_value, list):
|
||||
metadata.datasets.extend(dataset_value)
|
||||
|
||||
# Hugging Face Parameter Heuristics
|
||||
####################################
|
||||
|
||||
if hf_params is not None:
|
||||
|
||||
hf_name_or_path = hf_params.get("_name_or_path")
|
||||
if hf_name_or_path is not None and hf_name_or_path.count('/') <= 1:
|
||||
# Use _name_or_path only if its actually a model name and not some computer path
|
||||
# e.g. 'meta-llama/Llama-2-7b-hf'
|
||||
model_id = hf_name_or_path
|
||||
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
|
||||
if metadata.name is None and model_full_name_component is not None:
|
||||
metadata.name = Metadata.id_to_title(model_full_name_component)
|
||||
if metadata.organization is None and org_component is not None:
|
||||
metadata.organization = Metadata.id_to_title(org_component)
|
||||
if metadata.basename is None and basename is not None:
|
||||
metadata.basename = basename
|
||||
if metadata.finetune is None and finetune is not None:
|
||||
metadata.finetune = finetune
|
||||
if metadata.version is None and version is not None:
|
||||
metadata.version = version
|
||||
if metadata.size_label is None and size_label is not None:
|
||||
metadata.size_label = size_label
|
||||
|
||||
# Directory Folder Name Fallback Heuristics
|
||||
############################################
|
||||
if model_path is not None:
|
||||
model_id = model_path.name
|
||||
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
|
||||
if metadata.name is None and model_full_name_component is not None:
|
||||
metadata.name = Metadata.id_to_title(model_full_name_component)
|
||||
if metadata.organization is None and org_component is not None:
|
||||
metadata.organization = Metadata.id_to_title(org_component)
|
||||
if metadata.basename is None and basename is not None:
|
||||
metadata.basename = basename
|
||||
if metadata.finetune is None and finetune is not None:
|
||||
metadata.finetune = finetune
|
||||
if metadata.version is None and version is not None:
|
||||
metadata.version = version
|
||||
if metadata.size_label is None and size_label is not None:
|
||||
metadata.size_label = size_label
|
||||
|
||||
return metadata
|
||||
|
||||
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
|
||||
assert self.name is not None
|
||||
gguf_writer.add_name(self.name)
|
||||
|
||||
if self.author is not None:
|
||||
gguf_writer.add_author(self.author)
|
||||
if self.version is not None:
|
||||
gguf_writer.add_version(self.version)
|
||||
if self.organization is not None:
|
||||
gguf_writer.add_organization(self.organization)
|
||||
|
||||
if self.finetune is not None:
|
||||
gguf_writer.add_finetune(self.finetune)
|
||||
if self.basename is not None:
|
||||
gguf_writer.add_basename(self.basename)
|
||||
|
||||
if self.description is not None:
|
||||
gguf_writer.add_description(self.description)
|
||||
if self.quantized_by is not None:
|
||||
gguf_writer.add_quantized_by(self.quantized_by)
|
||||
|
||||
if self.size_label is not None:
|
||||
gguf_writer.add_size_label(self.size_label)
|
||||
|
||||
if self.license is not None:
|
||||
gguf_writer.add_license(self.license)
|
||||
if self.license_name is not None:
|
||||
gguf_writer.add_license_name(self.license_name)
|
||||
if self.license_link is not None:
|
||||
gguf_writer.add_license_link(self.license_link)
|
||||
|
||||
if self.url is not None:
|
||||
gguf_writer.add_url(self.url)
|
||||
if self.doi is not None:
|
||||
gguf_writer.add_doi(self.doi)
|
||||
if self.uuid is not None:
|
||||
gguf_writer.add_uuid(self.uuid)
|
||||
if self.repo_url is not None:
|
||||
gguf_writer.add_repo_url(self.repo_url)
|
||||
|
||||
if self.source_url is not None:
|
||||
gguf_writer.add_source_url(self.source_url)
|
||||
if self.source_doi is not None:
|
||||
gguf_writer.add_source_doi(self.source_doi)
|
||||
if self.source_uuid is not None:
|
||||
gguf_writer.add_source_uuid(self.source_uuid)
|
||||
if self.source_repo_url is not None:
|
||||
gguf_writer.add_source_repo_url(self.source_repo_url)
|
||||
|
||||
if self.base_models is not None:
|
||||
gguf_writer.add_base_model_count(len(self.base_models))
|
||||
for key, base_model_entry in enumerate(self.base_models):
|
||||
if "name" in base_model_entry:
|
||||
gguf_writer.add_base_model_name(key, base_model_entry["name"])
|
||||
if "author" in base_model_entry:
|
||||
gguf_writer.add_base_model_author(key, base_model_entry["author"])
|
||||
if "version" in base_model_entry:
|
||||
gguf_writer.add_base_model_version(key, base_model_entry["version"])
|
||||
if "organization" in base_model_entry:
|
||||
gguf_writer.add_base_model_organization(key, base_model_entry["organization"])
|
||||
if "url" in base_model_entry:
|
||||
gguf_writer.add_base_model_url(key, base_model_entry["url"])
|
||||
if "doi" in base_model_entry:
|
||||
gguf_writer.add_base_model_doi(key, base_model_entry["doi"])
|
||||
if "uuid" in base_model_entry:
|
||||
gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"])
|
||||
if "repo_url" in base_model_entry:
|
||||
gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"])
|
||||
|
||||
if self.tags is not None:
|
||||
gguf_writer.add_tags(self.tags)
|
||||
if self.languages is not None:
|
||||
gguf_writer.add_languages(self.languages)
|
||||
if self.datasets is not None:
|
||||
gguf_writer.add_datasets(self.datasets)
|
|
@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
|
|||
osize *= dim
|
||||
out = np.empty(shape=osize, dtype=otype)
|
||||
# compute over groups of 16 rows (arbitrary, but seems good for performance)
|
||||
n_groups = rows.shape[0] // 16
|
||||
n_groups = (rows.shape[0] // 16) or 1
|
||||
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
|
||||
return out.reshape(oshape)
|
||||
|
||||
|
|
|
@ -602,13 +602,11 @@ class TensorNameMap:
|
|||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
if tensor not in MODEL_TENSORS[arch]:
|
||||
continue
|
||||
# TODO: make this configurable
|
||||
n_experts = 160
|
||||
for xid in range(n_experts):
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid, xid = xid)
|
||||
key = key.format(bid = bid)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
|
|
69
gguf-py/gguf/utility.py
Normal file
69
gguf-py/gguf/utility.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Literal
|
||||
|
||||
|
||||
def fill_templated_filename(filename: str, output_type: str | None) -> str:
|
||||
# Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
|
||||
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
|
||||
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
|
||||
return filename.format(ftype_lowercase,
|
||||
outtype=ftype_lowercase, ftype=ftype_lowercase,
|
||||
OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)
|
||||
|
||||
|
||||
def model_weight_count_rounded_notation(model_params_count: int, min_digits: int = 2) -> str:
|
||||
if model_params_count > 1e12 :
|
||||
# Trillions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-12
|
||||
scale_suffix = "T"
|
||||
elif model_params_count > 1e9 :
|
||||
# Billions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-9
|
||||
scale_suffix = "B"
|
||||
elif model_params_count > 1e6 :
|
||||
# Millions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-6
|
||||
scale_suffix = "M"
|
||||
else:
|
||||
# Thousands Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-3
|
||||
scale_suffix = "K"
|
||||
|
||||
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip('0')), 0)
|
||||
|
||||
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
|
||||
|
||||
|
||||
def size_label(total_params: int, shared_params: int, expert_params: int, expert_count: int) -> str:
|
||||
|
||||
if expert_count > 0:
|
||||
pretty_size = model_weight_count_rounded_notation(abs(shared_params) + abs(expert_params), min_digits=2)
|
||||
size_class = f"{expert_count}x{pretty_size}"
|
||||
else:
|
||||
size_class = model_weight_count_rounded_notation(abs(total_params), min_digits=2)
|
||||
|
||||
return size_class
|
||||
|
||||
|
||||
def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
|
||||
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
|
||||
|
||||
if base_name is not None:
|
||||
name = base_name.strip().title().replace(' ', '-').replace('/', '-')
|
||||
elif model_name is not None:
|
||||
name = model_name.strip().title().replace(' ', '-').replace('/', '-')
|
||||
else:
|
||||
name = "ggml-model"
|
||||
|
||||
parameters = f"-{size_label}" if size_label is not None else ""
|
||||
|
||||
finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else ""
|
||||
|
||||
version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
|
||||
|
||||
encoding = f"-{output_type.strip().replace(' ', '-').upper()}" if output_type is not None else ""
|
||||
|
||||
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
|
||||
|
||||
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
|
|
@ -22,6 +22,7 @@ classifiers = [
|
|||
python = ">=3.8"
|
||||
numpy = ">=1.17"
|
||||
tqdm = ">=4.27"
|
||||
pyyaml = ">=5.1"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^5.2"
|
||||
|
|
1
gguf-py/tests/__init__.py
Normal file
1
gguf-py/tests/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
from .test_metadata import *
|
|
@ -1,7 +0,0 @@
|
|||
import gguf # noqa: F401 # pyright: ignore[reportUnusedImport]
|
||||
|
||||
# TODO: add tests
|
||||
|
||||
|
||||
def test_write_gguf() -> None:
|
||||
pass
|
158
gguf-py/tests/test_metadata.py
Executable file
158
gguf-py/tests/test_metadata.py
Executable file
|
@ -0,0 +1,158 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Necessary to load the local gguf package
|
||||
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import gguf
|
||||
|
||||
|
||||
class TestMetadataMethod(unittest.TestCase):
|
||||
|
||||
def test_id_to_title(self):
|
||||
self.assertEqual(gguf.Metadata.id_to_title("Mixtral-8x7B-Instruct-v0.1"), "Mixtral 8x7B Instruct v0.1")
|
||||
self.assertEqual(gguf.Metadata.id_to_title("Meta-Llama-3-8B"), "Meta Llama 3 8B")
|
||||
self.assertEqual(gguf.Metadata.id_to_title("hermes-2-pro-llama-3-8b-DPO"), "Hermes 2 Pro Llama 3 8b DPO")
|
||||
|
||||
def test_get_model_id_components(self):
|
||||
# This is the basic standard form with organization marker
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mistral/Mixtral-8x7B-Instruct-v0.1"),
|
||||
('Mixtral-8x7B-Instruct-v0.1', "Mistral", 'Mixtral', 'Instruct', 'v0.1', '8x7B'))
|
||||
|
||||
# Similar to basic standard form but without organization marker
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct-v0.1"),
|
||||
('Mixtral-8x7B-Instruct-v0.1', None, 'Mixtral', 'Instruct', 'v0.1', '8x7B'))
|
||||
|
||||
# Missing version
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct"),
|
||||
('Mixtral-8x7B-Instruct', None, 'Mixtral', 'Instruct', None, '8x7B'))
|
||||
|
||||
# Missing finetune
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-v0.1"),
|
||||
('Mixtral-8x7B-v0.1', None, 'Mixtral', None, 'v0.1', '8x7B'))
|
||||
|
||||
# Base name and size label only
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B"),
|
||||
('Mixtral-8x7B', None, 'Mixtral', None, None, '8x7B'))
|
||||
|
||||
# Base name and version only
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-v0.1"),
|
||||
('Mixtral-v0.1', None, 'Mixtral', None, 'v0.1', None))
|
||||
|
||||
## Edge Cases ##
|
||||
|
||||
# This is too ambiguous... best to err on caution and output nothing
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral"),
|
||||
('Mixtral', None, None, None, None, None))
|
||||
|
||||
# Basename has numbers mixed in and also size label provided. Must avoid capturing number in basename
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
|
||||
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
|
||||
|
||||
# Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing...
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
|
||||
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
|
||||
|
||||
# Capture 'sub size labels' e.g. A14B in '57B-A14B' usually refers to activated params/weight count
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen2-57B-A14B-Instruct"),
|
||||
('Qwen2-57B-A14B-Instruct', None, 'Qwen2', 'Instruct', None, '57B-A14B'))
|
||||
|
||||
# Check that it can handle a real model id with no version code
|
||||
# Note that 4k in this string is non standard and microsoft were referring to context length rather than weight count
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Phi-3-mini-4k-instruct", 4 * 10**9),
|
||||
('Phi-3-mini-4k-instruct', 'microsoft', 'Phi-3', '4k-instruct', None, 'mini'))
|
||||
|
||||
# There is some legitimate models with only thousands of parameters
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
|
||||
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
|
||||
|
||||
# None standard and not easy to disambiguate
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
|
||||
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
|
||||
|
||||
# This is a real model_id where they append 2DPO to refer to Direct Preference Optimization
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("crestf411/daybreak-kunoichi-2dpo-7b"),
|
||||
('daybreak-kunoichi-2dpo-7b', 'crestf411', 'daybreak-kunoichi', '2dpo', None, '7B'))
|
||||
|
||||
# This is a real model id where the weight size has a decimal point
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen2-0.5B-Instruct"),
|
||||
('Qwen2-0.5B-Instruct', None, 'Qwen2', 'Instruct', None, '0.5B'))
|
||||
|
||||
# Uses an underscore in the size label
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("smallcloudai/Refact-1_6B-fim"),
|
||||
('Refact-1_6B-fim', 'smallcloudai', 'Refact', 'fim', None, '1.6B'))
|
||||
|
||||
# Uses Iter3 for the version
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3"),
|
||||
('Gemma-2-9B-It-SPPO-Iter3', 'UCLA-AGI', 'Gemma-2', 'It-SPPO', 'Iter3', '9B'))
|
||||
|
||||
# Has two potential versions in the basename
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Hermes-2-Theta-Llama-3-8B"),
|
||||
('Hermes-2-Theta-Llama-3-8B', 'NousResearch', 'Hermes-2-Theta-Llama-3', None, None, '8B'))
|
||||
|
||||
# Potential version in the basename
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("SeaLLMs/SeaLLMs-v3-7B-Chat"),
|
||||
('SeaLLMs-v3-7B-Chat', 'SeaLLMs', 'SeaLLMs-v3', 'Chat', None, '7B'))
|
||||
|
||||
# Underscore in the basename, and 1m for the context size
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("internlm/internlm2_5-7b-chat-1m", 7 * 10**9),
|
||||
('internlm2_5-7b-chat-1m', 'internlm', 'internlm2_5', 'chat-1m', None, '7B'))
|
||||
|
||||
# Version before the finetune name
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("pszemraj/jamba-900M-v0.13-KIx2"),
|
||||
('jamba-900M-v0.13-KIx2', 'pszemraj', 'jamba', 'KIx2', 'v0.13', '900M'))
|
||||
|
||||
# TODO: hf suffix which could be ignored but isn't
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("state-spaces/mamba-2.8b-hf"),
|
||||
('mamba-2.8b-hf', 'state-spaces', 'mamba', 'hf', None, '2.8B'))
|
||||
|
||||
# Two sizes, don't merge them, the other is the number of tokens on which it was trained
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("abacaj/llama-161M-100B", 161 * 10**6),
|
||||
('llama-161M-100B', 'abacaj', 'llama', '100b', None, '161M'))
|
||||
|
||||
# It's a trap, there is no size label
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("SparseLLM/relu-100B", 1340 * 10**6),
|
||||
('relu-100B', 'SparseLLM', 'relu', '100b', None, None))
|
||||
|
||||
# Weird size notation
|
||||
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
|
||||
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
|
||||
|
||||
def test_apply_metadata_heuristic_from_model_card(self):
|
||||
model_card = {
|
||||
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
|
||||
'model-index': [{'name': 'Mixtral-8x7B-Instruct-v0.1', 'results': []}],
|
||||
'language': ['en'],
|
||||
'datasets': ['teknium/OpenHermes-2.5'],
|
||||
'widget': [{'example_title': 'Hermes 2 Pro', 'messages': [{'role': 'system', 'content': 'You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.'}, {'role': 'user', 'content': 'Write a short story about Goku discovering kirby has teamed up with Majin Buu to destroy the world.'}]}],
|
||||
'base_model': ["EmbeddedLLM/Mistral-7B-Merge-14-v0", "janai-hq/trinity-v1"]
|
||||
}
|
||||
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
|
||||
expect = gguf.Metadata()
|
||||
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
|
||||
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
|
||||
expect.languages=['en']
|
||||
expect.datasets=['teknium/OpenHermes-2.5']
|
||||
|
||||
self.assertEqual(got, expect)
|
||||
|
||||
def test_apply_metadata_heuristic_from_hf_parameters(self):
|
||||
hf_params = {"_name_or_path": "./hermes-2-pro-llama-3-8b-DPO"}
|
||||
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card=None, hf_params=hf_params, model_path=None)
|
||||
expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B')
|
||||
self.assertEqual(got, expect)
|
||||
|
||||
def test_apply_metadata_heuristic_from_model_dir(self):
|
||||
model_dir_path = Path("./hermes-2-pro-llama-3-8b-DPO")
|
||||
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card=None, hf_params=None, model_path=model_dir_path)
|
||||
expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B')
|
||||
self.assertEqual(got, expect)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
|
@ -40,7 +40,7 @@
|
|||
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 6
|
||||
#define LLAMA_SESSION_VERSION 7
|
||||
|
||||
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
|
||||
#define LLAMA_STATE_SEQ_VERSION 1
|
||||
|
@ -133,7 +133,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
|
@ -411,6 +411,9 @@ extern "C" {
|
|||
const char * content;
|
||||
} llama_chat_message;
|
||||
|
||||
// lora adapter
|
||||
struct llama_lora_adapter;
|
||||
|
||||
// Helpers for getting default parameters
|
||||
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
|
@ -510,18 +513,28 @@ extern "C" {
|
|||
const char * fname_out,
|
||||
const llama_model_quantize_params * params);
|
||||
|
||||
// Apply a LoRA adapter to a loaded model
|
||||
// path_base_model is the path to a higher quality model to use as a base for
|
||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_lora,
|
||||
float scale,
|
||||
const char * path_base_model,
|
||||
int32_t n_threads);
|
||||
// Load a LoRA adapter from file
|
||||
// The loaded adapter will be associated to the given model, and will be free when the model is deleted
|
||||
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
|
||||
struct llama_model * model,
|
||||
const char * path_lora);
|
||||
|
||||
// Add a loaded LoRA adapter to given context
|
||||
// This will not modify model's weight
|
||||
LLAMA_API int32_t llama_lora_adapter_set(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter,
|
||||
float scale);
|
||||
|
||||
// Remove a LoRA adapter from given context
|
||||
// Return -1 if the adapter is not present in the context
|
||||
LLAMA_API int32_t llama_lora_adapter_remove(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
|
||||
|
||||
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
||||
// the currently loaded vector.
|
||||
|
|
|
@ -9,3 +9,4 @@
|
|||
-r ./requirements/requirements-convert_hf_to_gguf.txt
|
||||
-r ./requirements/requirements-convert_hf_to_gguf_update.txt
|
||||
-r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
|
||||
-r ./requirements/requirements-convert_lora_to_gguf.txt
|
||||
|
|
2
requirements/requirements-convert_lora_to_gguf.txt
Normal file
2
requirements/requirements-convert_lora_to_gguf.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert_hf_to_gguf.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
1061
src/llama.cpp
1061
src/llama.cpp
File diff suppressed because it is too large
Load diff
|
@ -79,8 +79,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||
im = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
|
||||
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
|
||||
// TODO: other cases
|
||||
//#pragma omp parallel for
|
||||
//for (int i = 0; i < tensor->ne[1]; i++) {
|
||||
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
|
||||
// i * tensor->ne[0], 1, tensor->ne[0], im);
|
||||
//}
|
||||
|
||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||
// This is going to create some weird integers though.
|
||||
|
@ -759,7 +767,7 @@ struct test_dup : public test_case {
|
|||
}
|
||||
|
||||
test_dup(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
||||
std::array<int64_t, 4> ne = {10, 10, 20, 1},
|
||||
std::array<int64_t, 4> permute = {0, 0, 0, 0})
|
||||
: type(type), ne(ne), permute(permute),
|
||||
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
||||
|
@ -779,9 +787,11 @@ struct test_cpy : public test_case {
|
|||
const ggml_type type_src;
|
||||
const ggml_type type_dst;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const std::array<int64_t, 4> permute;
|
||||
bool _src_use_permute;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type_src, type_dst, ne);
|
||||
return VARS_TO_STR4(type_src, type_dst, ne, permute);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
|
@ -793,12 +803,18 @@ struct test_cpy : public test_case {
|
|||
}
|
||||
|
||||
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 1})
|
||||
: type_src(type_src), type_dst(type_dst), ne(ne) {}
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
||||
std::array<int64_t, 4> permute = {0, 0, 0, 0},
|
||||
bool _dst_use_permute = false)
|
||||
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
|
||||
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
||||
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
|
||||
if (_src_use_permute) {
|
||||
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
|
||||
}
|
||||
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
|
||||
ggml_tensor * out = ggml_cpy(ctx, src, dst);
|
||||
return out;
|
||||
}
|
||||
|
@ -1174,6 +1190,7 @@ struct test_soft_max : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
// GGML_OP_ROPE
|
||||
struct test_rope : public test_case {
|
||||
const ggml_type type;
|
||||
|
@ -2146,12 +2163,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
||||
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||
}
|
||||
}
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2201,6 +2228,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
|
||||
}
|
||||
|
||||
#if 1
|
||||
for (ggml_type type_a : base_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
||||
|
@ -2220,6 +2248,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
|
||||
}
|
||||
}
|
||||
#else
|
||||
// m = a rows
|
||||
// n = b rows
|
||||
// k = cols
|
||||
std::uniform_int_distribution<> dist_m(1, 128);
|
||||
std::uniform_int_distribution<> dist_n(16, 128);
|
||||
std::uniform_int_distribution<> dist_k(1, 16);
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||
int m = dist_m(rng);
|
||||
int n = dist_n(rng);
|
||||
int k = dist_k(rng) * ggml_blck_size(type_a);
|
||||
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ggml_type type_a : other_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||
|
@ -2283,7 +2329,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
for (int n = 0; n < 10; ++n) {
|
||||
int64_t ne0 = dist_ne0(rng);
|
||||
int64_t ne1 = dist_ne1(rng);
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
|
||||
}
|
||||
|
||||
exponent <<= 1;
|
||||
|
@ -2302,7 +2348,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue