diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline new file mode 100644 index 000000000..f3a4944f8 --- /dev/null +++ b/.devops/cloud-v-pipeline @@ -0,0 +1,22 @@ +node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries + stage('Cleanup'){ + cleanWs() // Cleaning previous CI build in workspace + } + stage('checkout repo'){ + retry(5){ // Retry if the cloning fails due to some reason + checkout scm // Clone the repo on Runner + } + } + stage('Compiling llama.cpp'){ + sh'''#!/bin/bash + make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V + ''' + } + stage('Running llama.cpp'){ + sh'''#!/bin/bash + module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc + qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 + cat llama_log.txt # Printing results + ''' + } +} diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml index a6289e335..e61bfc6c3 100644 --- a/.github/workflows/gguf-publish.yml +++ b/.github/workflows/gguf-publish.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v2 with: diff --git a/common/common.cpp b/common/common.cpp index 6e5d5b4d5..9969cb97d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { #else fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif + } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers_draft = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { @@ -423,8 +434,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; - } else if (arg == "--mtest") { - params.mem_test = true; } else if (arg == "--numa") { params.numa = true; } else if (arg == "--export") { @@ -664,6 +673,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD printf(" -ngl N, --n-gpu-layers N\n"); printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); @@ -674,7 +685,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif // GGML_USE_CUBLAS #endif - printf(" --mtest compute maximum memory usage\n"); printf(" --export export the computation graph to 'llama.ggml'\n"); printf(" --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); @@ -1212,7 +1222,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); - fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false"); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); diff --git a/common/common.h b/common/common.h index 012bf5e13..4979f99dd 100644 --- a/common/common.h +++ b/common/common.h @@ -38,6 +38,7 @@ struct gpt_params { int32_t n_draft = 16; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
@@ -109,7 +110,6 @@ struct gpt_params { bool perplexity = false; // compute perplexity over the prompt bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool mem_test = false; // compute maximum memory usage bool numa = false; // attempt optimizations that help on some NUMA systems bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py new file mode 100755 index 000000000..5b301de27 --- /dev/null +++ b/convert-baichuan-hf-to-gguf.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +# HF baichuan --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any +import itertools +import gguf +import numpy as np +import torch +from sentencepiece import SentencePieceProcessor # type: ignore[import] + + +if TYPE_CHECKING: + from typing import TypeAlias + +NDArray: TypeAlias = 'np.ndarray[Any, Any]' + +# reverse HF permute back to original pth layout + + +def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + +def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: + r = weights.shape[0] // 3 + return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) + +def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: + r = weights.shape[0] // 3 + return weights[r * n_part : r * n_part + r, ...] 
+ +def count_model_parts(dir_model: str) -> int: + num_parts = 0 + + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + + return num_parts + + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) +print("hello print: ",hparams["architectures"][0]) +if hparams["architectures"][0] != "BaichuanForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit() + +# get number of model parts +num_parts = count_model_parts(dir_model) +print(f"num_parts:{num_parts}\n") +ARCH=gguf.MODEL_ARCH.BAICHUAN +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["num_hidden_layers"] +head_count = hparams["num_attention_heads"] + +if "num_key_value_heads" in hparams: + head_count_kv = hparams["num_key_value_heads"] +else: + head_count_kv = head_count + +if "_name_or_path" in hparams: + hf_repo = hparams["_name_or_path"] +else: + hf_repo = "" + +if "max_sequence_length" in hparams: + ctx_length = hparams["max_sequence_length"] +elif "max_position_embeddings" in hparams: + ctx_length = hparams["max_position_embeddings"] +elif "model_max_length" in hparams: + ctx_length = hparams["model_max_length"] +else: + print("gguf: can not find ctx length parameter.") + + sys.exit() + + +gguf_writer.add_name(dir_model.name) +gguf_writer.add_source_hf_repo(hf_repo) +gguf_writer.add_tensor_data_layout("Meta AI original pth") +gguf_writer.add_context_length(ctx_length) +gguf_writer.add_embedding_length(hparams["hidden_size"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) +gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) +gguf_writer.add_head_count(head_count) +gguf_writer.add_head_count_kv(head_count_kv) +gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + +if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: + if "type" in hparams["rope_scaling"]: + if hparams["rope_scaling"]["type"] == "linear": + gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) + + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + 
+tokens: list[bytes] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +tokenizer_model_file = dir_model / 'tokenizer.model' +if not tokenizer_model_file.is_file(): + print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) + sys.exit(1) + +# vocab type sentencepiece +print("gguf: get sentencepiece tokenizer vocab, scores and token types") + +tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) + +for i in range(tokenizer.vocab_size()): + text: bytes + score: float + + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) + + toktype = 1 # defualt to normal token type + if tokenizer.is_unknown(i): + toktype = 2 + if tokenizer.is_control(i): + toktype = 3 + + # toktype = 4 is user-defined = tokens from added_tokens.json + + if tokenizer.is_unused(i): + toktype = 5 + if tokenizer.is_byte(i): + toktype = 6 + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + +added_tokens_file = dir_model / 'added_tokens.json' +if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + addtokens_json = json.load(f) + + print("gguf: get added tokens") + + for key in addtokens_json: + tokens.append( key.encode("utf-8") ) + scores.append(-1000.0) + toktypes.append(4) # user-defined token type + + +gguf_writer.add_tokenizer_model("llama") +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + + tmp=model_part + for i in range(block_count): + if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: + print(f"Unpacking and permuting layer {i}") + tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) + tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) + del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] + + for name in model_part.keys(): + data = model_part[name] + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 6ed2b88c6..5d4ad04a4 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f: print("gguf: get gpt2 tokenizer vocab") -vocab_size = len(tokenizer_json["model"]["vocab"]) +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"]) # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py tokenizer = AutoTokenizer.from_pretrained(dir_model) diff --git a/examples/main-cmake-pkg/.gitignore b/examples/main-cmake-pkg/.gitignore new file mode 100644 index 000000000..e32c11c7f --- /dev/null +++ b/examples/main-cmake-pkg/.gitignore @@ -0,0 +1,51 @@ +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +*.gguf + +*.log +.DS_Store +.build/ +.cache/ +.direnv/ +.envrc +.swiftpm +.venv +.clang-tidy +.vs/ +.vscode/ + +build*/ +out/ +tmp/ + diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt new file mode 100644 index 000000000..473738719 --- /dev/null +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.12) +project("main-cmake-pkg" C CXX) +set(TARGET main-cmake-pkg) + +find_package(Llama 0.0.1 REQUIRED) + +# Bake common functionality in with target. Because applications +# using the relocatable Llama package should be outside of the +# source tree, main-cmake-pkg pretends the dependencies are built-in. + +set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") +add_library(common OBJECT + ${_common_path}/common.h + ${_common_path}/common.cpp + ${_common_path}/console.h + ${_common_path}/console.cpp + ${_common_path}/grammar-parser.h + ${_common_path}/grammar-parser.cpp + ) + +# WARNING: because build-info.h is auto-generated, it will only +# be available after the user has built the llama.cpp sources. 
+# +configure_file(${_common_path}/../build-info.h + ${CMAKE_CURRENT_BINARY_DIR}/build-info.h + COPYONLY) + +target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}) + +add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) +target_include_directories(${TARGET} PRIVATE ${_common_path}) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md new file mode 100644 index 000000000..6d665f28f --- /dev/null +++ b/examples/main-cmake-pkg/README.md @@ -0,0 +1,37 @@ +# llama.cpp/example/main-cmake-pkg + +This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. + +## Building + +Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions. + +### Considerations + +When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. + +### Build llama.cpp and install to C:\LlamaCPP directory + +In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`. + +```cmd +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp +mkdir build +cd build +cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 +cmake --build . --config Release +cmake --install . --prefix C:/LlamaCPP +``` + +### Build main-cmake-pkg + + +```cmd +cd ..\examples\main-cmake-pkg +mkdir build +cd build +cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 +cmake --build . --config Release +cmake --install . --prefix C:/MyLlamaApp +``` diff --git a/examples/main/README.md b/examples/main/README.md index 2773fe976..26e1e28dd 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. -- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. +- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. ### Keep Prompt @@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models. ### NUMA support -- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. 
This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root. +- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. ### Memory Float 32 @@ -302,7 +302,6 @@ These options provide extra functionality and customization when running the LLa - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `--verbose-prompt`: Print the prompt before generating text. -- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly. - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. 
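To make the `--tensor-split` example above concrete, the comma-separated proportions simply normalize into per-GPU fractions of the data. A small illustrative sketch of that arithmetic, not the library's actual splitting code (the variable names are assumptions):

```python
# "3,2" from the example above maps to these per-GPU fractions.
split = [3.0, 2.0]
total = sum(split)
fractions = [s / total for s in split]
print(fractions)  # [0.6, 0.4] -> 60% of the data on GPU 0, 40% on GPU 1
```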
diff --git a/examples/main/main.cpp b/examples/main/main.cpp index baec6ba12..a8179f1bf 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -198,23 +198,6 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters - // uncomment the "used_mem" line in llama.cpp to see the results - if (params.mem_test) { - { - LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); - - const std::vector tmp(params.n_batch, llama_token_bos(ctx)); - llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); - } - - llama_print_timings(ctx); - llama_free(ctx); - llama_free_model(model); - - return 0; - } - // export the cgraph and exit if (params.export_cgraph) { llama_eval_export(ctx, "llama.ggml"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 822d7b529..aa904183f 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -42,6 +42,7 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; + params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); // tokenize the prompt @@ -81,7 +82,7 @@ int main(int argc, char ** argv) { //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft)); // how many tokens to draft each time - const int n_draft = params.n_draft; + int n_draft = params.n_draft; int n_predict = 0; int n_drafted = 0; @@ -130,6 +131,7 @@ int main(int argc, char ** argv) { LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); int i_dft = 0; + while (true) { // sample from the target model const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); @@ -173,6 +175,27 @@ int main(int argc, char ** argv) { llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); ++n_past_dft; + // heuristic for n_draft + { + const int n_draft_cur = (int) drafted.size(); + const bool all_accepted = i_dft == n_draft_cur; + + LOG("n_draft = %d\n", n_draft); + LOG("n_draft_cur = %d\n", n_draft_cur); + LOG("i_dft = %d\n", i_dft); + LOG("all_accepted = %d\n", all_accepted); + + if (all_accepted && n_draft == n_draft_cur) { + LOG(" - max drafted tokens accepted - n_draft += 8\n"); + n_draft = std::min(30, n_draft + 8); + } else if (all_accepted) { + LOG(" - partially drafted tokens accepted - no change\n"); + } else { + LOG(" - drafted token rejected - n_draft -= 1\n"); + n_draft = std::max(2, n_draft - 1); + } + } + drafted.clear(); drafted.push_back(id); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3c3e42b0a..aa179487e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -13,7 +13,7 @@ #ifdef __HIP_PLATFORM_AMD__ // for rocblas_initialize() #include "rocblas/rocblas.h" -#endif +#endif // __HIP_PLATFORM_AMD__ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT @@ -68,19 +68,29 @@ #include #include #include -#endif +#endif // defined(GGML_USE_HIPBLAS) #include "ggml-cuda.h" #include "ggml.h" -#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#ifndef CC_TURING -#define CC_TURING 700 -#endif +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define 
CC_TURING 700 +#define CC_OFFSET_AMD 1000000 +#define CC_RDNA2 CC_OFFSET_AMD + 1030 #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + #ifndef __has_builtin #define __has_builtin(x) 0 #endif @@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #endif return c; } -#endif +#endif // defined(GGML_USE_HIPBLAS) #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -3472,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q( } } +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 #define MMQ_X_Q4_0_AMPERE 64 #define MMQ_Y_Q4_0_AMPERE 128 #define NWARPS_Q4_0_AMPERE 4 @@ -3479,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q( #define MMQ_Y_Q4_0_PASCAL 64 #define NWARPS_Q4_0_PASCAL 8 -template static __global__ void mul_mat_q4_0( +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_0_RDNA2; + const int mmq_y = MMQ_Y_Q4_0_RDNA2; + const int nwarps = NWARPS_Q4_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_0_RDNA1; + const int mmq_y = MMQ_Y_Q4_0_RDNA1; + const int nwarps = NWARPS_Q4_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q4_0_AMPERE; const int mmq_y = MMQ_Y_Q4_0_AMPERE; const int nwarps = NWARPS_Q4_0_AMPERE; @@ -3506,6 +3543,12 @@ template static __global__ void mul_mat_q4_0( #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 #define MMQ_X_Q4_1_AMPERE 64 #define MMQ_Y_Q4_1_AMPERE 128 #define NWARPS_Q4_1_AMPERE 4 @@ -3514,14 +3557,33 @@ template static __global__ void mul_mat_q4_0( #define NWARPS_Q4_1_PASCAL 8 template static __global__ void -#if __CUDA_ARCH__ < CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_TURING __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int 
nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_1_RDNA2; + const int mmq_y = MMQ_Y_Q4_1_RDNA2; + const int nwarps = NWARPS_Q4_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_1_RDNA1; + const int mmq_y = MMQ_Y_Q4_1_RDNA1; + const int nwarps = NWARPS_Q4_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q4_1_AMPERE; const int mmq_y = MMQ_Y_Q4_1_AMPERE; const int nwarps = NWARPS_Q4_1_AMPERE; @@ -3544,6 +3606,12 @@ template static __global__ void #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 #define MMQ_X_Q5_0_AMPERE 128 #define MMQ_Y_Q5_0_AMPERE 64 #define NWARPS_Q5_0_AMPERE 4 @@ -3551,11 +3619,32 @@ template static __global__ void #define MMQ_Y_Q5_0_PASCAL 64 #define NWARPS_Q5_0_PASCAL 8 -template static __global__ void mul_mat_q5_0( +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_0_RDNA2; + const int mmq_y = MMQ_Y_Q5_0_RDNA2; + const int nwarps = NWARPS_Q5_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_0_RDNA1; + const int mmq_y = MMQ_Y_Q5_0_RDNA1; + const int nwarps = NWARPS_Q5_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q5_0_AMPERE; const int mmq_y = MMQ_Y_Q5_0_AMPERE; const int nwarps = NWARPS_Q5_0_AMPERE; @@ -3578,6 +3667,12 @@ template static __global__ void mul_mat_q5_0( #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 #define MMQ_X_Q5_1_AMPERE 128 #define MMQ_Y_Q5_1_AMPERE 64 #define NWARPS_Q5_1_AMPERE 4 @@ -3585,11 +3680,32 @@ template static __global__ void mul_mat_q5_0( #define MMQ_Y_Q5_1_PASCAL 64 #define NWARPS_Q5_1_PASCAL 8 -template static __global__ void mul_mat_q5_1( +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) 
{ -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_1_RDNA2; + const int mmq_y = MMQ_Y_Q5_1_RDNA2; + const int nwarps = NWARPS_Q5_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_1_RDNA1; + const int mmq_y = MMQ_Y_Q5_1_RDNA1; + const int nwarps = NWARPS_Q5_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q5_1_AMPERE; const int mmq_y = MMQ_Y_Q5_1_AMPERE; const int nwarps = NWARPS_Q5_1_AMPERE; @@ -3612,6 +3728,12 @@ template static __global__ void mul_mat_q5_1( #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 #define MMQ_X_Q8_0_AMPERE 128 #define MMQ_Y_Q8_0_AMPERE 64 #define NWARPS_Q8_0_AMPERE 4 @@ -3619,11 +3741,32 @@ template static __global__ void mul_mat_q5_1( #define MMQ_Y_Q8_0_PASCAL 64 #define NWARPS_Q8_0_PASCAL 8 -template static __global__ void mul_mat_q8_0( +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q8_0_RDNA2; + const int mmq_y = MMQ_Y_Q8_0_RDNA2; + const int nwarps = NWARPS_Q8_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q8_0_RDNA1; + const int mmq_y = MMQ_Y_Q8_0_RDNA1; + const int nwarps = NWARPS_Q8_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q8_0_AMPERE; const int mmq_y = MMQ_Y_Q8_0_AMPERE; const int nwarps = NWARPS_Q8_0_AMPERE; @@ -3646,6 +3789,12 @@ template static __global__ void mul_mat_q8_0( #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 #define MMQ_X_Q2_K_AMPERE 64 #define MMQ_Y_Q2_K_AMPERE 128 #define NWARPS_Q2_K_AMPERE 4 @@ -3653,11 +3802,32 @@ template static __global__ void mul_mat_q8_0( #define MMQ_Y_Q2_K_PASCAL 64 #define NWARPS_Q2_K_PASCAL 8 -template static __global__ void mul_mat_q2_K( +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if 
defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q2_K_RDNA2; + const int mmq_y = MMQ_Y_Q2_K_RDNA2; + const int nwarps = NWARPS_Q2_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q2_K_RDNA1; + const int mmq_y = MMQ_Y_Q2_K_RDNA1; + const int nwarps = NWARPS_Q2_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q2_K_AMPERE; const int mmq_y = MMQ_Y_Q2_K_AMPERE; const int nwarps = NWARPS_Q2_K_AMPERE; @@ -3680,6 +3850,12 @@ template static __global__ void mul_mat_q2_K( #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 #define MMQ_X_Q3_K_AMPERE 128 #define MMQ_Y_Q3_K_AMPERE 128 #define NWARPS_Q3_K_AMPERE 4 @@ -3688,14 +3864,33 @@ template static __global__ void mul_mat_q2_K( #define NWARPS_Q3_K_PASCAL 8 template static __global__ void -#if __CUDA_ARCH__ < CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_TURING __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q3_K_RDNA2; + const int mmq_y = MMQ_Y_Q3_K_RDNA2; + const int nwarps = NWARPS_Q3_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q3_K_RDNA1; + const int mmq_y = MMQ_Y_Q3_K_RDNA1; + const int nwarps = NWARPS_Q3_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q3_K_AMPERE; const int mmq_y = MMQ_Y_Q3_K_AMPERE; const int nwarps = NWARPS_Q3_K_AMPERE; @@ -3718,6 +3913,12 @@ template static __global__ void #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 #define MMQ_X_Q4_K_AMPERE 64 #define MMQ_Y_Q4_K_AMPERE 128 #define NWARPS_Q4_K_AMPERE 4 @@ -3726,14 +3927,33 @@ template static __global__ void #define NWARPS_Q4_K_PASCAL 8 template static __global__ void -#if __CUDA_ARCH__ < CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_TURING __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) 
+#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_K_RDNA2; + const int mmq_y = MMQ_Y_Q4_K_RDNA2; + const int nwarps = NWARPS_Q4_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_K_RDNA1; + const int mmq_y = MMQ_Y_Q4_K_RDNA1; + const int nwarps = NWARPS_Q4_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q4_K_AMPERE; const int mmq_y = MMQ_Y_Q4_K_AMPERE; const int nwarps = NWARPS_Q4_K_AMPERE; @@ -3756,6 +3976,12 @@ template static __global__ void #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 #define MMQ_X_Q5_K_AMPERE 64 #define MMQ_Y_Q5_K_AMPERE 128 #define NWARPS_Q5_K_AMPERE 4 @@ -3763,11 +3989,32 @@ template static __global__ void #define MMQ_Y_Q5_K_PASCAL 64 #define NWARPS_Q5_K_PASCAL 8 -template static __global__ void mul_mat_q5_K( +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_K_RDNA2; + const int mmq_y = MMQ_Y_Q5_K_RDNA2; + const int nwarps = NWARPS_Q5_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_K_RDNA1; + const int mmq_y = MMQ_Y_Q5_K_RDNA1; + const int nwarps = NWARPS_Q5_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q5_K_AMPERE; const int mmq_y = MMQ_Y_Q5_K_AMPERE; const int nwarps = NWARPS_Q5_K_AMPERE; @@ -3790,6 +4037,12 @@ template static __global__ void mul_mat_q5_K( #endif // __CUDA_ARCH__ >= CC_TURING } +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 #define MMQ_X_Q6_K_AMPERE 64 #define MMQ_Y_Q6_K_AMPERE 64 #define NWARPS_Q6_K_AMPERE 4 @@ -3798,14 +4051,33 @@ template static __global__ void mul_mat_q5_K( #define NWARPS_Q6_K_PASCAL 8 template static __global__ void -#if __CUDA_ARCH__ < CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_TURING __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if __CUDA_ARCH__ >= CC_TURING +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = 
MMQ_X_Q6_K_RDNA2; + const int mmq_y = MMQ_Y_Q6_K_RDNA2; + const int nwarps = NWARPS_Q6_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q6_K_RDNA1; + const int mmq_y = MMQ_Y_Q6_K_RDNA1; + const int nwarps = NWARPS_Q6_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q6_K_AMPERE; const int mmq_y = MMQ_Y_Q6_K_AMPERE; const int nwarps = NWARPS_Q6_K_AMPERE; @@ -4588,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q4_0_AMPERE; mmq_y = MMQ_Y_Q4_0_AMPERE; nwarps = NWARPS_Q4_0_AMPERE; @@ -4625,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q4_1_AMPERE; mmq_y = MMQ_Y_Q4_1_AMPERE; nwarps = NWARPS_Q4_1_AMPERE; @@ -4662,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q5_0_AMPERE; mmq_y = MMQ_Y_Q5_0_AMPERE; nwarps = NWARPS_Q5_0_AMPERE; @@ -4699,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q5_1_AMPERE; mmq_y = MMQ_Y_Q5_1_AMPERE; nwarps = NWARPS_Q5_1_AMPERE; @@ -4736,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q8_0_AMPERE; mmq_y = MMQ_Y_Q8_0_AMPERE; nwarps = NWARPS_Q8_0_AMPERE; @@ -4773,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const int 
compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q2_K_AMPERE; mmq_y = MMQ_Y_Q2_K_AMPERE; nwarps = NWARPS_Q2_K_AMPERE; @@ -4812,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q3_K_AMPERE; mmq_y = MMQ_Y_Q3_K_AMPERE; nwarps = NWARPS_Q3_K_AMPERE; @@ -4850,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q4_K_AMPERE; mmq_y = MMQ_Y_Q4_K_AMPERE; nwarps = NWARPS_Q4_K_AMPERE; @@ -4887,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q5_K_AMPERE; mmq_y = MMQ_Y_Q5_K_AMPERE; nwarps = NWARPS_Q5_K_AMPERE; @@ -4924,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const int compute_capability = g_compute_capabilities[id]; int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_TURING) { + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= CC_TURING) { mmq_x = MMQ_X_Q6_K_AMPERE; mmq_y = MMQ_Y_Q6_K_AMPERE; nwarps = NWARPS_Q6_K_AMPERE; @@ -5161,8 +5513,11 @@ void ggml_init_cublas() { g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; - +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; +#else g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } for (int64_t id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; @@ -5243,7 +5598,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( if (src->backend == GGML_BACKEND_CPU) { kind = cudaMemcpyHostToDevice; src_ptr = (char *) src->data; - } else if (src->backend == GGML_BACKEND_GPU) { + 
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -5285,9 +5641,7 @@ inline void ggml_cuda_op_add( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -5448,14 +5802,41 @@ inline void ggml_cuda_op_mul_mat_q( } static int64_t get_row_rounding(ggml_type type) { - int max_compute_capability = INT_MIN; - for (int id = 0; id < g_device_count; ++id) { - if (max_compute_capability < g_compute_capabilities[id] - && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { - max_compute_capability = g_compute_capabilities[id]; + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int64_t id = 0; id < g_device_count; ++id) { + if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_compute_capabilities[id]) { + min_compute_capability = g_compute_capabilities[id]; + } + if (max_compute_capability < g_compute_capabilities[id]) { + max_compute_capability = g_compute_capabilities[id]; + } } } +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_ASSERT(false); + } +#else switch(type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -5476,6 +5857,7 @@ static int64_t get_row_rounding(ggml_type type) { default: GGML_ASSERT(false); } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } inline void ggml_cuda_op_mul_mat_vec_q( @@ -5627,10 +6009,15 @@ inline void ggml_cuda_op_mul_mat_cublas( const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); - size_t src0_as; - float * src0_ddf_i = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); - to_fp32_cuda(src0_dd_i, src0_ddf_i, row_diff*ne00, stream); + float * src0_ddq_as_f32; + size_t src0_as = 0; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? 
(const float *) src0_dd_i : src0_ddq_as_f32; int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5647,10 +6034,11 @@ inline void ggml_cuda_op_mul_mat_cublas( src1_ddf_i, ne10, &beta, dst_dd_i, ldc)); - ggml_cuda_pool_free(src0_ddf_i, src0_as); + if (src0_as > 0) { + ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + } (void) dst; - (void) src0_dd_i; (void) src1_ddq_i; (void) src1_padded_row_size; } @@ -5789,7 +6177,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT( src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); @@ -5797,7 +6184,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; diff --git a/ggml-metal.m b/ggml-metal.m index 039448e30..9f58926f2 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -66,6 +66,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(soft_max_4); GGML_METAL_DECL_KERNEL(diag_mask_inf); GGML_METAL_DECL_KERNEL(diag_mask_inf_8); + GGML_METAL_DECL_KERNEL(get_rows_f32); GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_1); @@ -145,7 +146,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->n_buffers = 0; ctx->concur_list_len = 0; - ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); + ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); #ifdef GGML_SWIFT // load the default.metallib file @@ -175,7 +176,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; @@ -224,6 +225,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(soft_max_4); GGML_METAL_ADD_KERNEL(diag_mask_inf); GGML_METAL_ADD_KERNEL(diag_mask_inf_8); + GGML_METAL_ADD_KERNEL(get_rows_f32); GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_1); @@ -293,7 +295,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(gelu); GGML_METAL_DEL_KERNEL(soft_max); GGML_METAL_DEL_KERNEL(soft_max_4); + GGML_METAL_DEL_KERNEL(diag_mask_inf); GGML_METAL_DEL_KERNEL(diag_mask_inf_8); + GGML_METAL_DEL_KERNEL(get_rows_f32); GGML_METAL_DEL_KERNEL(get_rows_f16); GGML_METAL_DEL_KERNEL(get_rows_q4_0); GGML_METAL_DEL_KERNEL(get_rows_q4_1); @@ -386,6 +390,7 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru for (int i = 0; 
i < ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; + //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name); if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; @@ -723,6 +728,7 @@ void ggml_metal_graph_compute( case GGML_OP_ADD: { GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); // utilize float4 GGML_ASSERT(ne00 % 4 == 0); @@ -730,6 +736,7 @@ void ggml_metal_graph_compute( if (ggml_nelements(src1) == ne10) { // src1 is a row + GGML_ASSERT(ne11 == 1); [encoder setComputePipelineState:ctx->pipeline_add_row]; } else { [encoder setComputePipelineState:ctx->pipeline_add]; @@ -746,6 +753,7 @@ void ggml_metal_graph_compute( case GGML_OP_MUL: { GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); // utilize float4 GGML_ASSERT(ne00 % 4 == 0); @@ -753,6 +761,7 @@ void ggml_metal_graph_compute( if (ggml_nelements(src1) == ne10) { // src1 is a row + GGML_ASSERT(ne11 == 1); [encoder setComputePipelineState:ctx->pipeline_mul_row]; } else { [encoder setComputePipelineState:ctx->pipeline_mul]; @@ -768,6 +777,8 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SCALE: { + GGML_ASSERT(ggml_is_contiguous(src0)); + const float scale = *(const float *) src1->data; [encoder setComputePipelineState:ctx->pipeline_scale]; @@ -867,8 +878,8 @@ void ggml_metal_graph_compute( // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && + if (!ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && [ctx->device supportsFamily:MTLGPUFamilyApple7] && ne00%32 == 0 && @@ -893,9 +904,12 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { @@ -1045,6 +1059,7 @@ void ggml_metal_graph_compute( case GGML_OP_GET_ROWS: { switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; @@ -1060,9 +1075,9 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder 
setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5]; const int64_t n = ggml_nelements(src1); diff --git a/ggml-metal.metal b/ggml-metal.metal index f45b1490f..ea8b42844 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -38,7 +38,7 @@ kernel void kernel_add_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig % nb]; } @@ -1321,7 +1321,6 @@ kernel void kernel_mul_mat_q3_K_f32( dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; } } - } #else kernel void kernel_mul_mat_q3_K_f32( @@ -1865,6 +1864,15 @@ kernel void kernel_mul_mat_q6_K_f32( //============================= templates and their specializations ============================= +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + float4x4 temp = *(((device float4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + template void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { half4x4 temp = *(((device half4x4 *)src)); @@ -1875,7 +1883,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) template void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); const float d1 = il ? (xb->d / 16.h) : xb->d; const float d2 = d1 / 256.f; @@ -1887,12 +1894,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; } - } template void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); const float d1 = il ? (xb->d / 16.h) : xb->d; const float d2 = d1 / 256.f; @@ -1964,7 +1969,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); } - #else float kcoef = il&1 ? 1.f/16.f : 1.f; uint16_t kmask = il&1 ? 
0xF0 : 0x0F; @@ -2110,22 +2114,25 @@ kernel void kernel_get_rows( // each block_q contains 16*nl weights template kernel void kernel_mul_mm(device const uchar * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar * shared_memory [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup half * sa = ((threadgroup half *)shared_memory); + threadgroup half * sa = (threadgroup half *)(shared_memory); threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); const uint r0 = tgpig.y; @@ -2138,7 +2145,7 @@ kernel void kernel_mul_mm(device const uchar * src0, short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; - simdgroup_half8x8 ma[4]; + simdgroup_half8x8 ma[4]; simdgroup_float8x8 mb[2]; simdgroup_float8x8 c_res[8]; for (int i = 0; i < 8; i++){ @@ -2146,10 +2153,15 @@ kernel void kernel_mul_mm(device const uchar * src0, } short il = (tiitg % THREAD_PER_ROW); - uint offset0 = im/gqa*nb02; ushort offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; - device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \ - + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1; + + uint offset0 = im/gqa*nb02; + ushort offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = (device const float *)(src1 + + nb12 * im + + nb11 * (r1 * BLOCK_SIZE_N + thread_col) + + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { //load data and store to threadgroup memory @@ -2229,6 +2241,7 @@ kernel void kernel_mul_mm(device const uchar * src0, typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ constant uint64_t &, constant uint64_t &, uint, uint, uint); +template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; @@ -2239,14 +2252,27 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t 
kernel_get_rows; -typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\ - constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \ - constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint); +typedef void (mat_mm_t)( + device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar *, uint3, uint, uint); -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; diff --git a/ggml.c b/ggml.c index 9ed9f678d..bd62c45fd 100644 --- a/ggml.c +++ b/ggml.c @@ -283,7 +283,7 @@ typedef double ggml_float; // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t -#ifdef __ARM_NEON +#if defined(__ARM_NEON) && !defined(_MSC_VER) // if YCM cannot find , make a symbolic link to it, for example: // @@ -4304,10 +4304,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) { - size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type); - for (int i = 1; i < GGML_MAX_DIMS; ++i) { - nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + size_t nbytes; + size_t blck_size = ggml_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_type_size(tensor->type); + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + return nbytes; } @@ -18340,7 +18351,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], - ggml_op_name(node->op)); + ggml_op_name(node->op), + ggml_get_name(node)); } for (int i = 0; i < GGML_OP_COUNT; i++) { diff --git a/ggml.h b/ggml.h index c936823d6..6d4cf465d 100644 --- a/ggml.h +++ b/ggml.h @@ -270,7 +270,7 @@ extern "C" { #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && !defined(_MSC_VER) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index d377cd56d..7f7204ea1 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -79,6 +79,7 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" class MODEL_ARCH(IntEnum): LLAMA : int 
= auto() FALCON : int = auto() + BAICHUAN:int = auto() GPT2 : int = auto() GPTJ : int = auto() GPTNEOX: int = auto() @@ -108,6 +109,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN:"baichuan", MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTNEOX: "gptneox", @@ -153,6 +155,22 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, + MODEL_ARCH.BAICHUAN: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, MODEL_ARCH.GPT2: { # TODO }, @@ -165,6 +183,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.BAICHUAN: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], } @@ -187,7 +209,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf + "lm_head", # gpt2 mpt falcon llama-hf baichuan "output", # llama-pth ), @@ -195,7 +217,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 falcon - "model.norm", # llama-hf + "model.norm", # llama-hf baichuan "norm", # llama-pth ), @@ -311,6 +333,7 @@ class TensorNameMap: tensor_name = tensor_names.get(tensor) if tensor_name is None: continue + mapping[tensor_name] = (tensor, tensor_name) for key in keys: mapping[key] = (tensor, tensor_name) for bid in range(n_blocks): @@ -319,11 +342,12 @@ class TensorNameMap: if tensor_name is None: continue tensor_name = tensor_name.format(bid = bid) + mapping[tensor_name] = (tensor, tensor_name) for key in keys: key = key.format(bid = bid) mapping[key] = (tensor, tensor_name) - def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None: + def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result @@ -334,13 +358,13 @@ class TensorNameMap: return (result[0], result[1] + suffix) return None - def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None: + def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[1] - def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None: + def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 8da60de1b..9489ccd6f 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.3.2" +version = "0.3.3" 
description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/k_quants.c b/k_quants.c index eb702ce86..62085882d 100644 --- a/k_quants.c +++ b/k_quants.c @@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memcpy(utmp, x[i].scales, 12); - const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)}; + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[0] &= kmask1; diff --git a/llama.cpp b/llama.cpp index db47c5101..d42f591f1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -156,6 +156,7 @@ static std::string format(const char * fmt, ...) { enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_FALCON, + LLM_ARCH_BAICHUAN, LLM_ARCH_GPT2, LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, @@ -170,6 +171,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN,"baichuan" }, }; enum llm_kv { @@ -310,6 +312,25 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_BAICHUAN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_FALCON, { @@ -1689,6 +1710,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_BAICHUAN: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; }; @@ -1929,7 +1959,6 @@ static void llm_load_tensors( const int64_t n_vocab = hparams.n_vocab; const auto tn = LLM_TN(model.arch); - switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -1972,6 +2001,72 @@ static void llm_load_tensors( model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } break; + case LLM_ARCH_BAICHUAN: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT @@ -2552,6 +2647,367 @@ static struct ggml_cgraph * llm_build_llama( return gf; } + +static struct ggml_cgraph * llm_build_baichaun( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int n_gpu_layers = model.n_gpu_layers; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } + } + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); + + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; + } +#endif // 
GGML_USE_CUBLAS + + struct ggml_tensor * inpSA = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_0"); + + // cur = cur*attn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + offload_func(cur); + ggml_set_name(cur, "attention_norm_0"); + } + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + offload_func_kq(tmpk); + ggml_set_name(tmpk, "tmpk"); + + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + + struct ggml_tensor * Kcur; + struct ggml_tensor * Qcur; + switch (model.type) { + case MODEL_7B: + Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + break; + case MODEL_13B: + Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); + Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N); + break; + default: + GGML_ASSERT(false); + } + + offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); + + offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + + struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + offload_func_v(tmpv); + ggml_set_name(tmpv, "tmpv"); + + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + offload_func_v(Vcur); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + offload_func_kq(k); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); + ggml_set_name(v, "v"); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + struct ggml_tensor * KQ_masked; + struct ggml_tensor * KQ_scaled_alibi; + + switch (model.type) { + case MODEL_7B: + KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + break; + case MODEL_13B: + KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); + ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + break; + default: + GGML_ASSERT(false); + } + // KQ_masked = mask_past(KQ_scaled) + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + // offload_func_kq(KQ_masked); + // ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + +#if 1 + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way? 
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); +#endif + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].wo, + cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + + // feed-forward network + { + // norm + { + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_1"); + + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + ggml_set_name(cur, "ffn_norm"); + } + + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model.layers[il].w3, + cur); + offload_func(tmp); + ggml_set_name(tmp, "result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w1, + cur); + offload_func(cur); + ggml_set_name(cur, "result_w1"); + + // SILU activation + cur = ggml_silu(ctx0, cur); + offload_func(cur); + ggml_set_name(cur, "silu"); + + cur = ggml_mul(ctx0, cur, tmp); + offload_func(cur); + ggml_set_name(cur, "silu_x_result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w2, + cur); + offload_func(cur); + ggml_set_name(cur, "result_w2"); + } + + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); + offload_func_nr(cur); + ggml_set_name(cur, "rms_norm_2"); + + // cur = cur*norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.output_norm); + // offload_func_nr(cur); // TODO CPU + GPU mirrored backend + ggml_set_name(cur, "result_norm"); + } + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, const llama_token * tokens, @@ -2874,6 +3330,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past); } break; + case LLM_ARCH_BAICHUAN: + { + result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past); + } break; case LLM_ARCH_FALCON: { result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); @@ -2979,10 +3439,6 @@ static bool llama_eval_internal( if (lctx.ctx_metal) { ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, gf); - ggml_metal_get_tensor (lctx.ctx_metal, res); - if (!lctx.embedding.empty()) { - ggml_metal_get_tensor(lctx.ctx_metal, embeddings); - } } else { ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } @@ -3131,10 +3587,9 @@ struct llm_tokenizer_spm { while (offs < text.size()) { llm_symbol sym; size_t len = utf8_len(text[offs]); - GGML_ASSERT(offs + len <= text.size()); sym.text = text.c_str() + offs; - sym.n = len; - offs += len; + sym.n = std::min(len, text.size() - offs); + offs += sym.n; sym.prev = index - 1; 
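+            // note: sym.n is clamped with std::min above, so a truncated UTF-8
+            // sequence at the end of the input can no longer read past the buffer
+            // (this replaces the previous GGML_ASSERT on offs + len)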
sym.next = offs == text.size() ? -1 : index + 1; index++; @@ -4653,7 +5108,16 @@ void llama_beam_search(llama_context * ctx, // quantization // -static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector & output, const size_t nelements, const int nthread) { +template +struct no_init { + T value; + no_init() { /* do nothing */ } +}; + +static void llama_convert_tensor_internal( + struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, + const size_t nelements, const int nthread +) { if (output.size() < nelements) { output.resize(nelements); } @@ -4688,7 +5152,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect auto blocks_per_thread = nblocks / nthread; auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count - std::vector workers; for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread auto thr_elems = thr_blocks * block_size; // number of elements for this thread @@ -4701,15 +5164,124 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect qtype.to_float(inbuf, outbuf, nels); } }; - workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); in_buff_offs += thr_block_bytes; out_buff_offs += thr_elems; } - for (auto & worker : workers) { - worker.join(); - } + for (auto & w : workers) { w.join(); } + workers.clear(); } +#ifdef GGML_USE_K_QUANTS +static ggml_type get_k_quant_type( + ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv, + int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2 +) { + const std::string name = ggml_get_name(tensor); + // TODO: avoid hardcoded tensor names - use the TN_* constants + const auto tn = LLM_TN(model.arch); + + auto use_more_bits = [](int i_layer, int num_layers) -> bool { + return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; + }; + + if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { + int nx = tensor->ne[0]; + if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (new_type != GGML_TYPE_Q8_0) { + new_type = GGML_TYPE_Q6_K; + } + } else if (name.find("attn_v.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && + use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && + (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if (model.type == MODEL_70B) { + // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is + // 8x smaller compared to attn_q.weight. 
Hence, we can get a nice boost in quantization accuracy with + // nearly negligible increase in model size by quantizing this tensor with more bits: + if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; + } + ++*i_attention_wv; + } else if (name.find("ffn_down.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (model.arch == LLM_ARCH_FALCON) { + new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) { + new_type = GGML_TYPE_Q5_K; + } + ++*i_feed_forward_w2; + } else if (name.find("attn_output.weight") != std::string::npos) { + if (model.arch != LLM_ARCH_FALCON) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + } + } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } + else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + } + // This can be used to reduce the size of the Q5_K_S model. + // The associated PPL increase is fully in line with the size reduction + //else { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; + //} + bool convert_incompatible_tensor = false; + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + int nx = tensor->ne[0]; + int ny = tensor->ne[1]; + if (nx % QK_K != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); + convert_incompatible_tensor = true; + } + } + if (convert_incompatible_tensor) { + if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { + new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); + } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. 
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); + } else { + throw std::runtime_error("Unsupported tensor size encountered\n"); + } + } + + return new_type; +} +#endif + static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type quantized_type; llama_ftype ftype = params->ftype; @@ -4793,18 +5365,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector hist_all(1 << 4, 0); std::vector workers; + workers.reserve(nthread); std::mutex mutex; -#ifdef GGML_USE_K_QUANTS - auto use_more_bits = [] (int i_layer, int num_layers) -> bool { - return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; - }; -#endif - int idx = 0; - std::vector read_data; - std::vector work; + std::vector> read_data; + std::vector> work; + std::vector> f32_conv_buf; // populate the original tensors so we get an initial meta data for (int i = 0; i < ml->n_tensors; ++i) { @@ -4826,7 +5394,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(tensor); - read_data.resize(ggml_nbytes(tensor)); + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } tensor->data = read_data.data(); ml->load_data_for(tensor); @@ -4851,101 +5421,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS - // TODO: avoid hardcoded tensor names - use the TN_* constants - const auto tn = LLM_TN(ml->get_arch()); - - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { - int nx = tensor->ne[0]; - if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { - new_type = GGML_TYPE_Q8_0; - } - else if (new_type != GGML_TYPE_Q8_0) { - new_type = GGML_TYPE_Q6_K; - } - } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; - if (model.type == MODEL_70B) { - // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is - // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with - // nearly negligible increase in model size by quantizing this tensor with more bits: - if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; - } - ++i_attention_wv; - } else if (name.find("ffn_down.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K - : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? 
GGML_TYPE_Q4_K - : GGML_TYPE_Q3_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (model.arch == LLM_ARCH_FALCON) { - new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : - use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - } else { - if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - } - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) { - new_type = GGML_TYPE_Q5_K; - } - ++i_feed_forward_w2; - } else if (name.find("attn_output.weight") != std::string::npos) { - if (model.arch != LLM_ARCH_FALCON) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; - } - } - else if (name.find("attn_qkv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; - } - else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - } - // This can be used to reduce the size of the Q5_K_S model. - // The associated PPL increase is fully in line with the size reduction - //else { - // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; - //} - bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { - int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); - convert_incompatible_tensor = true; - } - } - if (convert_incompatible_tensor) { - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. - LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); - } - } + new_type = get_k_quant_type( + new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2 + ); #endif // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
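The hunk above replaces the long inline k-quant selection logic with a single call to the new get_k_quant_type() helper. Because the helper now advances the per-layer counters itself, they are passed by pointer. A minimal sketch of the resulting call pattern over the tensor loop (a sketch only: the get_tensor_meta() accessor is an assumption, and model, ftype and the n_* totals come from the enclosing function as in the diff):

// sketch: driving get_k_quant_type() across the quantization loop
int i_attention_wv    = 0; // advanced inside the helper for every attn_v.weight tensor
int i_feed_forward_w2 = 0; // advanced inside the helper for every ffn_down.weight tensor

for (int i = 0; i < ml->n_tensors; ++i) {
    struct ggml_tensor * tensor = ml->get_tensor_meta(i); // accessor assumed, not shown in this hunk
    ggml_type new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
    new_type = get_k_quant_type(new_type, tensor, model, ftype,
                                &i_attention_wv,    n_attention_wv,
                                &i_feed_forward_w2, n_feed_forward_w2);
#endif
    // ... requantize (or copy) the tensor data to new_type as in the diff above ...
}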
@@ -4960,23 +5438,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const size_t nelements = ggml_nelements(tensor); float * f32_data; - std::vector f32_conv_buf; if (tensor->type == GGML_TYPE_F32) { f32_data = (float *) tensor->data; } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); } else { - llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); + llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); f32_data = (float *) f32_conv_buf.data(); } LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); fflush(stdout); - work.resize(nelements * 4); // upper bound on size + if (work.size() < nelements * 4) { + work.resize(nelements * 4); // upper bound on size + } new_data = work.data(); - std::vector hist_cur(1 << 4, 0); + std::array hist_cur = {}; static const int chunk_size = 32 * 512; const int nchunk = (nelements + chunk_size - 1)/chunk_size; @@ -4987,13 +5466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s size_t counter = 0; new_size = 0; auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() { - std::vector local_hist; + std::array local_hist = {}; size_t local_size = 0; while (true) { std::unique_lock lock(mutex); size_t first = counter; counter += chunk_size; if (first >= nelements) { - if (!local_hist.empty()) { + if (local_size > 0) { for (int j=0; j %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); @@ -6231,7 +6703,7 @@ int llama_tokenize_with_model( auto res = llama_tokenize_internal(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { - LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); + // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); } diff --git a/prompts/chat-with-baichuan.txt b/prompts/chat-with-baichuan.txt new file mode 100644 index 000000000..11626b692 --- /dev/null +++ b/prompts/chat-with-baichuan.txt @@ -0,0 +1,4 @@ +以下内容为人类用户与与一位智能助手的对话。 + +用户:你好! 
+助手: diff --git a/run_with_preset.py b/run_with_preset.py index 8f90f52a9..9b4d7ecbe 100755 --- a/run_with_preset.py +++ b/run_with_preset.py @@ -13,7 +13,7 @@ CLI_ARGS_MAIN_PERPLEXITY = [ "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", - "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", + "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in new file mode 100644 index 000000000..e1fadc361 --- /dev/null +++ b/scripts/LlamaConfig.cmake.in @@ -0,0 +1,69 @@ +set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) +set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) +set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) +set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(LLAMA_BLAS @LLAMA_BLAS@) +set(LLAMA_CUBLAS @LLAMA_CUBLAS@) +set(LLAMA_METAL @LLAMA_METAL@) +set(LLAMA_MPI @LLAMA_MPI@) +set(LLAMA_CLBLAST @LLAMA_CLBLAST@) +set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@) +set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@) + +@PACKAGE_INIT@ + +set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") +set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") +set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") + +# Ensure transient dependencies satisfied + +find_package(Threads REQUIRED) +if (APPLE AND LLAMA_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) +endif() + +if (LLAMA_BLAS) + find_package(BLAS REQUIRED) +endif() + +if (LLAMA_CUBLAS) + find_package(CUDAToolkit REQUIRED) +endif() + +if (LLAMA_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) +endif() + +if (LLAMA_MPI) + find_package(MPI REQUIRED) +endif() + +if (LLAMA_CLBLAST) + find_package(CLBlast REQUIRED) +endif() + +if (LLAMA_HIPBLAS) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) +endif() + +find_library(llama_LIBRARY llama + REQUIRED + HINTS ${LLAMA_LIB_DIR}) + +set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@") +add_library(llama UNKNOWN IMPORTED) +set_target_properties(llama + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${llama_LIBRARY}" + INTERFACE_COMPILE_FEATURES cxx_std_11 + POSITION_INDEPENDENT_CODE ON ) + +check_required_components(Llama) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index 8630742c6..edbd86f85 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -1,5 +1,6 @@ #include "llama.h" #include "common.h" +#include "console.h" #include #include @@ -89,6 +90,12 @@ int main(int argc, char **argv) { return 2; } +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + bool success = true; for (const auto & test_kv : k_tests()) { diff --git 
a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp new file mode 100644 index 000000000..ab3d822f2 --- /dev/null +++ b/tests/test-tokenizer-1-llama.cpp @@ -0,0 +1,127 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef int codepoint; + +std::string codepoint_to_utf8(codepoint cp) { + std::string result; + if (0x00 <= cp && cp <= 0x7f) { + result.push_back(cp); + } else if (0x80 <= cp && cp <= 0x7ff) { + result.push_back(0xc0 | ((cp >> 6) & 0x1f)); + result.push_back(0x80 | (cp & 0x3f)); + } else if (0x800 <= cp && cp <= 0xffff) { + result.push_back(0xe0 | ((cp >> 12) & 0x0f)); + result.push_back(0x80 | ((cp >> 6) & 0x3f)); + result.push_back(0x80 | (cp & 0x3f)); + } else if (0x10000 <= cp && cp <= 0x10ffff) { + result.push_back(0xf0 | ((cp >> 18) & 0x07)); + result.push_back(0x80 | ((cp >> 12) & 0x3f)); + result.push_back(0x80 | ((cp >> 6) & 0x3f)); + result.push_back(0x80 | (cp & 0x3f)); + } else { + throw std::invalid_argument("invalid codepoint"); + } + return result; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM); + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + const int n_vocab = llama_n_vocab(ctx); + + for (int i = 0; i < n_vocab; ++i) { + std::string str = llama_detokenize_spm(ctx, std::vector(1, i)); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_spm(ctx, tokens); + if (check != str) { + fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n", + __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); + if(i != 3) + return 2; + } + } + + for (codepoint cp = 0x0000; cp < 0xffff; ++cp) { + if (cp < 0xd800 || cp > 0xdfff) { + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_spm(ctx, tokens); + if (str != check) { + fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n", + __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); + if(cp != 0 && cp != 9601) + return 3; + } + } + } + for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) { + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_spm(ctx, tokens); + if (str != check) { + fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n", + __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); + 
return 4; + } + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return 0; +} diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp deleted file mode 100644 index ce4f2898c..000000000 --- a/tests/test-tokenizer-1.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include "llama.h" -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -static std::string escape_whitespace(const std::string& text) { - std::string result = "\xe2\x96\x81"; - for (size_t offs = 0; offs < text.length(); ++offs) { - if (text[offs] == ' ') { - result += "\xe2\x96\x81"; - } else { - result += text[offs]; - } - } - return result; -} - -int main(int argc, char **argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(false); - - // load the vocab - { - auto lparams = llama_context_default_params(); - - lparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), lparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - ctx = llama_new_context_with_model(model, lparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - - GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE); - - const int n_vocab = llama_n_vocab(ctx); - - for (int i = 0; i < n_vocab; ++i) { - std::string forward = llama_token_to_piece(ctx, i); - std::vector tokens = llama_tokenize(ctx, forward, false); - if (tokens.size() == 1) { - if (i != tokens[0]) { - std::string backward = llama_token_to_piece(ctx, tokens[0]); - fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n", - __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str()); - return 2; - } - } - } - -#ifdef _WIN32 - std::wstring_convert, char16_t> u16converter; - for (char16_t ch = 0x0000; ch < 0xffff; ++ch) { - std::u16string u16str(1, ch); - std::string str = u16converter.to_bytes(u16str); - std::vector tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false); - if (tokens.size() == 1) { - fprintf(stderr, "%s : info: %s tokenized to %d \n", - __func__, str.c_str(), tokens[0]); - } - } - - std::wstring_convert, char32_t> u32converter; - for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) { - std::u32string u32str(1, ch); - std::string str = u32converter.to_bytes(u32str); - std::vector tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false); - if (tokens.size() == 1) { - fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]); - } - } -#endif - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return 0; -}
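For reference, the round-trip checks in the new tests/test-tokenizer-1-llama.cpp rely on codepoint_to_utf8() producing the standard 1- to 4-byte UTF-8 encoding. A small self-contained usage sketch (not part of the change set; the encoder is copied from the test above) that prints the encoded bytes for a few representative codepoints:

#include <cstdio>
#include <initializer_list>
#include <stdexcept>
#include <string>

typedef int codepoint;

// same encoding logic as in tests/test-tokenizer-1-llama.cpp above
static std::string codepoint_to_utf8(codepoint cp) {
    std::string result;
    if (0x00 <= cp && cp <= 0x7f) {
        result.push_back(cp);
    } else if (0x80 <= cp && cp <= 0x7ff) {
        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
        result.push_back(0x80 | (cp & 0x3f));
    } else if (0x800 <= cp && cp <= 0xffff) {
        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    } else if (0x10000 <= cp && cp <= 0x10ffff) {
        result.push_back(0xf0 | ((cp >> 18) & 0x07));
        result.push_back(0x80 | ((cp >> 12) & 0x3f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    } else {
        throw std::invalid_argument("invalid codepoint");
    }
    return result;
}

int main() {
    // expected output:
    //   U+0041  -> 41           (ASCII 'A', 1 byte)
    //   U+20AC  -> E2 82 AC     (euro sign, 3 bytes)
    //   U+1F600 -> F0 9F 98 80  (emoji, 4 bytes)
    for (codepoint cp : { 0x41, 0x20AC, 0x1F600 }) {
        std::string s = codepoint_to_utf8(cp);
        printf("U+%04X ->", cp);
        for (unsigned char c : s) {
            printf(" %02X", c);
        }
        printf("\n");
    }
    return 0;
}

The same encoder drives both the per-token and the per-codepoint round-trip loops in the test: each codepoint is encoded, tokenized, detokenized, and compared against the original string.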