diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix new file mode 100644 index 000000000..b8a12cc0a --- /dev/null +++ b/.devops/nix/apps.nix @@ -0,0 +1,22 @@ +{ + perSystem = + { config, lib, ... }: + { + apps = + let + inherit (config.packages) default; + binaries = [ + "llama" + "llama-embedding" + "llama-server" + "quantize" + "train-text-from-scratch" + ]; + mkApp = name: { + type = "app"; + program = "${default}/bin/${name}"; + }; + in + lib.genAttrs binaries mkApp; + }; +} diff --git a/.devops/nix/devshells.nix b/.devops/nix/devshells.nix new file mode 100644 index 000000000..1862f0f08 --- /dev/null +++ b/.devops/nix/devshells.nix @@ -0,0 +1,13 @@ +{ + perSystem = + { config, lib, ... }: + { + devShells = + lib.concatMapAttrs + (name: package: { + ${name} = package.passthru.shell; + ${name + "-extra"} = package.passthru.shell-extra; + }) + config.packages; + }; +} diff --git a/.devops/nix/jetson-support.nix b/.devops/nix/jetson-support.nix new file mode 100644 index 000000000..08426d2ab --- /dev/null +++ b/.devops/nix/jetson-support.nix @@ -0,0 +1,32 @@ +{ inputs, ... }: +{ + perSystem = + { + config, + system, + lib, + pkgsCuda, + ... + }: + lib.optionalAttrs (system == "aarch64-linux") { + packages = + let + caps.jetson-xavier = "7.2"; + caps.jetson-orin = "8.7"; + caps.jetson-nano = "5.3"; + + pkgsFor = + cap: + import inputs.nixpkgs { + inherit system; + config = { + cudaSupport = true; + cudaCapabilities = [ cap ]; + cudaEnableForwardCompat = false; + inherit (pkgsCuda.config) allowUnfreePredicate; + }; + }; + in + builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps; + }; +} diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix new file mode 100644 index 000000000..6e9872b28 --- /dev/null +++ b/.devops/nix/nixpkgs-instances.nix @@ -0,0 +1,35 @@ +{ inputs, ... }: +{ + # The _module.args definitions are passed on to modules as arguments. E.g. + # the module `{ pkgs ... }: { /* config */ }` implicitly uses + # `_module.args.pkgs` (defined in this case by flake-parts). + perSystem = + { system, ... }: + { + _module.args = { + pkgsCuda = import inputs.nixpkgs { + inherit system; + # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, + # and ucx are built with CUDA support) + config.cudaSupport = true; + config.allowUnfreePredicate = + p: + builtins.all + ( + license: + license.free + || builtins.elem license.shortName [ + "CUDA EULA" + "cuDNN EULA" + ] + ) + (p.meta.licenses or [ p.meta.license ]); + }; + # Ensure dependencies use ROCm consistently + pkgsRocm = import inputs.nixpkgs { + inherit system; + config.rocmSupport = true; + }; + }; + }; +} diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix new file mode 100644 index 000000000..5f2a7c9f4 --- /dev/null +++ b/.devops/nix/package.nix @@ -0,0 +1,265 @@ +{ + lib, + config, + stdenv, + mkShell, + cmake, + ninja, + pkg-config, + git, + python3, + mpi, + openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations + cudaPackages, + darwin, + rocmPackages, + clblast, + useBlas ? builtins.all (x: !x) [ + useCuda + useMetalKit + useOpenCL + useRocm + ], + useCuda ? config.cudaSupport, + useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, + useMpi ? false, # Increases the runtime closure size by ~700M + useOpenCL ? false, + useRocm ? config.rocmSupport, + llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake +}@inputs: + +let + inherit (lib) + cmakeBool + cmakeFeature + optionals + strings + versionOlder + ; + + # It's necessary to consistently use backendStdenv when building with CUDA support, + # otherwise we get libstdc++ errors downstream. + stdenv = throw "Use effectiveStdenv instead"; + effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv; + + suffices = + lib.optionals useBlas [ "BLAS" ] + ++ lib.optionals useCuda [ "CUDA" ] + ++ lib.optionals useMetalKit [ "MetalKit" ] + ++ lib.optionals useMpi [ "MPI" ] + ++ lib.optionals useOpenCL [ "OpenCL" ] + ++ lib.optionals useRocm [ "ROCm" ]; + + pnameSuffix = + strings.optionalString (suffices != [ ]) + "-${strings.concatMapStringsSep "-" strings.toLower suffices}"; + descriptionSuffix = + strings.optionalString (suffices != [ ]) + ", accelerated with ${strings.concatStringsSep ", " suffices}"; + + # TODO: package the Python in this repository in a Nix-like way. + # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo + # is PEP 517-compatible, and ensure the correct .dist-info is generated. + # https://peps.python.org/pep-0517/ + llama-python = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ] + ); + + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ps.torchWithoutCuda + ps.transformers + ] + ); + + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 + # separately + darwinBuildInputs = + with darwin.apple_sdk.frameworks; + [ + Accelerate + CoreVideo + CoreGraphics + ] + ++ optionals useMetalKit [ MetalKit ]; + + cudaBuildInputs = with cudaPackages; [ + cuda_cccl.dev # + + # A temporary hack for reducing the closure size, remove once cudaPackages + # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 + cuda_cudart.dev + cuda_cudart.lib + cuda_cudart.static + libcublas.dev + libcublas.lib + libcublas.static + ]; + + rocmBuildInputs = with rocmPackages; [ + clr + hipblas + rocblas + ]; +in + +effectiveStdenv.mkDerivation ( + finalAttrs: { + pname = "llama-cpp${pnameSuffix}"; + version = llamaVersion; + + src = lib.cleanSourceWith { + filter = + name: type: + !(builtins.any (_: _) [ + (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths + (name == "README.md") # Ignore *.md changes whe computing outPaths + (lib.hasPrefix "." name) # Skip hidden files and directories + ]); + src = lib.cleanSource ../../.; + }; + + postPatch = '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + + # TODO: Package up each Python script or service appropriately. + # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, + # we could make those *.py into setuptools' entrypoints + substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" + ''; + + nativeBuildInputs = + [ + cmake + ninja + pkg-config + git + ] + ++ optionals useCuda [ + cudaPackages.cuda_nvcc + + # TODO: Replace with autoAddDriverRunpath + # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged + cudaPackages.autoAddOpenGLRunpathHook + ]; + + buildInputs = + optionals effectiveStdenv.isDarwin darwinBuildInputs + ++ optionals useCuda cudaBuildInputs + ++ optionals useMpi [ mpi ] + ++ optionals useOpenCL [ clblast ] + ++ optionals useRocm rocmBuildInputs; + + cmakeFlags = + [ + (cmakeBool "LLAMA_NATIVE" true) + (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "BUILD_SHARED_LIBS" true) + (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_BLAS" useBlas) + (cmakeBool "LLAMA_CLBLAST" useOpenCL) + (cmakeBool "LLAMA_CUBLAS" useCuda) + (cmakeBool "LLAMA_HIPBLAS" useRocm) + (cmakeBool "LLAMA_METAL" useMetalKit) + (cmakeBool "LLAMA_MPI" useMpi) + ] + ++ optionals useCuda [ + ( + with cudaPackages.flags; + cmakeFeature "CMAKE_CUDA_ARCHITECTURES" ( + builtins.concatStringsSep ";" (map dropDot cudaCapabilities) + ) + ) + ] + ++ optionals useRocm [ + (cmakeFeature "CMAKE_C_COMPILER" "hipcc") + (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") + + # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM + # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt + # and select the line that matches the current nixpkgs version of rocBLAS. + # Should likely use `rocmPackages.clr.gpuTargets`. + "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + ] + ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ] + ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ]; + + # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, + # if they haven't been added yet. + postInstall = '' + mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server + mkdir -p $out/include + cp $src/llama.h $out/include/ + ''; + + # Define the shells here, but don't add in the inputsFrom to avoid recursion. + passthru = { + inherit + useBlas + useCuda + useMetalKit + useMpi + useOpenCL + useRocm + ; + + shell = mkShell { + name = "shell-${finalAttrs.finalPackage.name}"; + description = "contains numpy and sentencepiece"; + buildInputs = [ llama-python ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + + shell-extra = mkShell { + name = "shell-extra-${finalAttrs.finalPackage.name}"; + description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; + buildInputs = [ llama-python-extra ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + }; + + meta = { + # Configurations we don't want even the CI to evaluate. Results in the + # "unsupported platform" messages. This is mostly a no-op, because + # cudaPackages would've refused to evaluate anyway. + badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; + + # Configurations that are known to result in build failures. Can be + # overridden by importing Nixpkgs with `allowBroken = true`. + broken = (useMetalKit && !effectiveStdenv.isDarwin); + + description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/llama.cpp/"; + license = lib.licenses.mit; + + # Accommodates `nix run` and `lib.getExe` + mainProgram = "llama"; + + # These people might respond, on the best effort basis, if you ping them + # in case of Nix-specific regressions or for reviewing Nix-specific PRs. + # Consider adding yourself to this list if you want to ensure this flake + # stays maintained and you're willing to invest your time. Do not add + # other people without their consent. Consider removing people after + # they've been unreachable for long periods of time. + + # Note that lib.maintainers is defined in Nixpkgs, but you may just add + # an attrset following the same format as in + # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix + maintainers = with lib.maintainers; [ + philiptaron + SomeoneSerge + ]; + + # Extend `badPlatforms` instead + platforms = lib.platforms.all; + }; + } +) diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix new file mode 100644 index 000000000..7932ac1e8 --- /dev/null +++ b/.devops/nix/scope.nix @@ -0,0 +1,12 @@ +{ + lib, + newScope, + llamaVersion ? "0.0.0", +}: + +lib.makeScope newScope ( + self: { + inherit llamaVersion; + llama-cpp = self.callPackage ./package.nix { }; + } +) diff --git a/.github/workflows/nix-flakestry.yml b/.github/workflows/nix-flakestry.yml new file mode 100644 index 000000000..3abfb3509 --- /dev/null +++ b/.github/workflows/nix-flakestry.yml @@ -0,0 +1,23 @@ +# Make the flake discoverable on https://flakestry.dev +name: "Publish a flake to flakestry" +on: + push: + tags: + - "v?[0-9]+.[0-9]+.[0-9]+" + - "v?[0-9]+.[0-9]+" + workflow_dispatch: + inputs: + tag: + description: "The existing tag to publish" + type: "string" + required: true +jobs: + publish-flake: + runs-on: ubuntu-latest + permissions: + id-token: "write" + contents: "read" + steps: + - uses: flakestry/flakestry-publish@main + with: + version: "${{ inputs.tag || github.ref_name }}" diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml new file mode 100644 index 000000000..92e1108b3 --- /dev/null +++ b/.github/workflows/python-check-requirements.yml @@ -0,0 +1,29 @@ +name: Python check requirements.txt + +on: + push: + paths: + - 'scripts/check-requirements.sh' + - 'convert*.py' + - 'requirements.txt' + - 'requirements/*.txt' + pull_request: + paths: + - 'scripts/check-requirements.sh' + - 'convert*.py' + - 'requirements.txt' + - 'requirements/*.txt' + +jobs: + python-check-requirements: + runs-on: ubuntu-latest + name: check-requirements + steps: + - name: Check out source repository + uses: actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: Run check-requirements.sh script + run: bash scripts/check-requirements.sh nocleanup diff --git a/awq-py/README.md b/awq-py/README.md new file mode 100644 index 000000000..59354f4e3 --- /dev/null +++ b/awq-py/README.md @@ -0,0 +1,116 @@ +# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp +[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)] + +**Supported models:** + +- [X] LLaMA +- [x] LLaMA 2 +- [X] MPT +- [X] Mistral AI v0.1 +- [ ] Bloom +- [ ] Mixtral MoE + +**TODO:** +- [x] Update version work with both MPT and MPT-AWQ model +- [ ] Add OPT model +- [ ] Add Bloom model +- [ ] Add Mixtral MoE +- [ ] Support w3, w2 + + +## Contents + +- [Install](##Install) +- [Convert](##Convert) +- [Quantize](##Quantize) +- [Test](##Test) +- [Benchmark](##Benchmark) +- [Results](##Results) + +## Install +Install requirements +```bash +pip install -r requirements.txt +``` +Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT +```bash +git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache +``` + +## Convert +Example for llama model +```bash +# For llama7b and llama2 models +python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf +# For mistral and mpt models +python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf +``` + +## Quantize +```bash +# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types. +./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0 +``` + +## Test +```bash +# For all models. +./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time" +``` + +## Benchmark +The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512. +```bash +# For llama and llama2, and mistral models. +./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw +``` + +## Results +Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison +We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k + +### Llama 7B (Build with OpenBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|-----------:|--------------|-------:|-------:|-------:|-------:| +|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 | +|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | +|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 | +|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | +|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + + +### Llama2 7B (Build with CuBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|------------:|--------------|-------:|-------:|-------:|-------:| +|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 | +|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | +|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 | +|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | +|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + + +### Mistral 7B v0.1 (Build with CuBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|-------------:|--------------|-------:|-------:|-------:|-------:| +|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 | +|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G | +|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 | +|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G | +|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + +### MPT 7B (Build with OpenBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|---------:|--------------|-------:|-------:|-------:|--------:| +|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 | +|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G | +|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873| +|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G | +|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | diff --git a/awq-py/awq/apply_awq.py b/awq-py/awq/apply_awq.py new file mode 100644 index 000000000..11132c5d2 --- /dev/null +++ b/awq-py/awq/apply_awq.py @@ -0,0 +1,254 @@ +""" +Implements the AWQ for llama.cpp use cases. +Original paper: https://arxiv.org/abs/2306.00978 + +This code is based on versions of the AWQ implementation found in the following repositories: +* https://github.com/mit-han-lab/llm-awq +* https://github.com/casper-hansen/AutoAWQ +""" + +import os +import torch +import torch.nn as nn + +from transformers import AutoModelForCausalLM, AutoConfig +from transformers.models.bloom.modeling_bloom import BloomGelu +from transformers.models.llama.modeling_llama import LlamaRMSNorm +from transformers.activations import GELUActivation + + +class ScaledActivation(nn.Module): + """ + ScaledActivation module wraps an existing activation function and applies a + scale factor to its output. + + Args: + module (nn.Module): The activation function to be scaled. + scales (torch.Tensor): A tensor of size (num_features,) containing the initial + scale factors for each feature. + + Returns: + torch.Tensor: The scaled output of the activation function. + """ + + def __init__(self, module, scales): + super().__init__() + self.act = module + self.scales = nn.Parameter(scales.data) + + def forward(self, x): + return self.act(x) / self.scales.view(1, 1, -1).to(x.device) + + +def set_op_by_name(layer, name, new_module): + """ + Set the new module for given module's name. + + Args: + layer (nn.Module): The layer in which to replace the submodule. + name (str): The path to the submodule to be replaced, using dot notation + to access nested modules. + new_module (nn.Module): The new module to replace the existing one. + """ + levels = name.split(".") + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_op_by_name(module, op_name): + """ + Retrieves a submodule within a given layer based on its name. + + Args: + module (nn.Module): The layer containing the submodule to find. + op_name (str): The name of the submodule. + + Returns: + nn.Module: The requested submodule found within the given layer. + + Raises: + ValueError: If the specified submodule cannot be found within the layer. + """ + for name, m in module.named_modules(): + if name == op_name: + return m + raise ValueError(f"Cannot find op {op_name} in module {module}") + + +@torch.no_grad() +def scale_ln_fcs(ln, fcs, scales): + """ + Scales the weights of a LayerNorm and a list of fully-connected layers proportionally. + + Args: + ln (nn.LayerNorm): The LayerNorm module to be scaled. + fcs (List[nn.Linear]): A list of fully-connected layers to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + """ + + if not isinstance(fcs, list): + fcs = [fcs] + + scales = scales.to(ln.weight.device) + + ln.weight.div_(scales) + if hasattr(ln, "bias") and ln.bias is not None: + ln.bias.div_(scales) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + + for p in ln.parameters(): + assert torch.isnan(p).sum() == 0 + for fc in fcs: + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + +@torch.no_grad() +def scale_fc_fc(fc1, fc2, scales): + """ + Scales the weights of two fully-connected layers in a specific pattern. + + Args: + fc1 (nn.Linear): The first fully-connected layer to be scaled. + fc2 (nn.Linear): The second fully-connected layer to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + """ + assert isinstance(fc1, nn.Linear) + assert isinstance(fc2, nn.Linear) + + scales = scales.to(fc1.weight.device) + + fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) + if fc1.bias is not None: + fc1.bias.div_(scales.view(-1)) + + fc2.weight.mul_(scales.view(1, -1)) + + for p in fc1.parameters(): + assert torch.isnan(p).sum() == 0 + for p in fc2.parameters(): + assert torch.isnan(p).sum() == 0 + + +@torch.no_grad() +def scale_gelu_fc(gelu, fc, scales): + """ + Scales the weight of a GELU activation and a fully-connected layer proportionally. + + Args: + gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled. + fc (nn.Linear): The fully-connected layer to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + + Raises: + TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`. + TypeError: If the `fc` module is not of type `nn.Linear`. + """ + assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) + assert isinstance(fc, nn.Linear) + + fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) + + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + +def apply_scale(module, scales_list, input_feat_dict=None): + """ + Applies different scaling strategies to layers based on their type and hierarchy within a given module. + + Args: + module (nn.Module): The module containing the layers to be scaled. + scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing: + * prev_op_name (str): The name of the preceding operation or module, + relative to which the layers to be scaled are located. + * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation. + * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature. + input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding + input features (optional). + """ + for prev_op_name, layer_names, scales in scales_list: + prev_op = get_op_by_name(module, prev_op_name) + layers = [get_op_by_name(module, name) for name in layer_names] + + prev_op.cuda() + for layer in layers: + layer.cuda() + scales.cuda() + + if isinstance(prev_op, nn.Linear): + assert len(layers) == 1 + scale_fc_fc(prev_op, layers[0], scales) + elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower(): + scale_ln_fcs(prev_op, layers, scales) + elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): + new_module = ScaledActivation(prev_op, scales) + set_op_by_name(module, prev_op_name, new_module) + scale_gelu_fc(prev_op, layers[0], scales) + else: + raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!") + + # apply the scaling to input feat if given; prepare it for clipping + if input_feat_dict is not None: + for layer_name in layer_names: + inp = input_feat_dict[layer_name] + inp.div_(scales.view(1, -1).to(inp.device)) + + prev_op.cpu() + for layer in layers: + layer.cpu() + scales.cpu() + + +@torch.no_grad() +def apply_clip(module, clip_list): + """ + Applies element-wise clipping to the weight of a specific layer within a given module. + + Args: + module (nn.Module): The module containing the layer to be clipped. + clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing: + * name (str): The name of the layer to be clipped, relative to the root of the module. + * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight. + """ + for name, max_val in clip_list: + layer = get_op_by_name(module, name) + layer.cuda() + max_val = max_val.to(layer.weight.device) + org_shape = layer.weight.shape + layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) + layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) + layer.weight.data = layer.weight.data.reshape(org_shape) + layer.cpu() + + +def add_scale_weights(model_path, scale_path, tmp_path): + """ + Adds pre-computed Activation Weight Quantization (AWQ) results to a model, + including scaling factors and clipping bounds. + + Args: + model_path (str): Path to the pre-trained model to be equipped with AWQ. + scale_path (str): Path to the AWQ scale factors (.pt file). + tmp_path (str): Path to the temporary directory where the equipped model will be saved. + """ + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_path, config=config, trust_remote_code=True + ) + model.eval() + awq_results = torch.load(str(scale_path), map_location="cpu") + apply_scale(model, awq_results["scale"]) + apply_clip(model, awq_results["clip"]) + model.save_pretrained(str(tmp_path)) + os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}") diff --git a/awq-py/requirements.txt b/awq-py/requirements.txt new file mode 100644 index 000000000..5fe604329 --- /dev/null +++ b/awq-py/requirements.txt @@ -0,0 +1,2 @@ +torch>=2.0.0 +transformers>=4.32.0 diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index b5d5453d2..f79acfef1 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -65,4 +65,4 @@ endif() target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE llama build_info) +target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) diff --git a/common/common.cpp b/common/common.cpp index e08e7dbac..ceddb029e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1395,6 +1395,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 303d08170..51724c0df 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -46,7 +46,7 @@ class Model: self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) def set_vocab(self): self._set_vocab_gpt2() @@ -59,7 +59,7 @@ class Model: from safetensors import safe_open ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True)) with ctx as model_part: for name in model_part.keys(): @@ -182,6 +182,8 @@ class Model: return QwenModel if model_architecture == "MixtralForCausalLM": return MixtralModel + if model_architecture == "GPT2LMHeadModel": + return GPT2Model if model_architecture == "PhiForCausalLM": return Phi2Model if model_architecture == "PlamoForCausalLM": @@ -225,6 +227,8 @@ class Model: return gguf.MODEL_ARCH.QWEN if arch == "MixtralForCausalLM": return gguf.MODEL_ARCH.LLAMA + if arch == "GPT2LMHeadModel": + return gguf.MODEL_ARCH.GPT2 if arch == "PhiForCausalLM": return gguf.MODEL_ARCH.PHI2 if arch == "PlamoForCausalLM": @@ -238,7 +242,7 @@ class Model: tokens: list[bytearray] = [] toktypes: list[int] = [] - from transformers import AutoTokenizer # type: ignore[attr-defined] + from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size @@ -464,7 +468,11 @@ class MPTModel(Model): data = data_torch.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if "scales" in name: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() @@ -848,7 +856,7 @@ class StableLMModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(dir_model.name) + self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -894,7 +902,7 @@ class QwenModel(Model): tokens: list[bytearray] = [] toktypes: list[int] = [] - from transformers import AutoTokenizer # type: ignore[attr-defined] + from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams["vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size @@ -989,6 +997,68 @@ class QwenModel(Model): self.gguf_writer.add_tensor(new_name, data) +class GPT2Model(Model): + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")): + continue + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + # note: GPT2 output is tied to (same as) wte in original model + if new_name == "token_embd.weight": + print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor("output.weight", data) + + class Phi2Model(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] @@ -1095,6 +1165,9 @@ def parse_args() -> argparse.Namespace: "--vocab-only", action="store_true", help="extract only the vocab", ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file") parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input", @@ -1112,43 +1185,62 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -args = parse_args() +def main() -> None: + args = parse_args() -dir_model = args.model -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) - sys.exit(1) + dir_model = args.model -ftype_map = { - "f32": gguf.GGMLQuantizationType.F32, - "f16": gguf.GGMLQuantizationType.F16, -} + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' + if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file=sys.stderr) + sys.exit(1) -print(f"Loading model: {dir_model.name}") + ftype_map = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": gguf.GGMLQuantizationType.F16, + } -hparams = Model.load_hparams(dir_model) - -with torch.inference_mode(): - model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) - - print("Set model parameters") - model_instance.set_gguf_parameters() - - print("Set model tokenizer") - model_instance.set_vocab() - - if args.vocab_only: - print(f"Exporting model vocab to '{fname_out}'") - model_instance.write_vocab() + if args.outfile is not None: + fname_out = args.outfile else: - print(f"Exporting model to '{fname_out}'") - model_instance.write() + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' - print(f"Model successfully exported to '{fname_out}'") + print(f"Loading model: {dir_model.name}") + + hparams = Model.load_hparams(dir_model) + + with torch.inference_mode(): + model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + + print("Set model parameters") + model_instance.set_gguf_parameters() + + print("Set model tokenizer") + model_instance.set_vocab() + + if args.vocab_only: + print(f"Exporting model vocab to '{fname_out}'") + model_instance.write_vocab() + else: + print(f"Exporting model to '{fname_out}'") + model_instance.write() + + print(f"Model successfully exported to '{fname_out}'") + + +if __name__ == '__main__': + main() diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 53bb8a3d9..35ce152f4 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -47,95 +47,96 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty fout.seek((fout.tell() + 31) & -32) -if len(sys.argv) < 2: - print(f"Usage: python {sys.argv[0]} [arch]") - print( - "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" - ) - print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") - sys.exit(1) +if __name__ == '__main__': + if len(sys.argv) < 2: + print(f"Usage: python {sys.argv[0]} [arch]") + print( + "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" + ) + print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") + sys.exit(1) -input_json = os.path.join(sys.argv[1], "adapter_config.json") -input_model = os.path.join(sys.argv[1], "adapter_model.bin") -output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") + input_json = os.path.join(sys.argv[1], "adapter_config.json") + input_model = os.path.join(sys.argv[1], "adapter_model.bin") + output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") -model = torch.load(input_model, map_location="cpu") -arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" + model = torch.load(input_model, map_location="cpu") + arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" -if arch_name not in gguf.MODEL_ARCH_NAMES.values(): - print(f"Error: unsupported architecture {arch_name}") - sys.exit(1) + if arch_name not in gguf.MODEL_ARCH_NAMES.values(): + print(f"Error: unsupported architecture {arch_name}") + sys.exit(1) -arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] -name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone + arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] + name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone -with open(input_json, "r") as f: - params = json.load(f) + with open(input_json, "r") as f: + params = json.load(f) -if params["peft_type"] != "LORA": - print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") - sys.exit(1) + if params["peft_type"] != "LORA": + print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + sys.exit(1) -if params["fan_in_fan_out"] is True: - print("Error: param fan_in_fan_out is not supported") - sys.exit(1) + if params["fan_in_fan_out"] is True: + print("Error: param fan_in_fan_out is not supported") + sys.exit(1) -if params["bias"] is not None and params["bias"] != "none": - print("Error: param bias is not supported") - sys.exit(1) + if params["bias"] is not None and params["bias"] != "none": + print("Error: param bias is not supported") + sys.exit(1) -# TODO: these seem to be layers that have been trained but without lora. -# doesn't seem widely used but eventually should be supported -if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: - print("Error: param modules_to_save is not supported") - sys.exit(1) + # TODO: these seem to be layers that have been trained but without lora. + # doesn't seem widely used but eventually should be supported + if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: + print("Error: param modules_to_save is not supported") + sys.exit(1) -with open(output_path, "wb") as fout: - fout.truncate() + with open(output_path, "wb") as fout: + fout.truncate() - write_file_header(fout, params) - for k, v in model.items(): - orig_k = k - if k.endswith(".default.weight"): - k = k.replace(".default.weight", ".weight") - if k in ["llama_proj.weight", "llama_proj.bias"]: - continue - if k.endswith("lora_A.weight"): - if v.dtype != torch.float16 and v.dtype != torch.float32: + write_file_header(fout, params) + for k, v in model.items(): + orig_k = k + if k.endswith(".default.weight"): + k = k.replace(".default.weight", ".weight") + if k in ["llama_proj.weight", "llama_proj.bias"]: + continue + if k.endswith("lora_A.weight"): + if v.dtype != torch.float16 and v.dtype != torch.float32: + v = v.float() + v = v.T + else: v = v.float() - v = v.T - else: - v = v.float() - t = v.detach().numpy() + t = v.detach().numpy() - prefix = "base_model.model." - if k.startswith(prefix): - k = k[len(prefix) :] + prefix = "base_model.model." + if k.startswith(prefix): + k = k[len(prefix) :] - lora_suffixes = (".lora_A.weight", ".lora_B.weight") - if k.endswith(lora_suffixes): - suffix = k[-len(lora_suffixes[0]):] - k = k[: -len(lora_suffixes[0])] - else: - print(f"Error: unrecognized tensor name {orig_k}") - sys.exit(1) + lora_suffixes = (".lora_A.weight", ".lora_B.weight") + if k.endswith(lora_suffixes): + suffix = k[-len(lora_suffixes[0]):] + k = k[: -len(lora_suffixes[0])] + else: + print(f"Error: unrecognized tensor name {orig_k}") + sys.exit(1) - tname = name_map.get_name(k) - if tname is None: - print(f"Error: could not map tensor name {orig_k}") - print(" Note: the arch parameter must be specified if the model is not llama") - sys.exit(1) + tname = name_map.get_name(k) + if tname is None: + print(f"Error: could not map tensor name {orig_k}") + print(" Note: the arch parameter must be specified if the model is not llama") + sys.exit(1) - if suffix == ".lora_A.weight": - tname += ".weight.loraA" - elif suffix == ".lora_B.weight": - tname += ".weight.loraB" - else: - assert False + if suffix == ".lora_A.weight": + tname += ".weight.loraA" + elif suffix == ".lora_B.weight": + tname += ".weight.loraB" + else: + assert False - print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") - write_tensor_header(fout, tname, t.shape, t.dtype) - t.tofile(fout) + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + write_tensor_header(fout, tname, t.shape, t.dtype) + t.tofile(fout) -print(f"Converted {input_json} and {input_model} to {output_path}") + print(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100644 new mode 100755 index 206b7d5ff..1ba5864dc --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import os from pprint import pprint diff --git a/convert.py b/convert.py index 1f0c4f2f4..c3f3fc0a1 100755 --- a/convert.py +++ b/convert.py @@ -1187,6 +1187,7 @@ def main(args_in: list[str] | None = None) -> None: # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") @@ -1200,6 +1201,19 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") args = parser.parse_args(args_in) + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights + tmp_model_path = args.model / "weighted_model" + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") + args.model = tmp_model_path + if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 7b1333a9d..e0520f64c 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -196,13 +196,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); - printf("%s: n_embd: %u\n", __func__, params->n_embd); - printf("%s: n_ff: %u\n", __func__, params->n_ff); - printf("%s: n_head: %u\n", __func__, params->n_head); - printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_vocab : %u\n", __func__, params->n_vocab); + printf("%s: n_ctx : %u\n", __func__, params->n_ctx); + printf("%s: n_embd : %u\n", __func__, params->n_embd); + printf("%s: n_ff : %u\n", __func__, params->n_ff); + printf("%s: n_head : %u\n", __func__, params->n_head); + printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); + printf("%s: n_layer : %u\n", __func__, params->n_layer); printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 464fb3277..66244382f 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -1,5 +1,7 @@ import Foundation +// To use this in your own project, add llama.cpp as a swift package dependency +// and uncomment this import line. // import llama enum LlamaError: Error { diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 3393eb242..17cb5b9dd 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -4,6 +4,7 @@ import Foundation class LlamaState: ObservableObject { @Published var messageLog = "" @Published var cacheCleared = false + let NS_PER_S = 1_000_000_000.0 private var llamaContext: LlamaContext? private var defaultModelUrl: URL? { @@ -20,12 +21,12 @@ class LlamaState: ObservableObject { } func loadModel(modelUrl: URL?) throws { - messageLog += "Loading model...\n" if let modelUrl { + messageLog += "Loading model...\n" llamaContext = try LlamaContext.create_context(path: modelUrl.path()) messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" } else { - messageLog += "Could not locate model\n" + messageLog += "Load a model from the list below\n" } } @@ -34,15 +35,29 @@ class LlamaState: ObservableObject { return } + let t_start = DispatchTime.now().uptimeNanoseconds await llamaContext.completion_init(text: text) + let t_heat_end = DispatchTime.now().uptimeNanoseconds + let t_heat = Double(t_heat_end - t_start) / NS_PER_S + messageLog += "\(text)" - while await llamaContext.n_cur <= llamaContext.n_len { + while await llamaContext.n_cur < llamaContext.n_len { let result = await llamaContext.completion_loop() messageLog += "\(result)" } + + let t_end = DispatchTime.now().uptimeNanoseconds + let t_generation = Double(t_end - t_heat_end) / NS_PER_S + let tokens_per_second = Double(await llamaContext.n_len) / t_generation + await llamaContext.clear() - messageLog += "\n\ndone\n" + messageLog += """ + \n + Done + Heat up took \(t_heat)s + Generated \(tokens_per_second) t/s\n + """ } func bench() async { @@ -56,10 +71,10 @@ class LlamaState: ObservableObject { messageLog += await llamaContext.model_info() + "\n" let t_start = DispatchTime.now().uptimeNanoseconds - await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up + let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up let t_end = DispatchTime.now().uptimeNanoseconds - let t_heat = Double(t_end - t_start) / 1_000_000_000.0 + let t_heat = Double(t_end - t_start) / NS_PER_S messageLog += "Heat up time: \(t_heat) seconds, please wait...\n" // if more than 5 seconds, then we're probably running on a slow device diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index c78f107b3..147e0c63b 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -42,46 +42,27 @@ struct ContentView: View { Button("Send") { sendText() } - .padding(8) - .background(Color.blue) - .foregroundColor(.white) - .cornerRadius(8) Button("Bench") { bench() } - .padding(8) - .background(Color.blue) - .foregroundColor(.white) - .cornerRadius(8) Button("Clear") { clear() } - .padding(8) - .background(Color.blue) - .foregroundColor(.white) - .cornerRadius(8) Button("Copy") { UIPasteboard.general.string = llamaState.messageLog } - .padding(8) - .background(Color.blue) - .foregroundColor(.white) - .cornerRadius(8) - } + }.buttonStyle(.bordered) - VStack { + VStack(alignment: .leading) { DownloadButton( llamaState: llamaState, modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)", modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" ) - .font(.system(size: 12)) - .padding(.top, 4) - .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -89,7 +70,6 @@ struct ContentView: View { modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true", filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf" ) - .font(.system(size: 12)) DownloadButton( llamaState: llamaState, @@ -97,8 +77,6 @@ struct ContentView: View { modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", filename: "tinyllama-1.1b-f16.gguf" ) - .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -106,7 +84,6 @@ struct ContentView: View { modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true", filename: "phi-2-q4_0.gguf" ) - .font(.system(size: 12)) DownloadButton( llamaState: llamaState, @@ -114,8 +91,6 @@ struct ContentView: View { modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true", filename: "phi-2-q8_0.gguf" ) - .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -123,15 +98,15 @@ struct ContentView: View { modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true", filename: "mistral-7b-v0.1.Q4_0.gguf" ) - .font(.system(size: 12)) Button("Clear downloaded models") { ContentView.cleanupModelCaches() llamaState.cacheCleared = true } - .padding(8) - .font(.system(size: 12)) } + .padding(.top, 4) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) } .padding() } diff --git a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift index 4bd75cb69..c9f322ca1 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift @@ -93,7 +93,7 @@ struct DownloadButton: View { print("Error: \(err.localizedDescription)") } }) { - Text("\(modelName) (Downloaded)") + Text("Load \(modelName)") } } else { Text("Unknown status") diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 8ea3e5c83..2985caff8 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -24,7 +24,8 @@ endif() if (NOT MSVC) target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h - endif() +endif() + if(TARGET BUILD_INFO) add_dependencies(llava BUILD_INFO) endif() @@ -32,5 +33,5 @@ endif() set(TARGET llava-cli) add_executable(llava-cli llava-cli.cpp) install(TARGETS llava-cli RUNTIME) -target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(llava PRIVATE cxx_std_11) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index f06ec400d..6a731eeec 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,12 +16,19 @@ #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" -#define CLIP_DEBUG - static std::string format(const char * fmt, ...) { va_list ap; va_list ap2; @@ -196,20 +203,6 @@ struct clip_vision_model { struct ggml_tensor * mm_2_b; }; -// Replacement for std::vector that doesn't require zero-initialization. -struct clip_buffer { - uint8_t * data = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] data; - data = new uint8_t[size]; - this->size = size; - } - - ~clip_buffer() { delete[] data; } -}; - struct clip_ctx { bool has_text_encoder = false; bool has_vision_encoder = false; @@ -223,9 +216,10 @@ struct clip_ctx { struct gguf_context * ctx_gguf; // memory buffers to evaluate the model - clip_buffer buf_compute; - clip_buffer buf_alloc; - ggml_allocr * alloc = NULL; + ggml_backend_buffer_t params_buffer = NULL; + ggml_backend_buffer_t compute_buffer = NULL; + ggml_backend_t backend = NULL; + ggml_allocr * compute_alloc = NULL; }; static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) { @@ -252,25 +246,20 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima if(ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); } - - const auto & buf_compute = ctx->buf_compute; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, + /*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, }; - params.no_alloc = true; - struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); - ggml_allocr_alloc(ctx->alloc, inp_raw); + ggml_allocr_alloc(ctx->compute_alloc, inp_raw); - if (!ggml_allocr_is_measure(ctx->alloc)) { - float * data = (float *)ggml_get_data(inp_raw); + if (!ggml_allocr_is_measure(ctx->compute_alloc)) { + float * data = (float *)malloc(ggml_nbytes(inp_raw)); for (size_t i = 0; i < imgs->size; i++) { const int nx = imgs->data[i].nx; @@ -289,6 +278,8 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima } } } + ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + free(data); } struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); @@ -298,36 +289,39 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima // concat class_embeddings and patch_embeddings struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_allocr_alloc(ctx->alloc, embeddings); - if (!ggml_allocr_is_measure(ctx->alloc)) { - ggml_set_zero(embeddings); + ggml_allocr_alloc(ctx->compute_alloc, embeddings); + if (!ggml_allocr_is_measure(ctx->compute_alloc)) { + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); } - struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); - ggml_allocr_alloc(ctx->alloc, temp); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], - embeddings->nb[2], embeddings->nb[3], 0); - embeddings = - ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_allocr_alloc(ctx->alloc, positions); - if (!ggml_allocr_is_measure(ctx->alloc)) { + ggml_allocr_alloc(ctx->compute_alloc, positions); + if (!ggml_allocr_is_measure(ctx->compute_alloc)) { + int* positions_data = (int*)malloc(ggml_nbytes(positions)); for (int i = 0; i < num_positions; i++) { - ggml_set_i32_1d(positions, i, i); + positions_data[i] = i; } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); } embeddings = - ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); + ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings), - ggml_repeat(ctx0, model.pre_ln_b, embeddings)); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); } // loop over layers @@ -340,15 +334,15 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima { cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), + model.layers[il].ln_1_b); } // self-attention { struct ggml_tensor * Q = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); @@ -356,14 +350,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); struct ggml_tensor * K = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur)); + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); struct ggml_tensor * V = - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur)); + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); @@ -379,7 +373,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima } // attention output - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, embeddings); @@ -390,12 +384,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima { cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); } cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); if (ctx->use_gelu) { cur = ggml_gelu_inplace(ctx0, cur); @@ -404,7 +397,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima } cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); // residual 2 cur = ggml_add(ctx0, embeddings, cur); @@ -417,23 +410,26 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_allocr_alloc(ctx->alloc, patches); - if (!ggml_allocr_is_measure(ctx->alloc)) { - for (int i = 0; i < num_patches; ++i) { - ggml_set_i32_1d(patches, i, i+1); + ggml_allocr_alloc(ctx->compute_alloc, patches); + if (!ggml_allocr_is_measure(ctx->compute_alloc)) { + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_positions; i++) { + patches_data[i] = i + 1; } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); } embeddings = ggml_get_rows(ctx0, embeddings, patches); // mm projection 0 embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); } // build the graph @@ -446,7 +442,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima // read and create ggml_context containing the tensors and their data struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { - struct ggml_context * meta = NULL; struct gguf_init_params params = { @@ -479,7 +474,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("%s: ftype: %s\n", __func__, ftype_str.c_str()); printf("\n"); } - + const int n_tensors = gguf_get_n_tensors(ctx); // kv if (verbosity >= 3) { const int n_kv = gguf_get_n_kv(ctx); @@ -493,27 +488,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } // data - size_t ctx_size = 0; + size_t buffer_size = 0; { - const int n_tensors = gguf_get_n_tensors(ctx); - for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - struct ggml_tensor * cur = ggml_get_tensor(meta, name); - ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; size_t tensor_size = ggml_nbytes(cur); - size_t padded_size = ggml_nbytes_pad(cur); - ctx_size += padded_size; + buffer_size += tensor_size; if (verbosity >= 3) { - printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, - ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset); + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, + ggml_n_dims(cur), cur->name, tensor_size, offset); } } } + buffer_size += n_tensors * 128 /* CLIP PADDING */; + clip_ctx * new_clip = new clip_ctx; +#ifdef GGML_USE_CUBLAS + new_clip->backend = ggml_backend_cuda_init(0); + printf("%s: CLIP using CUDA backend\n", __func__); +#endif + +#ifdef GGML_USE_METAL + new_clip->backend = ggml_backend_metal_init(); + printf("%s: CLIP using Metal backend\n", __func__); +#endif + + if (!new_clip->backend) { + new_clip->backend = ggml_backend_cpu_init(); + printf("%s: CLIP using CPU backend\n", __func__); + } // model size and capabilities { @@ -539,17 +545,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0)); + printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0); printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } + printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors); + // load tensors { + std::vector read_buf; struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; new_clip->ctx = ggml_init(params); @@ -566,13 +575,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return nullptr; } - const int n_tensors = gguf_get_n_tensors(ctx); + // add tensors to context for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); struct ggml_tensor * t = ggml_get_tensor(meta, name); struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t); ggml_set_name(cur, name); + } + // alloc memory and offload data + new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size); + ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name); + ggml_allocr_alloc(alloc, cur); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { @@ -580,10 +597,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { clip_free(new_clip); return nullptr; } - - fin.read(reinterpret_cast(cur->data), ggml_nbytes(t)); + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } } - + ggml_allocr_free(alloc); fin.close(); } @@ -657,18 +682,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // measure mem requirement and allocate { - static const size_t tensor_alignment = 32; - new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead()); - new_clip->alloc = ggml_allocr_new_measure(tensor_alignment); + new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend); clip_image_f32_batch batch; batch.size = 1; ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); - size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment; - ggml_allocr_free(new_clip->alloc); - new_clip->buf_alloc.resize(alloc_size); - new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment); + size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf); + ggml_allocr_free(new_clip->compute_alloc); + new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size); + new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer); - printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0); + printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; @@ -852,29 +875,29 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl } // reset alloc buffer to clean the memory from previous invocations - ggml_allocr_reset(ctx->alloc); + ggml_allocr_reset(ctx->compute_alloc); // build the inference graph ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); - ggml_allocr_alloc_graph(ctx->alloc, gf); + ggml_allocr_alloc_graph(ctx->compute_alloc, gf); - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = (uint8_t *)malloc(plan.work_size); + if (ggml_backend_is_cpu(ctx->backend)) { + ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); } - ggml_graph_compute(gf, &plan); +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(ctx->backend)) { + ggml_backend_metal_set_n_cb(ctx->backend, n_threads); + } +#endif + + ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor -struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; // copy the embeddings to the location passed by the user - memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings)); - - if (plan.work_size > 0) { - free(plan.work_data); - } - + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); return true; } @@ -1045,8 +1068,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); int64_t sum_all = 0; for (size_t i = 0; i < hist_all.size(); ++i) { diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 31f8cd8e0..502b788b1 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -39,73 +39,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n return true; } -// TODO: use common/sampling.h -static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { - auto & sparams = params.sparams; - - // out of user input, sample next token - const float temp = sparams.temp; - const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k; - const float top_p = sparams.top_p; - const float tfs_z = sparams.tfs_z; - const float typical_p = sparams.typical_p; - // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n; - // const float repeat_penalty = sparams.repeat_penalty; - // const float alpha_presence = sparams.presence_penalty; - // const float alpha_frequency = sparams.frequency_penalty; - const int mirostat = sparams.mirostat; - const float mirostat_tau = sparams.mirostat_tau; - const float mirostat_eta = sparams.mirostat_eta; - // const bool penalize_nl = sparams.penalize_nl; - - llama_token id = 0; - { - auto logits = llama_get_logits(ctx_llama); - auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama)); - - // Apply params.logit_bias map - for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx_llama, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temp(ctx_llama, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx_llama, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1); - llama_sample_temp(ctx_llama, &candidates_p, temp); - id = llama_sample_token(ctx_llama, &candidates_p); - } - } - } - - return id; -} - -static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) { - int id = sample_id(ctx_llama, params); +static const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); static std::string ret; if (id == llama_token_eos(llama_get_model(ctx_llama))) { ret = ""; @@ -174,8 +112,8 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - printf("\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); + fprintf(stderr, "\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) { @@ -185,7 +123,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para auto prompt = params->prompt; if (prompt_contains_image(prompt)) { if (!params->image.empty()) { - printf("using base64 encoded image instead of command line image path\n"); + fprintf(stderr, "using base64 encoded image instead of command line image path\n"); } embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); if (!embed) { @@ -217,16 +155,19 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // generate the response - printf("\n"); + fprintf(stderr, "\n"); + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past); + const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); if (strcmp(tmp, "") == 0) break; printf("%s", tmp); fflush(stdout); } + llama_sampling_free(ctx_sampling); printf("\n"); } diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index cb00edbbb..deb77d588 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -7,28 +7,13 @@ find_package(Llama 0.0.1 REQUIRED) # Bake common functionality in with target. Because applications # using the relocatable Llama package should be outside of the # source tree, main-cmake-pkg pretends the dependencies are built-in. - set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") -add_library(common OBJECT - ${_common_path}/common.h - ${_common_path}/common.cpp - ${_common_path}/console.h - ${_common_path}/console.cpp - ${_common_path}/grammar-parser.h - ${_common_path}/grammar-parser.cpp - ${_common_path}/sampling.h - ${_common_path}/sampling.cpp - ) - -# WARNING: because build-info.h is auto-generated, it will only -# be available after the user has built the llama.cpp sources. -# -configure_file(${_common_path}/../build-info.h - ${CMAKE_CURRENT_BINARY_DIR}/build-info.h - COPYONLY) - -target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR} - ${CMAKE_CURRENT_BINARY_DIR}) +add_library(common OBJECT) +file(GLOB _common_files + "${_common_path}/*.h" + "${_common_path}/*.cpp" +) +target_sources(common PRIVATE ${_common_files}) # If the common project was part of "main-cmake-pkg" the transient # defines would automatically be attached. Because the common func- diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 859cd12c6..81709e448 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/examples/server/README.md b/examples/server/README.md index f1e586a1c..718a7e064 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -166,7 +166,7 @@ node index.js `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) - `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. *Result JSON:* @@ -224,6 +224,8 @@ node index.js `content`: Set the text to process. + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. + - **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. *Options:* diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c8bcc2a04..8dbbed9f7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -442,7 +443,6 @@ struct llama_client_slot } images.clear(); - // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? } bool has_budget(gpt_params &global_params) { @@ -543,7 +543,9 @@ struct llama_server_context std::vector queue_results; std::vector queue_multitasks; std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks + std::condition_variable condition_tasks; std::mutex mutex_results; + std::condition_variable condition_results; ~llama_server_context() { @@ -922,6 +924,7 @@ struct llama_server_context llama_sampling_free(slot->ctx_sampling); } slot->ctx_sampling = llama_sampling_init(slot->sparams); + llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -1170,7 +1173,7 @@ struct llama_server_context void send_error(task_server& task, std::string error) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; @@ -1178,6 +1181,7 @@ struct llama_server_context res.error = true; res.result_json = { { "content", error } }; queue_results.push_back(res); + condition_results.notify_all(); } void add_multi_task(int id, std::vector& sub_ids) @@ -1187,6 +1191,7 @@ struct llama_server_context multi.id = id; std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); queue_multitasks.push_back(multi); + condition_tasks.notify_one(); } void update_multi_task(int multitask_id, int subtask_id, task_result& result) @@ -1198,6 +1203,7 @@ struct llama_server_context { multitask.subtasks_remaining.erase(subtask_id); multitask.results.push_back(result); + condition_tasks.notify_one(); } } } @@ -1216,7 +1222,7 @@ struct llama_server_context {"n_ctx", slot.n_ctx}, {"model", params.model_alias}, {"seed", slot.params.seed}, - {"temp", slot.sparams.temp}, + {"temperature", slot.sparams.temp}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, @@ -1245,7 +1251,7 @@ struct llama_server_context void send_partial_response(llama_client_slot &slot, completion_token_output tkn) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1281,11 +1287,12 @@ struct llama_server_context } queue_results.push_back(res); + condition_results.notify_all(); } void send_final_response(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1341,11 +1348,12 @@ struct llama_server_context } queue_results.push_back(res); + condition_results.notify_all(); } void send_embedding(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1373,6 +1381,7 @@ struct llama_server_context }; } queue_results.push_back(res); + condition_results.notify_all(); } int request_completion(json data, bool infill, bool embedding, int multitask_id) @@ -1396,6 +1405,7 @@ struct llama_server_context // otherwise, it's a single-prompt task, we actually queue it queue_tasks.push_back(task); + condition_tasks.notify_one(); return task.id; } @@ -1403,13 +1413,10 @@ struct llama_server_context { while (true) { - std::this_thread::sleep_for(std::chrono::microseconds(5)); - std::lock_guard lock(mutex_results); - - if (queue_results.empty()) - { - continue; - } + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return !queue_results.empty(); + }); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -1505,12 +1512,13 @@ struct llama_server_context void request_cancel(int task_id) { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; task.type = CANCEL_TASK; task.target_id = task_id; queue_tasks.push_back(task); + condition_tasks.notify_one(); } int split_multiprompt_task(task_server& multiprompt_task) @@ -1536,7 +1544,7 @@ struct llama_server_context void process_tasks() { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); while (!queue_tasks.empty()) { task_server task = queue_tasks.front(); @@ -1608,6 +1616,7 @@ struct llama_server_context std::lock_guard lock(mutex_results); queue_results.push_back(aggregate_result); + condition_results.notify_all(); queue_iterator = queue_multitasks.erase(queue_iterator); } @@ -1638,8 +1647,10 @@ struct llama_server_context LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); kv_cache_clear(); } - // avoid 100% usage of cpu all time - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + std::unique_lock lock(mutex_tasks); + condition_tasks.wait(lock, [&]{ + return !queue_tasks.empty(); + }); } for (llama_client_slot &slot : slots) @@ -2438,26 +2449,33 @@ json oaicompat_completion_params_parse( llama_params["__oaicompat"] = true; // Map OpenAI parameters to llama.cpp parameters + // + // For parameters that are defined by the OpenAI documentation (e.g. + // temperature), we explicitly specify OpenAI's intended default; we + // need to do that because sometimes OpenAI disagrees with llama.cpp + // + // https://platform.openai.com/docs/api-reference/chat/create + llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("uknown")); llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); - llama_params["temperature"] = json_value(body, "temperature", 0.8); - llama_params["top_k"] = json_value(body, "top_k", 40); - llama_params["top_p"] = json_value(body, "top_p", 0.95); + llama_params["temperature"] = json_value(body, "temperature", 0.0); + llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); + llama_params["top_p"] = json_value(body, "top_p", 1.0); llama_params["n_predict"] = json_value(body, "max_tokens", -1); llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", 0); + llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); llama_params["stream"] = json_value(body, "stream", false); - llama_params["mirostat"] = json_value(body, "mirostat", false); - llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); - llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); - llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); - llama_params["typical_p"] = json_value(body, "typical_p", 0.0); - llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); + llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); + llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); - llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); + llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); if (body.count("grammar") != 0) { llama_params["grammar"] = json_value(body, "grammar", json::object()); @@ -3071,7 +3089,17 @@ int main(int argc, char **argv) { prompt = ""; } - const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); + + json image_data; + if (body.count("image_data") != 0) { + image_data = body["image_data"]; + } + else + { + image_data = ""; + } + + const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1); task_result result = llama.next_result(task_id); return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); diff --git a/ggml-backend.c b/ggml-backend.c index 526ce732b..2c3752067 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c } static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return true; + switch (op->op) { + case GGML_OP_MUL_MAT: + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + default: + return true; + } GGML_UNUSED(backend); - GGML_UNUSED(op); } static struct ggml_backend_i cpu_backend_i = { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0ceddf6f0..afe4e4a26 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -119,7 +119,9 @@ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 #define CC_OFFSET_AMD 1000000 +#define CC_RDNA1 (CC_OFFSET_AMD + 1010) #define CC_RDNA2 (CC_OFFSET_AMD + 1030) +#define CC_RDNA3 (CC_OFFSET_AMD + 1100) #define GGML_CUDA_MAX_NODES 8192 @@ -133,7 +135,6 @@ // TODO: improve this to be correct for more hardware // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores -// probably other such cases, and not sure what happens on AMD hardware #if !defined(GGML_CUDA_FORCE_MMQ) #define CUDA_USE_TENSOR_CORES #endif @@ -6664,7 +6665,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) { // pool with virtual memory static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0}; static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0}; -static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB +static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); @@ -7479,6 +7480,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec( const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + GGML_ASSERT(src1->type == GGML_TYPE_F32); + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics #ifdef GGML_CUDA_F16 cuda_pool_alloc src1_dfloat_a; @@ -7571,6 +7574,7 @@ static void ggml_cuda_op_mul_mat_cublas( const int compute_capability = g_device_caps[id].cc; if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + //printf("this branch\n"); // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 cuda_pool_alloc src0_as_f16; if (src0->type != GGML_TYPE_F16) { @@ -7608,9 +7612,9 @@ static void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); - } - else { + } else { cuda_pool_alloc src0_ddq_as_f32; + cuda_pool_alloc src1_ddq_as_f32; if (src0->type != GGML_TYPE_F32) { const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); @@ -7618,7 +7622,15 @@ static void ggml_cuda_op_mul_mat_cublas( src0_ddq_as_f32.alloc(row_diff*ne00); to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); } + if (src1->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type); + GGML_ASSERT(to_fp32_cuda != nullptr); + src1_ddq_as_f32.alloc(src1_ncols*ne10); + to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); + const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get(); const float alpha = 1.0f; const float beta = 0.0f; @@ -7627,9 +7639,9 @@ static void ggml_cuda_op_mul_mat_cublas( CUBLAS_CHECK( cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, - &alpha, src0_ddf_i, ne00, - src1_ddf_i, ne10, - &beta, dst_dd_i, ldc)); + &alpha, src0_ddf_i, ne00, + src1_ddf1_i, ne10, + &beta, dst_dd_i, ldc)); } (void) dst; @@ -8029,6 +8041,7 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -8475,9 +8488,9 @@ static __global__ void k_compute_batched_ptrs( int64_t i03 = i13 / r3; int64_t i02 = i12 / r2; - ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; - ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -8486,28 +8499,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; + GGML_TENSOR_BINARY_OP_LOCALS - const int64_t nb01 = src0->nb[1]; - const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); - const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t nb11 = src1->nb[1]; - const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); - const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); - - const int64_t ne1 = ggml_nelements(src1); - const int64_t ne = ggml_nelements(dst); + const int64_t ne_dst = ggml_nelements(dst); ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; @@ -8516,7 +8511,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - half * src0_as_f16 = (half *) src0_ddq; + half * src0_f16 = (half *) src0_ddq; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; @@ -8525,11 +8520,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; // convert src1 to fp16 - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); - GGML_ASSERT(to_fp16_cuda != nullptr); - - cuda_pool_alloc src1_as_f16(ne1); - to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream); + cuda_pool_alloc src1_f16_alloc; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + const int64_t ne_src1 = ggml_nelements(src1); + src1_f16_alloc.alloc(ne_src1); + GGML_ASSERT(to_fp16_cuda != nullptr); + to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + } + half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get(); cuda_pool_alloc dst_f16; char * dst_t; @@ -8551,7 +8550,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const const void * beta = &beta_f16; if (dst->op_params[0] == GGML_PREC_DEFAULT) { - dst_t = (char *) dst_f16.alloc(ne); + dst_t = (char *) dst_f16.alloc(ne_dst); nbd2 /= sizeof(float) / sizeof(half); nbd3 /= sizeof(float) / sizeof(half); @@ -8598,9 +8597,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_CHECK( cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA - (const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB - beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC + alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA + (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB + beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC ne12*ne13, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -8613,12 +8612,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( - src0_as_f16, src1_as_f16.get(), dst_t, + src0_f16, src1_f16, dst_t, ptrs_src.get(), ptrs_dst.get(), ne12, ne13, ne23, nb02, nb03, - nb12, nb13, + src1->type == GGML_TYPE_F16 ? nb12 : nb12/2, + src1->type == GGML_TYPE_F16 ? nb13 : nb13/2, nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); @@ -8626,8 +8626,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half), - (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, + (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10, beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, ne23, cu_compute_type, @@ -8637,7 +8637,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const if (dst->op_params[0] == GGML_PREC_DEFAULT) { const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream); + to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream); } } @@ -8656,11 +8656,30 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } -#ifdef CUDA_USE_TENSOR_CORES - const bool use_tensor_cores = true; +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + + const bool fp16_performance_good = min_compute_capability >= CC_RDNA1; + bool use_mul_mat_q = ggml_is_quantized(src0->type); + + if(!g_mul_mat_q) + { + use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3; + } + + #else - const bool use_tensor_cores = false; -#endif + + const bool fp16_performance_good = min_compute_capability >= CC_VOLTA; + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + if(!g_mul_mat_q) + { + use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE); + } + +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) // debug helpers //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); @@ -8670,19 +8689,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { - if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) { #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else @@ -8696,17 +8715,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } } else { - bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); - - // when tensor cores are available, use them for large batch size - // ref: https://github.com/ggerganov/llama.cpp/pull/3776 - if (min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { - if(!g_mul_mat_q) - { - use_mul_mat_q = false; - } - } - if (use_mul_mat_q) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { diff --git a/ggml-opencl.h b/ggml-opencl.h index 5edea8c9d..8c19a32cd 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -6,19 +6,19 @@ extern "C" { #endif -void ggml_cl_init(void); +GGML_API void ggml_cl_init(void); -void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); +GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -void * ggml_cl_host_malloc(size_t size); -void ggml_cl_host_free(void * ptr); +GGML_API void * ggml_cl_host_malloc(size_t size); +GGML_API void ggml_cl_host_free(void * ptr); -void ggml_cl_free_data(const struct ggml_tensor* tensor); +GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); -void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); #ifdef __cplusplus } diff --git a/ggml.c b/ggml.c index 7a541b118..ffe88ec17 100644 --- a/ggml.c +++ b/ggml.c @@ -9687,7 +9687,7 @@ static void ggml_compute_forward_mul_mat( const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(params->wsize >= ne11*ne12*ne13*row_size); - assert(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -19679,6 +19679,14 @@ int ggml_cpu_has_avx(void) { #endif } +int ggml_cpu_has_avx_vnni(void) { +#if defined(__AVXVNNI__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_avx2(void) { #if defined(__AVX2__) return 1; diff --git a/ggml.h b/ggml.h index 3090795ae..2a4fc5cb7 100644 --- a/ggml.h +++ b/ggml.h @@ -2205,6 +2205,7 @@ extern "C" { // GGML_API int ggml_cpu_has_avx (void); + GGML_API int ggml_cpu_has_avx_vnni (void); GGML_API int ggml_cpu_has_avx2 (void); GGML_API int ggml_cpu_has_avx512 (void); GGML_API int ggml_cpu_has_avx512_vbmi(void); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4cd87cdda..ae62cc575 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -120,6 +120,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() + FFN_ACT = auto() FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() @@ -169,6 +170,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", @@ -269,6 +271,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_ACT, ], MODEL_ARCH.GPTJ: [ MODEL_TENSOR.TOKEN_EMBD, @@ -367,7 +370,16 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, ], MODEL_ARCH.GPT2: [ - # TODO + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, ], MODEL_ARCH.PHI2: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 446c6b688..80c1d5449 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -17,6 +17,7 @@ class TensorNameMap: "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert "language_model.embedding.word_embeddings", # persimmon + "wte", # gpt2 "transformer.embd.wte", # phi2 ), @@ -34,6 +35,7 @@ class TensorNameMap: MODEL_TENSOR.POS_EMBD: ( "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert + "wpe", # gpt2 ), # Output @@ -53,7 +55,7 @@ class TensorNameMap: "norm", # llama-pth "embeddings.LayerNorm", # bert "transformer.norm_f", # mpt - "ln_f", # refact bloom qwen + "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon "lm_head.ln", # phi2 ), @@ -78,6 +80,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 "transformer.h.{bid}.ln", # phi2 "model.layers.layers.{bid}.norm", # plamo ), @@ -95,6 +98,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 ), @@ -137,6 +141,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 "transformer.h.{bid}.mixer.out_proj", # phi2 "model.layers.layers.{bid}.self_attn.o_proj", # plamo ), @@ -159,6 +164,7 @@ class TensorNameMap: "encoder.layer.{bid}.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -179,6 +185,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc_in", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.layers.{bid}.mlp.up_proj", # plamo ), @@ -188,6 +195,11 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral ), + # AWQ-activation gate + MODEL_TENSOR.FFN_ACT: ( + "transformer.blocks.{bid}.ffn.act", # mpt + ), + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( "model.layers.{bid}.mlp.gate_proj", # llama-hf refact @@ -213,6 +225,7 @@ class TensorNameMap: "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 "transformer.h.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo ), diff --git a/llama.cpp b/llama.cpp index f40ee8ef2..0b1b09210 100644 --- a/llama.cpp +++ b/llama.cpp @@ -355,6 +355,7 @@ enum llm_tensor { LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_ACT, LLM_TENSOR_FFN_DOWN_EXP, LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_UP_EXP, @@ -423,6 +424,15 @@ static std::map> LLM_TENSOR_NAMES = LLM_ARCH_GPT2, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, { @@ -474,6 +484,7 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, }, }, { @@ -1259,6 +1270,10 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_SMALL, + MODEL_MEDIUM, + MODEL_LARGE, + MODEL_XL, }; static const size_t kiB = 1024; @@ -1290,6 +1305,7 @@ struct llama_hparams { float f_clamp_kqv; float f_max_alibi_bias; + bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; if (this->n_vocab != other.n_vocab) return true; @@ -1393,6 +1409,7 @@ struct llama_layer { // ff bias struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 + struct ggml_tensor * ffn_act; }; struct llama_kv_cell { @@ -2559,18 +2576,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_1B: return "1B"; - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; - case MODEL_8B: return "8B"; - case MODEL_13B: return "13B"; - case MODEL_15B: return "15B"; - case MODEL_30B: return "30B"; - case MODEL_34B: return "34B"; - case MODEL_40B: return "40B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - default: return "?B"; + case MODEL_1B: return "1B"; + case MODEL_3B: return "3B"; + case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; + case MODEL_13B: return "13B"; + case MODEL_15B: return "15B"; + case MODEL_30B: return "30B"; + case MODEL_34B: return "34B"; + case MODEL_40B: return "40B"; + case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; + case MODEL_SMALL: return "0.1B"; + case MODEL_MEDIUM: return "0.4B"; + case MODEL_LARGE: return "0.8B"; + case MODEL_XL: return "1.5B"; + default: return "?B"; } } @@ -2789,6 +2810,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GPT2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 12: model.type = e_model::MODEL_SMALL; break; + case 24: model.type = e_model::MODEL_MEDIUM; break; + case 36: model.type = e_model::MODEL_LARGE; break; + case 48: model.type = e_model::MODEL_XL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3500,7 +3532,6 @@ static bool llm_load_tensors( case LLM_ARCH_MPT: { model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - // output { ggml_backend_type backend_norm; @@ -3538,6 +3569,9 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + // AWQ ScaleActivation layer + layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false); } } break; case LLM_ARCH_STABLELM: @@ -3733,6 +3767,60 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); } } break; + case LLM_ARCH_GPT2: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -4068,6 +4156,7 @@ static struct ggml_tensor * llm_build_ffn( struct ggml_tensor * gate_b, struct ggml_tensor * down, struct ggml_tensor * down_b, + struct ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, const llm_build_cb & cb, @@ -4112,6 +4201,10 @@ static struct ggml_tensor * llm_build_ffn( { cur = ggml_gelu(ctx, cur); cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx, cur, act_scales); + cb(cur, "ffn_act", il); + } } break; case LLM_FFN_RELU: { @@ -4430,6 +4523,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } else { @@ -4609,6 +4703,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -4723,6 +4818,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4827,6 +4923,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5031,6 +5128,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5117,6 +5215,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5212,6 +5311,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5297,11 +5397,11 @@ struct llm_build_context { NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, + model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5410,6 +5510,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5522,6 +5623,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5629,6 +5731,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(ffn_output, "ffn_out", il); } @@ -5732,6 +5835,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5761,6 +5865,102 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_gpt2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, model, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; // @@ -5916,6 +6116,7 @@ static const std::unordered_map k_offload_map { "ffn_gate", OFFLOAD_FUNC }, { "ffn_gate_b", OFFLOAD_FUNC }, { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_act", OFFLOAD_FUNC }, { "ffn_down", OFFLOAD_FUNC }, { "ffn_down_b", OFFLOAD_FUNC }, { "ffn_out", OFFLOAD_FUNC }, @@ -6275,6 +6476,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_plamo(); } break; + case LLM_ARCH_GPT2: + { + result = llm.build_gpt2(); + } break; default: GGML_ASSERT(false); } @@ -10853,6 +11058,7 @@ const char * llama_print_system_info(void) { s = ""; s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | "; s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; diff --git a/models/ggml-vocab-gpt2.gguf b/models/ggml-vocab-gpt2.gguf new file mode 100644 index 000000000..1fbc72c1e Binary files /dev/null and b/models/ggml-vocab-gpt2.gguf differ diff --git a/requirements-hf-to-gguf.txt b/requirements-hf-to-gguf.txt deleted file mode 100644 index f4600539e..000000000 --- a/requirements-hf-to-gguf.txt +++ /dev/null @@ -1,3 +0,0 @@ --r requirements.txt -torch==2.1.1 -transformers==4.35.2 diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt new file mode 100644 index 000000000..6ac402610 --- /dev/null +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert.txt +torch~=2.1.1 diff --git a/requirements/requirements-convert-llama-ggml-to-gguf.txt b/requirements/requirements-convert-llama-ggml-to-gguf.txt new file mode 100644 index 000000000..a0f37cd1c --- /dev/null +++ b/requirements/requirements-convert-llama-ggml-to-gguf.txt @@ -0,0 +1 @@ +-r ./requirements-convert.txt diff --git a/requirements/requirements-convert-lora-to-ggml.txt b/requirements/requirements-convert-lora-to-ggml.txt new file mode 100644 index 000000000..6ac402610 --- /dev/null +++ b/requirements/requirements-convert-lora-to-ggml.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert.txt +torch~=2.1.1 diff --git a/requirements/requirements-convert-persimmon-to-gguf.txt b/requirements/requirements-convert-persimmon-to-gguf.txt new file mode 100644 index 000000000..6ac402610 --- /dev/null +++ b/requirements/requirements-convert-persimmon-to-gguf.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert.txt +torch~=2.1.1 diff --git a/requirements/requirements-convert.txt b/requirements/requirements-convert.txt new file mode 100644 index 000000000..a3d6ecec0 --- /dev/null +++ b/requirements/requirements-convert.txt @@ -0,0 +1,5 @@ +numpy~=1.24.4 +sentencepiece~=0.1.98 +transformers>=4.35.2,<5.0.0 +gguf>=0.1.0 +protobuf>=4.21.0,<5.0.0 diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh new file mode 100755 index 000000000..af7bab753 --- /dev/null +++ b/scripts/check-requirements.sh @@ -0,0 +1,174 @@ +#!/bin/bash +set -euo pipefail + +# +# check-requirements.sh checks all requirements files for each top-level +# convert*.py script. +# +# WARNING: This is quite IO intensive, because a fresh venv is set up for every +# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately +# sized tmpfs /tmp or ramdisk is recommended if running this frequently. +# +# usage: check-requirements.sh [] +# check-requirements.sh nocleanup [] +# +# where: +# - is a directory that can be used as the base for +# setting up the venvs. Defaults to `/tmp`. +# - 'nocleanup' as the first argument will disable automatic cleanup +# of the files created by this script. +# +# requires: +# - bash >= 3.2.57 +# - shellcheck +# +# For each script, it creates a fresh venv, `pip install`s the requirements, and +# finally imports the python script to check for `ImportError`. +# + +log() { + local level=$1 msg=$2 + printf >&2 '%s: %s\n' "$level" "$msg" +} + +debug() { + log DEBUG "$@" +} + +info() { + log INFO "$@" +} + +fatal() { + log FATAL "$@" + exit 1 +} + +cleanup() { + if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then + info "Removing $workdir" + local count=0 + rm -rfv -- "$workdir" | while read -r; do + if (( count++ > 750 )); then + printf . + count=0 + fi + done + printf '\n' + info "Removed $workdir" + fi +} + +do_cleanup=1 +if [[ ${1-} == nocleanup ]]; then + do_cleanup=0; shift +fi + +if (( do_cleanup )); then + trap exit INT TERM + trap cleanup EXIT +fi + +this=$(realpath -- "$0"); readonly this +cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory + +shellcheck "$this" + +readonly reqs_dir=requirements + +if [[ ${1+x} ]]; then + tmp_dir=$(realpath -- "$1") + if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then + fatal "$tmp_dir is not a writable directory" + fi +else + tmp_dir=/tmp +fi + +workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX"); readonly workdir +info "Working directory: $workdir" + +check_requirements() { + local reqs=$1 + + info "$reqs: beginning check" + pip --disable-pip-version-check install -qr "$reqs" + info "$reqs: OK" +} + +check_convert_script() { + local py=$1 # e.g. ./convert-hf-to-gguf.py + local pyname=${py##*/} # e.g. convert-hf-to-gguf.py + pyname=${pyname%.py} # e.g. convert-hf-to-gguf + + info "$py: beginning check" + + local reqs="$reqs_dir/requirements-$pyname.txt" + if [[ ! -r $reqs ]]; then + fatal "$py missing requirements. Expected: $reqs" + fi + + local venv="$workdir/$pyname-venv" + python3 -m venv "$venv" + + ( + # shellcheck source=/dev/null + source "$venv/bin/activate" + + check_requirements "$reqs" + + python - "$py" "$pyname" <<'EOF' +import sys +from importlib.machinery import SourceFileLoader +py, pyname = sys.argv[1:] +SourceFileLoader(pyname, py).load_module() +EOF + ) + + if (( do_cleanup )); then + rm -rf -- "$venv" + fi + + info "$py: imports OK" +} + +readonly ignore_eq_eq='check_requirements: ignore "=="' + +for req in "$reqs_dir"/*; do + # Check that all sub-requirements are added to top-level requirements.txt + if ! grep -qF "$req" requirements.txt; then + fatal "$req needs to be added to requirements.txt" + fi + + # Make sure exact release versions aren't being pinned in the requirements + # Filters out the ignore string + if grep -vF "$ignore_eq_eq" "$req" | grep -q '=='; then + tab=$'\t' + cat >&2 < $SRC_LLAMA/ggml-commits -git format-patch $lc --stdout -- \ - include/ggml/ggml*.h \ - src/ggml*.h \ - src/ggml*.c \ - src/ggml*.cpp \ - src/ggml*.m \ - src/ggml*.metal \ - src/ggml*.cu \ - tests/test-opt.cpp \ - tests/test-grad0.cpp \ - tests/test-quantize-fns.cpp \ - tests/test-quantize-perf.cpp \ - tests/test-backend-ops.cpp \ - > $SRC_LLAMA/ggml-src.patch +if [ ! -s $SRC_LLAMA/ggml-commits ]; then + rm -v $SRC_LLAMA/ggml-commits + echo "No new commits" + exit 0 +fi + +if [ -f $SRC_LLAMA/ggml-src.patch ]; then + rm -v $SRC_LLAMA/ggml-src.patch +fi + +while read c; do + git format-patch -k $c~1..$c --stdout -- \ + include/ggml/ggml*.h \ + src/ggml*.h \ + src/ggml*.c \ + src/ggml*.cpp \ + src/ggml*.m \ + src/ggml*.metal \ + src/ggml*.cu \ + tests/test-opt.cpp \ + tests/test-grad0.cpp \ + tests/test-quantize-fns.cpp \ + tests/test-quantize-perf.cpp \ + tests/test-backend-ops.cpp \ + >> $SRC_LLAMA/ggml-src.patch +done < $SRC_LLAMA/ggml-commits + +rm -v $SRC_LLAMA/ggml-commits # delete files if empty if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 1ec144116..5b6a440f7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -76e7f47b69e8334384dc718480c496dafbd47999 +df098ea908764cba4a4889a1cbe7b026b2d31a14 diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f3df8a8c6..b115299c0 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -350,13 +350,18 @@ struct test_case { fflush(stdout); // check if backends support op + bool supported = true; for (ggml_backend_t backend : {backend1, backend2}) { if (!ggml_backend_supports_op(backend, out)) { - printf("not supported\n"); - ggml_free(ctx); - return true; + printf("not supported [%s] ", ggml_backend_name(backend)); + supported = false; } } + if (!supported) { + printf("\n"); + ggml_free(ctx); + return true; + } // post-graph sentinel add_sentinel(ctx); @@ -1505,8 +1510,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } for (ggml_type type_a : all_types) { - for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { - // FIXME: CPU crashes on f16xf16 + for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));