Merge branch 'master' into HEAD
This commit is contained in:
commit
d24da31d2f
50 changed files with 1505 additions and 649 deletions
|
@ -14,7 +14,8 @@ ARG CUDA_DOCKER_ARCH=all
|
|||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
|
@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\
|
|||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
|
@ -5,7 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION as build
|
|||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
|
@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\
|
|||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
22
.devops/nix/apps.nix
Normal file
22
.devops/nix/apps.nix
Normal file
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
perSystem =
|
||||
{ config, lib, ... }:
|
||||
{
|
||||
apps =
|
||||
let
|
||||
inherit (config.packages) default;
|
||||
binaries = [
|
||||
"llama"
|
||||
"llama-embedding"
|
||||
"llama-server"
|
||||
"quantize"
|
||||
"train-text-from-scratch"
|
||||
];
|
||||
mkApp = name: {
|
||||
type = "app";
|
||||
program = "${default}/bin/${name}";
|
||||
};
|
||||
in
|
||||
lib.genAttrs binaries mkApp;
|
||||
};
|
||||
}
|
13
.devops/nix/devshells.nix
Normal file
13
.devops/nix/devshells.nix
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
perSystem =
|
||||
{ config, lib, ... }:
|
||||
{
|
||||
devShells =
|
||||
lib.concatMapAttrs
|
||||
(name: package: {
|
||||
${name} = package.passthru.shell;
|
||||
${name + "-extra"} = package.passthru.shell-extra;
|
||||
})
|
||||
config.packages;
|
||||
};
|
||||
}
|
32
.devops/nix/jetson-support.nix
Normal file
32
.devops/nix/jetson-support.nix
Normal file
|
@ -0,0 +1,32 @@
|
|||
{ inputs, ... }:
|
||||
{
|
||||
perSystem =
|
||||
{
|
||||
config,
|
||||
system,
|
||||
lib,
|
||||
pkgsCuda,
|
||||
...
|
||||
}:
|
||||
lib.optionalAttrs (system == "aarch64-linux") {
|
||||
packages =
|
||||
let
|
||||
caps.jetson-xavier = "7.2";
|
||||
caps.jetson-orin = "8.7";
|
||||
caps.jetson-nano = "5.3";
|
||||
|
||||
pkgsFor =
|
||||
cap:
|
||||
import inputs.nixpkgs {
|
||||
inherit system;
|
||||
config = {
|
||||
cudaSupport = true;
|
||||
cudaCapabilities = [ cap ];
|
||||
cudaEnableForwardCompat = false;
|
||||
inherit (pkgsCuda.config) allowUnfreePredicate;
|
||||
};
|
||||
};
|
||||
in
|
||||
builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps;
|
||||
};
|
||||
}
|
35
.devops/nix/nixpkgs-instances.nix
Normal file
35
.devops/nix/nixpkgs-instances.nix
Normal file
|
@ -0,0 +1,35 @@
|
|||
{ inputs, ... }:
|
||||
{
|
||||
# The _module.args definitions are passed on to modules as arguments. E.g.
|
||||
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
|
||||
# `_module.args.pkgs` (defined in this case by flake-parts).
|
||||
perSystem =
|
||||
{ system, ... }:
|
||||
{
|
||||
_module.args = {
|
||||
pkgsCuda = import inputs.nixpkgs {
|
||||
inherit system;
|
||||
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
|
||||
# and ucx are built with CUDA support)
|
||||
config.cudaSupport = true;
|
||||
config.allowUnfreePredicate =
|
||||
p:
|
||||
builtins.all
|
||||
(
|
||||
license:
|
||||
license.free
|
||||
|| builtins.elem license.shortName [
|
||||
"CUDA EULA"
|
||||
"cuDNN EULA"
|
||||
]
|
||||
)
|
||||
(p.meta.licenses or [ p.meta.license ]);
|
||||
};
|
||||
# Ensure dependencies use ROCm consistently
|
||||
pkgsRocm = import inputs.nixpkgs {
|
||||
inherit system;
|
||||
config.rocmSupport = true;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
265
.devops/nix/package.nix
Normal file
265
.devops/nix/package.nix
Normal file
|
@ -0,0 +1,265 @@
|
|||
{
|
||||
lib,
|
||||
config,
|
||||
stdenv,
|
||||
mkShell,
|
||||
cmake,
|
||||
ninja,
|
||||
pkg-config,
|
||||
git,
|
||||
python3,
|
||||
mpi,
|
||||
openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations
|
||||
cudaPackages,
|
||||
darwin,
|
||||
rocmPackages,
|
||||
clblast,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
useOpenCL
|
||||
useRocm
|
||||
],
|
||||
useCuda ? config.cudaSupport,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||
useOpenCL ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
}@inputs:
|
||||
|
||||
let
|
||||
inherit (lib)
|
||||
cmakeBool
|
||||
cmakeFeature
|
||||
optionals
|
||||
strings
|
||||
versionOlder
|
||||
;
|
||||
|
||||
# It's necessary to consistently use backendStdenv when building with CUDA support,
|
||||
# otherwise we get libstdc++ errors downstream.
|
||||
stdenv = throw "Use effectiveStdenv instead";
|
||||
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
|
||||
|
||||
suffices =
|
||||
lib.optionals useBlas [ "BLAS" ]
|
||||
++ lib.optionals useCuda [ "CUDA" ]
|
||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||
++ lib.optionals useMpi [ "MPI" ]
|
||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||
++ lib.optionals useRocm [ "ROCm" ];
|
||||
|
||||
pnameSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
|
||||
descriptionSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
", accelerated with ${strings.concatStringsSep ", " suffices}";
|
||||
|
||||
# TODO: package the Python in this repository in a Nix-like way.
|
||||
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
|
||||
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
|
||||
# https://peps.python.org/pep-0517/
|
||||
llama-python = python3.withPackages (
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
]
|
||||
);
|
||||
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra = python3.withPackages (
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
ps.torchWithoutCuda
|
||||
ps.transformers
|
||||
]
|
||||
);
|
||||
|
||||
# apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
|
||||
# separately
|
||||
darwinBuildInputs =
|
||||
with darwin.apple_sdk.frameworks;
|
||||
[
|
||||
Accelerate
|
||||
CoreVideo
|
||||
CoreGraphics
|
||||
]
|
||||
++ optionals useMetalKit [ MetalKit ];
|
||||
|
||||
cudaBuildInputs = with cudaPackages; [
|
||||
cuda_cccl.dev # <nv/target>
|
||||
|
||||
# A temporary hack for reducing the closure size, remove once cudaPackages
|
||||
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
|
||||
cuda_cudart.dev
|
||||
cuda_cudart.lib
|
||||
cuda_cudart.static
|
||||
libcublas.dev
|
||||
libcublas.lib
|
||||
libcublas.static
|
||||
];
|
||||
|
||||
rocmBuildInputs = with rocmPackages; [
|
||||
clr
|
||||
hipblas
|
||||
rocblas
|
||||
];
|
||||
in
|
||||
|
||||
effectiveStdenv.mkDerivation (
|
||||
finalAttrs: {
|
||||
pname = "llama-cpp${pnameSuffix}";
|
||||
version = llamaVersion;
|
||||
|
||||
src = lib.cleanSourceWith {
|
||||
filter =
|
||||
name: type:
|
||||
!(builtins.any (_: _) [
|
||||
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
|
||||
(name == "README.md") # Ignore *.md changes whe computing outPaths
|
||||
(lib.hasPrefix "." name) # Skip hidden files and directories
|
||||
]);
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
|
||||
# TODO: Package up each Python script or service appropriately.
|
||||
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
|
||||
# we could make those *.py into setuptools' entrypoints
|
||||
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
|
||||
'';
|
||||
|
||||
nativeBuildInputs =
|
||||
[
|
||||
cmake
|
||||
ninja
|
||||
pkg-config
|
||||
git
|
||||
]
|
||||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
# TODO: Replace with autoAddDriverRunpath
|
||||
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
|
||||
cudaPackages.autoAddOpenGLRunpathHook
|
||||
];
|
||||
|
||||
buildInputs =
|
||||
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
||||
++ optionals useCuda cudaBuildInputs
|
||||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useOpenCL [ clblast ]
|
||||
++ optionals useRocm rocmBuildInputs;
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_NATIVE" true)
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" true)
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "LLAMA_BLAS" useBlas)
|
||||
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
||||
(cmakeBool "LLAMA_CUBLAS" useCuda)
|
||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
||||
(cmakeBool "LLAMA_MPI" useMpi)
|
||||
]
|
||||
++ optionals useCuda [
|
||||
(
|
||||
with cudaPackages.flags;
|
||||
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
|
||||
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
|
||||
)
|
||||
)
|
||||
]
|
||||
++ optionals useRocm [
|
||||
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
|
||||
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
|
||||
|
||||
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||
# Should likely use `rocmPackages.clr.gpuTargets`.
|
||||
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
]
|
||||
++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
|
||||
++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
|
||||
|
||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||
# if they haven't been added yet.
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
mkdir -p $out/include
|
||||
cp $src/llama.h $out/include/
|
||||
'';
|
||||
|
||||
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||
passthru = {
|
||||
inherit
|
||||
useBlas
|
||||
useCuda
|
||||
useMetalKit
|
||||
useMpi
|
||||
useOpenCL
|
||||
useRocm
|
||||
;
|
||||
|
||||
shell = mkShell {
|
||||
name = "shell-${finalAttrs.finalPackage.name}";
|
||||
description = "contains numpy and sentencepiece";
|
||||
buildInputs = [ llama-python ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
};
|
||||
|
||||
shell-extra = mkShell {
|
||||
name = "shell-extra-${finalAttrs.finalPackage.name}";
|
||||
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
|
||||
buildInputs = [ llama-python-extra ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
};
|
||||
};
|
||||
|
||||
meta = {
|
||||
# Configurations we don't want even the CI to evaluate. Results in the
|
||||
# "unsupported platform" messages. This is mostly a no-op, because
|
||||
# cudaPackages would've refused to evaluate anyway.
|
||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||
|
||||
# Configurations that are known to result in build failures. Can be
|
||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||
|
||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||
license = lib.licenses.mit;
|
||||
|
||||
# Accommodates `nix run` and `lib.getExe`
|
||||
mainProgram = "llama";
|
||||
|
||||
# These people might respond, on the best effort basis, if you ping them
|
||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||
# Consider adding yourself to this list if you want to ensure this flake
|
||||
# stays maintained and you're willing to invest your time. Do not add
|
||||
# other people without their consent. Consider removing people after
|
||||
# they've been unreachable for long periods of time.
|
||||
|
||||
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
||||
# an attrset following the same format as in
|
||||
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
||||
maintainers = with lib.maintainers; [
|
||||
philiptaron
|
||||
SomeoneSerge
|
||||
];
|
||||
|
||||
# Extend `badPlatforms` instead
|
||||
platforms = lib.platforms.all;
|
||||
};
|
||||
}
|
||||
)
|
12
.devops/nix/scope.nix
Normal file
12
.devops/nix/scope.nix
Normal file
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
lib,
|
||||
newScope,
|
||||
llamaVersion ? "0.0.0",
|
||||
}:
|
||||
|
||||
lib.makeScope newScope (
|
||||
self: {
|
||||
inherit llamaVersion;
|
||||
llama-cpp = self.callPackage ./package.nix { };
|
||||
}
|
||||
)
|
23
.github/workflows/nix-flakestry.yml
vendored
Normal file
23
.github/workflows/nix-flakestry.yml
vendored
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Make the flake discoverable on https://flakestry.dev
|
||||
name: "Publish a flake to flakestry"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v?[0-9]+.[0-9]+.[0-9]+"
|
||||
- "v?[0-9]+.[0-9]+"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
publish-flake:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
29
.github/workflows/python-check-requirements.yml
vendored
Normal file
29
.github/workflows/python-check-requirements.yml
vendored
Normal file
|
@ -0,0 +1,29 @@
|
|||
name: Python check requirements.txt
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'scripts/check-requirements.sh'
|
||||
- 'convert*.py'
|
||||
- 'requirements.txt'
|
||||
- 'requirements/*.txt'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'scripts/check-requirements.sh'
|
||||
- 'convert*.py'
|
||||
- 'requirements.txt'
|
||||
- 'requirements/*.txt'
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-latest
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python environment
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Run check-requirements.sh script
|
||||
run: bash scripts/check-requirements.sh nocleanup
|
|
@ -103,6 +103,7 @@ as the main playground for developing new features for the [ggml](https://github
|
|||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
|
|
|
@ -65,4 +65,4 @@ endif()
|
|||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info)
|
||||
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
|
||||
|
|
|
@ -184,6 +184,8 @@ class Model:
|
|||
return QwenModel
|
||||
if model_architecture == "MixtralForCausalLM":
|
||||
return MixtralModel
|
||||
if model_architecture == "GPT2LMHeadModel":
|
||||
return GPT2Model
|
||||
if model_architecture == "PhiForCausalLM":
|
||||
return Phi2Model
|
||||
if model_architecture == "PlamoForCausalLM":
|
||||
|
@ -282,6 +284,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.QWEN
|
||||
if arch == "MixtralForCausalLM":
|
||||
return gguf.MODEL_ARCH.LLAMA
|
||||
if arch == "GPT2LMHeadModel":
|
||||
return gguf.MODEL_ARCH.GPT2
|
||||
if arch == "PhiForCausalLM":
|
||||
return gguf.MODEL_ARCH.PHI2
|
||||
if arch == "PlamoForCausalLM":
|
||||
|
@ -295,7 +299,7 @@ class Model:
|
|||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer # type: ignore[attr-defined]
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
@ -932,7 +936,7 @@ class StableLMModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -978,7 +982,7 @@ class QwenModel(Model):
|
|||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer # type: ignore[attr-defined]
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["vocab_size"]
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
@ -1073,6 +1077,68 @@ class QwenModel(Model):
|
|||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class GPT2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
# we don't need these
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
|
||||
continue
|
||||
|
||||
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
|
||||
data_torch = data_torch.transpose(1, 0)
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: GPT2 output is tied to (same as) wte in original model
|
||||
if new_name == "token_embd.weight":
|
||||
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
class Phi2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
@ -1200,57 +1266,62 @@ def parse_args() -> argparse.Namespace:
|
|||
return parser.parse_args()
|
||||
|
||||
|
||||
args = parse_args()
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
dir_model = args.model
|
||||
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
else:
|
||||
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
||||
print("Saving new weighted model ...")
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||
print(f"Saved weighted model at {tmp_model_path}.")
|
||||
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map = {
|
||||
"f32": gguf.GGMLQuantizationType.F32,
|
||||
"f16": gguf.GGMLQuantizationType.F16,
|
||||
}
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
||||
print("Saving new weighted model ...")
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||
print(f"Saved weighted model at {tmp_model_path}.")
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
|
||||
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
print(f"Loading model: {dir_model.name}")
|
||||
|
||||
ftype_map = {
|
||||
"f32": gguf.GGMLQuantizationType.F32,
|
||||
"f16": gguf.GGMLQuantizationType.F16,
|
||||
}
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
|
||||
with torch.inference_mode():
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||
|
||||
print(f"Loading model: {dir_model.name}")
|
||||
print("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
print("Set model tokenizer")
|
||||
model_instance.set_vocab()
|
||||
|
||||
with torch.inference_mode():
|
||||
model_class = Model.from_model_name(args.model_name) if args.model_name else Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||
if args.vocab_only:
|
||||
print(f"Exporting model vocab to '{fname_out}'")
|
||||
model_instance.write_vocab()
|
||||
else:
|
||||
print(f"Exporting model to '{fname_out}'")
|
||||
model_instance.write()
|
||||
|
||||
print("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
print(f"Model successfully exported to '{fname_out}'")
|
||||
|
||||
print("Set model tokenizer")
|
||||
model_instance.set_vocab()
|
||||
|
||||
if args.vocab_only:
|
||||
print(f"Exporting model vocab to '{fname_out}'")
|
||||
model_instance.write_vocab()
|
||||
else:
|
||||
print(f"Exporting model to '{fname_out}'")
|
||||
model_instance.write()
|
||||
|
||||
print(f"Model successfully exported to '{fname_out}'")
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
@ -47,95 +47,96 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
|
|||
fout.seek((fout.tell() + 31) & -32)
|
||||
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||
sys.exit(1)
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
|
||||
sys.exit(1)
|
||||
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
print(f"Error: unsupported architecture {arch_name}")
|
||||
sys.exit(1)
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
print(f"Error: unsupported architecture {arch_name}")
|
||||
sys.exit(1)
|
||||
|
||||
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
|
||||
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
|
||||
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
|
||||
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
|
||||
|
||||
with open(input_json, "r") as f:
|
||||
params = json.load(f)
|
||||
with open(input_json, "r") as f:
|
||||
params = json.load(f)
|
||||
|
||||
if params["peft_type"] != "LORA":
|
||||
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
|
||||
sys.exit(1)
|
||||
if params["peft_type"] != "LORA":
|
||||
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
|
||||
sys.exit(1)
|
||||
|
||||
if params["fan_in_fan_out"] is True:
|
||||
print("Error: param fan_in_fan_out is not supported")
|
||||
sys.exit(1)
|
||||
if params["fan_in_fan_out"] is True:
|
||||
print("Error: param fan_in_fan_out is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
if params["bias"] is not None and params["bias"] != "none":
|
||||
print("Error: param bias is not supported")
|
||||
sys.exit(1)
|
||||
if params["bias"] is not None and params["bias"] != "none":
|
||||
print("Error: param bias is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: these seem to be layers that have been trained but without lora.
|
||||
# doesn't seem widely used but eventually should be supported
|
||||
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
|
||||
print("Error: param modules_to_save is not supported")
|
||||
sys.exit(1)
|
||||
# TODO: these seem to be layers that have been trained but without lora.
|
||||
# doesn't seem widely used but eventually should be supported
|
||||
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
|
||||
print("Error: param modules_to_save is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
with open(output_path, "wb") as fout:
|
||||
fout.truncate()
|
||||
with open(output_path, "wb") as fout:
|
||||
fout.truncate()
|
||||
|
||||
write_file_header(fout, params)
|
||||
for k, v in model.items():
|
||||
orig_k = k
|
||||
if k.endswith(".default.weight"):
|
||||
k = k.replace(".default.weight", ".weight")
|
||||
if k in ["llama_proj.weight", "llama_proj.bias"]:
|
||||
continue
|
||||
if k.endswith("lora_A.weight"):
|
||||
if v.dtype != torch.float16 and v.dtype != torch.float32:
|
||||
write_file_header(fout, params)
|
||||
for k, v in model.items():
|
||||
orig_k = k
|
||||
if k.endswith(".default.weight"):
|
||||
k = k.replace(".default.weight", ".weight")
|
||||
if k in ["llama_proj.weight", "llama_proj.bias"]:
|
||||
continue
|
||||
if k.endswith("lora_A.weight"):
|
||||
if v.dtype != torch.float16 and v.dtype != torch.float32:
|
||||
v = v.float()
|
||||
v = v.T
|
||||
else:
|
||||
v = v.float()
|
||||
v = v.T
|
||||
else:
|
||||
v = v.float()
|
||||
|
||||
t = v.detach().numpy()
|
||||
t = v.detach().numpy()
|
||||
|
||||
prefix = "base_model.model."
|
||||
if k.startswith(prefix):
|
||||
k = k[len(prefix) :]
|
||||
prefix = "base_model.model."
|
||||
if k.startswith(prefix):
|
||||
k = k[len(prefix) :]
|
||||
|
||||
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
|
||||
if k.endswith(lora_suffixes):
|
||||
suffix = k[-len(lora_suffixes[0]):]
|
||||
k = k[: -len(lora_suffixes[0])]
|
||||
else:
|
||||
print(f"Error: unrecognized tensor name {orig_k}")
|
||||
sys.exit(1)
|
||||
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
|
||||
if k.endswith(lora_suffixes):
|
||||
suffix = k[-len(lora_suffixes[0]):]
|
||||
k = k[: -len(lora_suffixes[0])]
|
||||
else:
|
||||
print(f"Error: unrecognized tensor name {orig_k}")
|
||||
sys.exit(1)
|
||||
|
||||
tname = name_map.get_name(k)
|
||||
if tname is None:
|
||||
print(f"Error: could not map tensor name {orig_k}")
|
||||
print(" Note: the arch parameter must be specified if the model is not llama")
|
||||
sys.exit(1)
|
||||
tname = name_map.get_name(k)
|
||||
if tname is None:
|
||||
print(f"Error: could not map tensor name {orig_k}")
|
||||
print(" Note: the arch parameter must be specified if the model is not llama")
|
||||
sys.exit(1)
|
||||
|
||||
if suffix == ".lora_A.weight":
|
||||
tname += ".weight.loraA"
|
||||
elif suffix == ".lora_B.weight":
|
||||
tname += ".weight.loraB"
|
||||
else:
|
||||
assert False
|
||||
if suffix == ".lora_A.weight":
|
||||
tname += ".weight.loraA"
|
||||
elif suffix == ".lora_B.weight":
|
||||
tname += ".weight.loraB"
|
||||
else:
|
||||
assert False
|
||||
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
|
||||
print(f"Converted {input_json} and {input_model} to {output_path}")
|
||||
print(f"Converted {input_json} and {input_model} to {output_path}")
|
||||
|
|
1
convert-persimmon-to-gguf.py
Normal file → Executable file
1
convert-persimmon-to-gguf.py
Normal file → Executable file
|
@ -1,3 +1,4 @@
|
|||
#!/usr/bin/env python3
|
||||
import torch
|
||||
import os
|
||||
from pprint import pprint
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import Foundation
|
||||
|
||||
// To use this in your own project, add llama.cpp as a swift package dependency
|
||||
// and uncomment this import line.
|
||||
// import llama
|
||||
|
||||
enum LlamaError: Error {
|
||||
|
|
|
@ -4,6 +4,7 @@ import Foundation
|
|||
class LlamaState: ObservableObject {
|
||||
@Published var messageLog = ""
|
||||
@Published var cacheCleared = false
|
||||
let NS_PER_S = 1_000_000_000.0
|
||||
|
||||
private var llamaContext: LlamaContext?
|
||||
private var defaultModelUrl: URL? {
|
||||
|
@ -20,12 +21,12 @@ class LlamaState: ObservableObject {
|
|||
}
|
||||
|
||||
func loadModel(modelUrl: URL?) throws {
|
||||
messageLog += "Loading model...\n"
|
||||
if let modelUrl {
|
||||
messageLog += "Loading model...\n"
|
||||
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
|
||||
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||
} else {
|
||||
messageLog += "Could not locate model\n"
|
||||
messageLog += "Load a model from the list below\n"
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -34,15 +35,29 @@ class LlamaState: ObservableObject {
|
|||
return
|
||||
}
|
||||
|
||||
let t_start = DispatchTime.now().uptimeNanoseconds
|
||||
await llamaContext.completion_init(text: text)
|
||||
let t_heat_end = DispatchTime.now().uptimeNanoseconds
|
||||
let t_heat = Double(t_heat_end - t_start) / NS_PER_S
|
||||
|
||||
messageLog += "\(text)"
|
||||
|
||||
while await llamaContext.n_cur <= llamaContext.n_len {
|
||||
while await llamaContext.n_cur < llamaContext.n_len {
|
||||
let result = await llamaContext.completion_loop()
|
||||
messageLog += "\(result)"
|
||||
}
|
||||
|
||||
let t_end = DispatchTime.now().uptimeNanoseconds
|
||||
let t_generation = Double(t_end - t_heat_end) / NS_PER_S
|
||||
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
|
||||
|
||||
await llamaContext.clear()
|
||||
messageLog += "\n\ndone\n"
|
||||
messageLog += """
|
||||
\n
|
||||
Done
|
||||
Heat up took \(t_heat)s
|
||||
Generated \(tokens_per_second) t/s\n
|
||||
"""
|
||||
}
|
||||
|
||||
func bench() async {
|
||||
|
@ -56,10 +71,10 @@ class LlamaState: ObservableObject {
|
|||
messageLog += await llamaContext.model_info() + "\n"
|
||||
|
||||
let t_start = DispatchTime.now().uptimeNanoseconds
|
||||
await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
|
||||
let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
|
||||
let t_end = DispatchTime.now().uptimeNanoseconds
|
||||
|
||||
let t_heat = Double(t_end - t_start) / 1_000_000_000.0
|
||||
let t_heat = Double(t_end - t_start) / NS_PER_S
|
||||
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
|
||||
|
||||
// if more than 5 seconds, then we're probably running on a slow device
|
||||
|
|
|
@ -42,46 +42,27 @@ struct ContentView: View {
|
|||
Button("Send") {
|
||||
sendText()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Bench") {
|
||||
bench()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Clear") {
|
||||
clear()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Copy") {
|
||||
UIPasteboard.general.string = llamaState.messageLog
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
}
|
||||
}.buttonStyle(.bordered)
|
||||
|
||||
VStack {
|
||||
VStack(alignment: .leading) {
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.padding(.top, 4)
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -89,7 +70,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -97,8 +77,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-f16.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -106,7 +84,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
|
||||
filename: "phi-2-q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -114,8 +91,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
|
||||
filename: "phi-2-q8_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -123,15 +98,15 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
|
||||
filename: "mistral-7b-v0.1.Q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
Button("Clear downloaded models") {
|
||||
ContentView.cleanupModelCaches()
|
||||
llamaState.cacheCleared = true
|
||||
}
|
||||
.padding(8)
|
||||
.font(.system(size: 12))
|
||||
}
|
||||
.padding(.top, 4)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
}
|
||||
.padding()
|
||||
}
|
||||
|
|
|
@ -93,7 +93,7 @@ struct DownloadButton: View {
|
|||
print("Error: \(err.localizedDescription)")
|
||||
}
|
||||
}) {
|
||||
Text("\(modelName) (Downloaded)")
|
||||
Text("Load \(modelName)")
|
||||
}
|
||||
} else {
|
||||
Text("Unknown status")
|
||||
|
|
|
@ -24,7 +24,8 @@ endif()
|
|||
|
||||
if (NOT MSVC)
|
||||
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(llava BUILD_INFO)
|
||||
endif()
|
||||
|
@ -32,5 +33,5 @@ endif()
|
|||
set(TARGET llava-cli)
|
||||
add_executable(llava-cli llava-cli.cpp)
|
||||
install(TARGETS llava-cli RUNTIME)
|
||||
target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(llava PRIVATE cxx_std_11)
|
||||
|
|
|
@ -16,12 +16,19 @@
|
|||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
#define CLIP_DEBUG
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
|
@ -196,20 +203,6 @@ struct clip_vision_model {
|
|||
struct ggml_tensor * mm_2_b;
|
||||
};
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct clip_buffer {
|
||||
uint8_t * data = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] data;
|
||||
data = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~clip_buffer() { delete[] data; }
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
|
@ -223,9 +216,10 @@ struct clip_ctx {
|
|||
struct gguf_context * ctx_gguf;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
clip_buffer buf_compute;
|
||||
clip_buffer buf_alloc;
|
||||
ggml_allocr * alloc = NULL;
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
ggml_backend_buffer_t compute_buffer = NULL;
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_allocr * compute_alloc = NULL;
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
|
@ -252,25 +246,20 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
if(ctx->has_llava_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
const auto & buf_compute = ctx->buf_compute;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, inp_raw);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, inp_raw);
|
||||
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
float * data = (float *)ggml_get_data(inp_raw);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
float * data = (float *)malloc(ggml_nbytes(inp_raw));
|
||||
|
||||
for (size_t i = 0; i < imgs->size; i++) {
|
||||
const int nx = imgs->data[i].nx;
|
||||
|
@ -289,6 +278,8 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
}
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||
free(data);
|
||||
}
|
||||
|
||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
@ -298,36 +289,39 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, embeddings);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
ggml_set_zero(embeddings);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, embeddings);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||
free(zero_mem);
|
||||
}
|
||||
|
||||
struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, temp);
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
|
||||
embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
|
||||
embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings =
|
||||
ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_allocr_alloc(ctx->alloc, positions);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
ggml_allocr_alloc(ctx->compute_alloc, positions);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
ggml_set_i32_1d(positions, i, i);
|
||||
positions_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
|
||||
embeddings =
|
||||
ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings));
|
||||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
// pre-layernorm
|
||||
{
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings),
|
||||
ggml_repeat(ctx0, model.pre_ln_b, embeddings));
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
|
@ -340,15 +334,15 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
|
||||
model.layers[il].ln_1_b);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
||||
|
||||
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
|
||||
|
@ -356,14 +350,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
|
||||
|
||||
struct ggml_tensor * K =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
||||
|
||||
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
||||
|
||||
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
|
@ -379,7 +373,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
}
|
||||
|
||||
// attention output
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
|
@ -390,12 +384,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
||||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
||||
|
||||
if (ctx->use_gelu) {
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
|
@ -404,7 +397,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
|
@ -417,23 +410,26 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
||||
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
||||
ggml_allocr_alloc(ctx->alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
for (int i = 0; i < num_patches; ++i) {
|
||||
ggml_set_i32_1d(patches, i, i+1);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
patches_data[i] = i + 1;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
}
|
||||
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// mm projection 0
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
|
@ -446,7 +442,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
struct ggml_context * meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
|
@ -479,7 +474,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
// kv
|
||||
if (verbosity >= 3) {
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
@ -493,27 +488,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
}
|
||||
|
||||
// data
|
||||
size_t ctx_size = 0;
|
||||
size_t buffer_size = 0;
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
||||
ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
||||
size_t tensor_size = ggml_nbytes(cur);
|
||||
size_t padded_size = ggml_nbytes_pad(cur);
|
||||
ctx_size += padded_size;
|
||||
buffer_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buffer_size += n_tensors * 128 /* CLIP PADDING */;
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_clip->backend = ggml_backend_metal_init();
|
||||
printf("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
printf("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
|
@ -539,17 +545,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0));
|
||||
printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0);
|
||||
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors);
|
||||
|
||||
// load tensors
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
new_clip->ctx = ggml_init(params);
|
||||
|
@ -566,13 +575,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
// add tensors to context
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size);
|
||||
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name);
|
||||
ggml_allocr_alloc(alloc, cur);
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
|
@ -580,10 +597,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
clip_free(new_clip);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(cur->data), ggml_nbytes(t));
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_allocr_free(alloc);
|
||||
fin.close();
|
||||
}
|
||||
|
||||
|
@ -657,18 +682,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
|
||||
new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment;
|
||||
ggml_allocr_free(new_clip->alloc);
|
||||
new_clip->buf_alloc.resize(alloc_size);
|
||||
new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment);
|
||||
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf);
|
||||
ggml_allocr_free(new_clip->compute_alloc);
|
||||
new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
|
||||
new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
|
||||
|
||||
printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0);
|
||||
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
|
@ -852,29 +875,29 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
|
|||
}
|
||||
|
||||
// reset alloc buffer to clean the memory from previous invocations
|
||||
ggml_allocr_reset(ctx->alloc);
|
||||
ggml_allocr_reset(ctx->compute_alloc);
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_allocr_alloc_graph(ctx->alloc, gf);
|
||||
ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
|
||||
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = (uint8_t *)malloc(plan.work_size);
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
|
||||
ggml_graph_compute(gf, &plan);
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(ctx->backend)) {
|
||||
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1045,8 +1068,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
gguf_free(ctx_out);
|
||||
|
||||
{
|
||||
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
|
||||
int64_t sum_all = 0;
|
||||
for (size_t i = 0; i < hist_all.size(); ++i) {
|
||||
|
|
|
@ -39,73 +39,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
|||
return true;
|
||||
}
|
||||
|
||||
// TODO: use common/sampling.h
|
||||
static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
|
||||
// const float repeat_penalty = sparams.repeat_penalty;
|
||||
// const float alpha_presence = sparams.presence_penalty;
|
||||
// const float alpha_frequency = sparams.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = sparams.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx_llama);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx_llama, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temp(ctx_llama, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
|
||||
llama_sample_temp(ctx_llama, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx_llama, &candidates_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
|
||||
int id = sample_id(ctx_llama, params);
|
||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
|
||||
ret = "</s>";
|
||||
|
@ -174,8 +112,8 @@ struct llava_context {
|
|||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
|
||||
|
@ -185,7 +123,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
printf("using base64 encoded image instead of command line image path\n");
|
||||
fprintf(stderr, "using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
||||
if (!embed) {
|
||||
|
@ -217,16 +155,19 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
|
||||
// generate the response
|
||||
|
||||
printf("\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
|
||||
printf("%s", tmp);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
|
|
@ -7,28 +7,13 @@ find_package(Llama 0.0.1 REQUIRED)
|
|||
# Bake common functionality in with target. Because applications
|
||||
# using the relocatable Llama package should be outside of the
|
||||
# source tree, main-cmake-pkg pretends the dependencies are built-in.
|
||||
|
||||
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
|
||||
add_library(common OBJECT
|
||||
${_common_path}/common.h
|
||||
${_common_path}/common.cpp
|
||||
${_common_path}/console.h
|
||||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
# be available after the user has built the llama.cpp sources.
|
||||
#
|
||||
configure_file(${_common_path}/../build-info.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
|
||||
COPYONLY)
|
||||
|
||||
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
add_library(common OBJECT)
|
||||
file(GLOB _common_files
|
||||
"${_common_path}/*.h"
|
||||
"${_common_path}/*.cpp"
|
||||
)
|
||||
target_sources(common PRIVATE ${_common_files})
|
||||
|
||||
# If the common project was part of "main-cmake-pkg" the transient
|
||||
# defines would automatically be attached. Because the common func-
|
||||
|
|
|
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
|||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
|
|
@ -166,7 +166,7 @@ node index.js
|
|||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
*Result JSON:*
|
||||
|
||||
|
@ -224,6 +224,8 @@ node index.js
|
|||
|
||||
`content`: Set the text to process.
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
|
||||
*Options:*
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
|
@ -441,7 +442,6 @@ struct llama_client_slot
|
|||
}
|
||||
|
||||
images.clear();
|
||||
// llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
|
||||
}
|
||||
|
||||
bool has_budget(gpt_params &global_params) {
|
||||
|
@ -542,7 +542,9 @@ struct llama_server_context
|
|||
std::vector<task_result> queue_results;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||
std::condition_variable condition_tasks;
|
||||
std::mutex mutex_results;
|
||||
std::condition_variable condition_results;
|
||||
|
||||
~llama_server_context()
|
||||
{
|
||||
|
@ -921,6 +923,7 @@ struct llama_server_context
|
|||
llama_sampling_free(slot->ctx_sampling);
|
||||
}
|
||||
slot->ctx_sampling = llama_sampling_init(slot->sparams);
|
||||
llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->command = LOAD_PROMPT;
|
||||
|
||||
all_slots_are_idle = false;
|
||||
|
@ -1169,7 +1172,7 @@ struct llama_server_context
|
|||
|
||||
void send_error(task_server& task, std::string error)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
|
@ -1177,6 +1180,7 @@ struct llama_server_context
|
|||
res.error = true;
|
||||
res.result_json = { { "content", error } };
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
void add_multi_task(int id, std::vector<int>& sub_ids)
|
||||
|
@ -1186,6 +1190,7 @@ struct llama_server_context
|
|||
multi.id = id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
|
||||
|
@ -1197,6 +1202,7 @@ struct llama_server_context
|
|||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1215,7 +1221,7 @@ struct llama_server_context
|
|||
{"n_ctx", slot.n_ctx},
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.params.seed},
|
||||
{"temp", slot.sparams.temp},
|
||||
{"temperature", slot.sparams.temp},
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
|
@ -1244,7 +1250,7 @@ struct llama_server_context
|
|||
|
||||
void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
|
@ -1280,11 +1286,12 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
void send_final_response(llama_client_slot &slot)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
|
@ -1340,11 +1347,12 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
void send_embedding(llama_client_slot &slot)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
|
@ -1372,6 +1380,7 @@ struct llama_server_context
|
|||
};
|
||||
}
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
int request_completion(json data, bool infill, bool embedding, int multitask_id)
|
||||
|
@ -1395,6 +1404,7 @@ struct llama_server_context
|
|||
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
queue_tasks.push_back(task);
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
}
|
||||
|
||||
|
@ -1402,13 +1412,10 @@ struct llama_server_context
|
|||
{
|
||||
while (true)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(5));
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
|
||||
if (queue_results.empty())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
|
@ -1504,12 +1511,13 @@ struct llama_server_context
|
|||
|
||||
void request_cancel(int task_id)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
task_server task;
|
||||
task.id = id_gen++;
|
||||
task.type = CANCEL_TASK;
|
||||
task.target_id = task_id;
|
||||
queue_tasks.push_back(task);
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
int split_multiprompt_task(task_server& multiprompt_task)
|
||||
|
@ -1535,7 +1543,7 @@ struct llama_server_context
|
|||
|
||||
void process_tasks()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
while (!queue_tasks.empty())
|
||||
{
|
||||
task_server task = queue_tasks.front();
|
||||
|
@ -1607,6 +1615,7 @@ struct llama_server_context
|
|||
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
queue_results.push_back(aggregate_result);
|
||||
condition_results.notify_all();
|
||||
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
|
@ -1637,8 +1646,10 @@ struct llama_server_context
|
|||
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
|
||||
kv_cache_clear();
|
||||
}
|
||||
// avoid 100% usage of cpu all time
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5));
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !queue_tasks.empty();
|
||||
});
|
||||
}
|
||||
|
||||
for (llama_client_slot &slot : slots)
|
||||
|
@ -2437,26 +2448,33 @@ json oaicompat_completion_params_parse(
|
|||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
//
|
||||
// For parameters that are defined by the OpenAI documentation (e.g.
|
||||
// temperature), we explicitly specify OpenAI's intended default; we
|
||||
// need to do that because sometimes OpenAI disagrees with llama.cpp
|
||||
//
|
||||
// https://platform.openai.com/docs/api-reference/chat/create
|
||||
llama_sampling_params default_sparams;
|
||||
llama_params["model"] = json_value(body, "model", std::string("uknown"));
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||
llama_params["top_k"] = json_value(body, "top_k", 40);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 0.95);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
||||
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
||||
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
||||
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||
llama_params["seed"] = json_value(body, "seed", 0);
|
||||
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
||||
llama_params["stream"] = json_value(body, "stream", false);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", false);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||
|
||||
if (body.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
|
@ -3070,7 +3088,17 @@ int main(int argc, char **argv)
|
|||
{
|
||||
prompt = "";
|
||||
}
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
|
||||
|
||||
json image_data;
|
||||
if (body.count("image_data") != 0) {
|
||||
image_data = body["image_data"];
|
||||
}
|
||||
else
|
||||
{
|
||||
image_data = "";
|
||||
}
|
||||
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
|
||||
task_result result = llama.next_result(task_id);
|
||||
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
|
55
flake.lock
generated
55
flake.lock
generated
|
@ -1,30 +1,30 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1694529238,
|
||||
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||
"lastModified": 1701473968,
|
||||
"narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1698318101,
|
||||
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
|
||||
"lastModified": 1703559957,
|
||||
"narHash": "sha256-x9PUuMEPGUOMB51zNxrDr2QoHbYWlCS2xhFedm9MC5Q=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
|
||||
"rev": "75dd68c36f458c6593c5bbb48abfd3e59bfed380",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -34,26 +34,29 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"dir": "lib",
|
||||
"lastModified": 1701253981,
|
||||
"narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"dir": "lib",
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
226
flake.nix
226
flake.nix
|
@ -1,139 +1,99 @@
|
|||
{
|
||||
description = "Port of Facebook's LLaMA model in C/C++";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||
};
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
meta.mainProgram = "llama";
|
||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||
buildInputs = with pkgs; [ openmpi ];
|
||||
osSpecific = with pkgs; buildInputs ++ (
|
||||
if isAarch64 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
]
|
||||
else if isAarch32 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else if isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else
|
||||
with pkgs; [ openblas ]
|
||||
);
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
|
||||
cudatoolkit_joined = with pkgs; symlinkJoin {
|
||||
# HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
|
||||
# see https://github.com/NixOS/nixpkgs/issues/224291
|
||||
# copied from jaxlib
|
||||
name = "${cudaPackages.cudatoolkit.name}-merged";
|
||||
paths = [
|
||||
cudaPackages.cudatoolkit.lib
|
||||
cudaPackages.cudatoolkit.out
|
||||
] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
|
||||
# for some reason some of the required libs are in the targets/x86_64-linux
|
||||
# directory; not sure why but this works around it
|
||||
"${cudaPackages.cudatoolkit}/targets/${system}"
|
||||
];
|
||||
};
|
||||
llama-python =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
|
||||
'';
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
mkdir -p $out/include
|
||||
cp ${src}/llama.h $out/include/
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
in
|
||||
|
||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||
#
|
||||
# ```bash
|
||||
# ❯ nix repl
|
||||
# nix-repl> :lf github:ggerganov/llama.cpp
|
||||
# Added 13 variables.
|
||||
# nix-repl> outputs.apps.x86_64-linux.quantize
|
||||
# { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; }
|
||||
# ```
|
||||
outputs =
|
||||
{ self, flake-parts, ... }@inputs:
|
||||
let
|
||||
# We could include the git revisions in the package names but those would
|
||||
# needlessly trigger rebuilds:
|
||||
# llamaVersion = self.dirtyShortRev or self.shortRev;
|
||||
|
||||
# Nix already uses cryptographic hashes for versioning, so we'll just fix
|
||||
# the fake semver for now:
|
||||
llamaVersion = "0.0.0";
|
||||
in
|
||||
flake-parts.lib.mkFlake { inherit inputs; }
|
||||
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = cmakeFlags
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
] else [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||
]);
|
||||
};
|
||||
packages.opencl = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ clblast ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_CLBLAST=ON"
|
||||
];
|
||||
};
|
||||
packages.cuda = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_CUBLAS=ON"
|
||||
];
|
||||
};
|
||||
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_HIPBLAS=1"
|
||||
"-DCMAKE_C_COMPILER=hipcc"
|
||||
"-DCMAKE_CXX_COMPILER=hipcc"
|
||||
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
];
|
||||
};
|
||||
apps.llama-server = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/llama-server";
|
||||
};
|
||||
apps.llama-embedding = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/embedding";
|
||||
};
|
||||
apps.llama = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/llama";
|
||||
};
|
||||
apps.quantize = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/quantize";
|
||||
};
|
||||
apps.train-text-from-scratch = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
|
||||
};
|
||||
apps.default = self.apps.${system}.llama;
|
||||
devShells.default = pkgs.mkShell {
|
||||
buildInputs = [ llama-python ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
devShells.extra = pkgs.mkShell {
|
||||
buildInputs = [ llama-python-extra ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
});
|
||||
|
||||
imports = [
|
||||
.devops/nix/nixpkgs-instances.nix
|
||||
.devops/nix/apps.nix
|
||||
.devops/nix/devshells.nix
|
||||
.devops/nix/jetson-support.nix
|
||||
];
|
||||
|
||||
# An overlay can be used to have a more granular control over llama-cpp's
|
||||
# dependencies and configuration, than that offered by the `.override`
|
||||
# mechanism. Cf. https://nixos.org/manual/nixpkgs/stable/#chap-overlays.
|
||||
#
|
||||
# E.g. in a flake:
|
||||
# ```
|
||||
# { nixpkgs, llama-cpp, ... }:
|
||||
# let pkgs = import nixpkgs {
|
||||
# overlays = [ (llama-cpp.overlays.default) ];
|
||||
# system = "aarch64-linux";
|
||||
# config.allowUnfree = true;
|
||||
# config.cudaSupport = true;
|
||||
# config.cudaCapabilities = [ "7.2" ];
|
||||
# config.cudaEnableForwardCompat = false;
|
||||
# }; in {
|
||||
# packages.aarch64-linux.llamaJetsonXavier = pkgs.llamaPackages.llama-cpp;
|
||||
# }
|
||||
# ```
|
||||
#
|
||||
# Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format
|
||||
flake.overlays.default =
|
||||
(final: prev: {
|
||||
llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
inherit (final.llamaPackages) llama-cpp;
|
||||
});
|
||||
|
||||
systems = [
|
||||
"aarch64-darwin"
|
||||
"aarch64-linux"
|
||||
"x86_64-darwin" # x86_64-darwin isn't tested (and likely isn't relevant)
|
||||
"x86_64-linux"
|
||||
];
|
||||
|
||||
perSystem =
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
pkgsCuda,
|
||||
pkgsRocm,
|
||||
...
|
||||
}:
|
||||
{
|
||||
# We don't use the overlay here so as to avoid making too many instances of nixpkgs,
|
||||
# cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
packages =
|
||||
{
|
||||
default = (pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
}
|
||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||
opencl = config.packages.default.override { useOpenCL = true; };
|
||||
cuda = (pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
rocm = (pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
|
||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||
mpi-cuda = config.packages.default.override { useMpi = true; };
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
|
@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
|||
}
|
||||
|
||||
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return true;
|
||||
switch (op->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(op);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i cpu_backend_i = {
|
||||
|
|
91
ggml-cuda.cu
91
ggml-cuda.cu
|
@ -6662,7 +6662,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
|
|||
// pool with virtual memory
|
||||
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
|
||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||
|
@ -7485,6 +7485,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
|
|||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
||||
#ifdef GGML_CUDA_F16
|
||||
cuda_pool_alloc<half> src1_dfloat_a;
|
||||
|
@ -7577,6 +7579,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
const int compute_capability = g_device_caps[id].cc;
|
||||
|
||||
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
//printf("this branch\n");
|
||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||
cuda_pool_alloc<half> src0_as_f16;
|
||||
if (src0->type != GGML_TYPE_F16) {
|
||||
|
@ -7614,9 +7617,9 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cuda_pool_alloc<float> src0_ddq_as_f32;
|
||||
cuda_pool_alloc<float> src1_ddq_as_f32;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) {
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
||||
|
@ -7624,7 +7627,15 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
src0_ddq_as_f32.alloc(row_diff*ne00);
|
||||
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
||||
}
|
||||
if (src1->type != GGML_TYPE_F32) {
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
|
||||
GGML_ASSERT(to_fp32_cuda != nullptr);
|
||||
src1_ddq_as_f32.alloc(src1_ncols*ne10);
|
||||
to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
|
||||
}
|
||||
|
||||
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
||||
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
|
@ -7633,9 +7644,9 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
CUBLAS_CHECK(
|
||||
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
row_diff, src1_ncols, ne10,
|
||||
&alpha, src0_ddf_i, ne00,
|
||||
src1_ddf_i, ne10,
|
||||
&beta, dst_dd_i, ldc));
|
||||
&alpha, src0_ddf_i, ne00,
|
||||
src1_ddf1_i, ne10,
|
||||
&beta, dst_dd_i, ldc));
|
||||
}
|
||||
|
||||
(void) dst;
|
||||
|
@ -8035,6 +8046,7 @@ static void ggml_cuda_op_mul_mat(
|
|||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
||||
|
||||
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
||||
|
||||
|
@ -8481,9 +8493,9 @@ static __global__ void k_compute_batched_ptrs(
|
|||
int64_t i03 = i13 / r3;
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
@ -8492,28 +8504,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
||||
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
||||
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
||||
|
||||
const int64_t ne1 = ggml_nelements(src1);
|
||||
const int64_t ne = ggml_nelements(dst);
|
||||
const int64_t ne_dst = ggml_nelements(dst);
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
@ -8522,7 +8516,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||
half * src0_as_f16 = (half *) src0_ddq;
|
||||
half * src0_f16 = (half *) src0_ddq;
|
||||
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||
|
@ -8531,11 +8525,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||
|
||||
// convert src1 to fp16
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
|
||||
cuda_pool_alloc<half> src1_as_f16(ne1);
|
||||
to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
|
||||
cuda_pool_alloc<half> src1_f16_alloc;
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
||||
}
|
||||
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
|
||||
|
||||
cuda_pool_alloc<half> dst_f16;
|
||||
char * dst_t;
|
||||
|
@ -8557,7 +8555,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
const void * beta = &beta_f16;
|
||||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
dst_t = (char *) dst_f16.alloc(ne);
|
||||
dst_t = (char *) dst_f16.alloc(ne_dst);
|
||||
|
||||
nbd2 /= sizeof(float) / sizeof(half);
|
||||
nbd3 /= sizeof(float) / sizeof(half);
|
||||
|
@ -8604,9 +8602,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||
(const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||
beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
||||
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
|
||||
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
|
||||
ne12*ne13,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
@ -8619,12 +8617,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
dim3 block_dims(ne13, ne12);
|
||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||
src0_as_f16, src1_as_f16.get(), dst_t,
|
||||
src0_f16, src1_f16, dst_t,
|
||||
ptrs_src.get(), ptrs_dst.get(),
|
||||
ne12, ne13,
|
||||
ne23,
|
||||
nb02, nb03,
|
||||
nb12, nb13,
|
||||
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
|
||||
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
|
||||
nbd2, nbd3,
|
||||
r2, r3);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
@ -8632,8 +8631,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
|
||||
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
|
||||
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
||||
ne23,
|
||||
cu_compute_type,
|
||||
|
@ -8643,7 +8642,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
|
||||
to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8682,13 +8681,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|||
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
|
||||
#ifdef GGML_CUDA_FORCE_DMMV
|
||||
const bool use_mul_mat_vec_q = false;
|
||||
#else
|
||||
|
|
|
@ -6,19 +6,19 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
GGML_API void ggml_cl_init(void);
|
||||
|
||||
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
void * ggml_cl_host_malloc(size_t size);
|
||||
void ggml_cl_host_free(void * ptr);
|
||||
GGML_API void * ggml_cl_host_malloc(size_t size);
|
||||
GGML_API void ggml_cl_host_free(void * ptr);
|
||||
|
||||
void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
|
||||
void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
2
ggml.c
2
ggml.c
|
@ -9687,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
||||
assert(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
|
|
|
@ -370,7 +370,16 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GPT2: [
|
||||
# TODO
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PHI2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
|
@ -17,6 +17,7 @@ class TensorNameMap:
|
|||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
),
|
||||
|
||||
|
@ -34,6 +35,7 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
"wpe", # gpt2
|
||||
),
|
||||
|
||||
# Output
|
||||
|
@ -53,7 +55,7 @@ class TensorNameMap:
|
|||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact bloom qwen
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
"language_model.encoder.final_layernorm", # persimmon
|
||||
"lm_head.ln", # phi2
|
||||
),
|
||||
|
@ -78,6 +80,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
),
|
||||
|
@ -95,6 +98,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
"h.{bid}.self_attention.query_key_value", # bloom
|
||||
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
),
|
||||
|
||||
|
@ -137,6 +141,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
),
|
||||
|
@ -159,6 +164,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
|
@ -179,6 +185,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
"h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
),
|
||||
|
@ -218,6 +225,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
"h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
),
|
||||
|
|
206
llama.cpp
206
llama.cpp
|
@ -424,6 +424,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
LLM_ARCH_GPT2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -1257,6 +1266,10 @@ enum e_model {
|
|||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
};
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
|
@ -2553,18 +2566,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
|
||||
static const char * llama_model_type_name(e_model type) {
|
||||
switch (type) {
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
default: return "?B";
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2783,6 +2800,17 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 12: model.type = e_model::MODEL_SMALL; break;
|
||||
case 24: model.type = e_model::MODEL_MEDIUM; break;
|
||||
case 36: model.type = e_model::MODEL_LARGE; break;
|
||||
case 48: model.type = e_model::MODEL_XL; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
||||
default: (void)0;
|
||||
}
|
||||
|
@ -3716,6 +3744,60 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
backend_norm = llama_backend_offload;
|
||||
backend_output = llama_backend_offload_split;
|
||||
} else {
|
||||
backend_norm = GGML_BACKEND_CPU;
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
@ -5759,6 +5841,102 @@ struct llm_build_context {
|
|||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gpt2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
cb(pos, "pos_embd", -1);
|
||||
|
||||
inpL = ggml_add(ctx0, inpL, pos);
|
||||
cb(inpL, "inpL", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
// add the input
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// FF
|
||||
{
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.output_norm,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
|
@ -6274,6 +6452,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_plamo();
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
result = llm.build_gpt2();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
|
BIN
models/ggml-vocab-gpt2.gguf
Normal file
BIN
models/ggml-vocab-gpt2.gguf
Normal file
Binary file not shown.
|
@ -1,3 +0,0 @@
|
|||
-r requirements.txt
|
||||
torch==2.1.1
|
||||
transformers==4.35.2
|
|
@ -1,5 +1,12 @@
|
|||
numpy==1.24.4
|
||||
sentencepiece==0.1.98
|
||||
transformers>=4.34.0
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0
|
||||
# These requirements include all dependencies for all top-level python scripts
|
||||
# for llama.cpp. Avoid adding packages here directly.
|
||||
#
|
||||
# Package versions must stay compatible across all top-level python scripts.
|
||||
#
|
||||
|
||||
-r ./requirements/requirements-convert.txt
|
||||
|
||||
-r ./requirements/requirements-convert-hf-to-gguf.txt
|
||||
-r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
|
||||
-r ./requirements/requirements-convert-lora-to-ggml.txt
|
||||
-r ./requirements/requirements-convert-persimmon-to-gguf.txt
|
||||
|
|
2
requirements/requirements-convert-hf-to-gguf.txt
Normal file
2
requirements/requirements-convert-hf-to-gguf.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert.txt
|
||||
torch~=2.1.1
|
1
requirements/requirements-convert-llama-ggml-to-gguf.txt
Normal file
1
requirements/requirements-convert-llama-ggml-to-gguf.txt
Normal file
|
@ -0,0 +1 @@
|
|||
-r ./requirements-convert.txt
|
2
requirements/requirements-convert-lora-to-ggml.txt
Normal file
2
requirements/requirements-convert-lora-to-ggml.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert.txt
|
||||
torch~=2.1.1
|
2
requirements/requirements-convert-persimmon-to-gguf.txt
Normal file
2
requirements/requirements-convert-persimmon-to-gguf.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert.txt
|
||||
torch~=2.1.1
|
5
requirements/requirements-convert.txt
Normal file
5
requirements/requirements-convert.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
numpy~=1.24.4
|
||||
sentencepiece~=0.1.98
|
||||
transformers>=4.35.2,<5.0.0
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0,<5.0.0
|
174
scripts/check-requirements.sh
Executable file
174
scripts/check-requirements.sh
Executable file
|
@ -0,0 +1,174 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
#
|
||||
# check-requirements.sh checks all requirements files for each top-level
|
||||
# convert*.py script.
|
||||
#
|
||||
# WARNING: This is quite IO intensive, because a fresh venv is set up for every
|
||||
# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately
|
||||
# sized tmpfs /tmp or ramdisk is recommended if running this frequently.
|
||||
#
|
||||
# usage: check-requirements.sh [<working_dir>]
|
||||
# check-requirements.sh nocleanup [<working_dir>]
|
||||
#
|
||||
# where:
|
||||
# - <working_dir> is a directory that can be used as the base for
|
||||
# setting up the venvs. Defaults to `/tmp`.
|
||||
# - 'nocleanup' as the first argument will disable automatic cleanup
|
||||
# of the files created by this script.
|
||||
#
|
||||
# requires:
|
||||
# - bash >= 3.2.57
|
||||
# - shellcheck
|
||||
#
|
||||
# For each script, it creates a fresh venv, `pip install`s the requirements, and
|
||||
# finally imports the python script to check for `ImportError`.
|
||||
#
|
||||
|
||||
log() {
|
||||
local level=$1 msg=$2
|
||||
printf >&2 '%s: %s\n' "$level" "$msg"
|
||||
}
|
||||
|
||||
debug() {
|
||||
log DEBUG "$@"
|
||||
}
|
||||
|
||||
info() {
|
||||
log INFO "$@"
|
||||
}
|
||||
|
||||
fatal() {
|
||||
log FATAL "$@"
|
||||
exit 1
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then
|
||||
info "Removing $workdir"
|
||||
local count=0
|
||||
rm -rfv -- "$workdir" | while read -r; do
|
||||
if (( count++ > 750 )); then
|
||||
printf .
|
||||
count=0
|
||||
fi
|
||||
done
|
||||
printf '\n'
|
||||
info "Removed $workdir"
|
||||
fi
|
||||
}
|
||||
|
||||
do_cleanup=1
|
||||
if [[ ${1-} == nocleanup ]]; then
|
||||
do_cleanup=0; shift
|
||||
fi
|
||||
|
||||
if (( do_cleanup )); then
|
||||
trap exit INT TERM
|
||||
trap cleanup EXIT
|
||||
fi
|
||||
|
||||
this=$(realpath -- "$0"); readonly this
|
||||
cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory
|
||||
|
||||
shellcheck "$this"
|
||||
|
||||
readonly reqs_dir=requirements
|
||||
|
||||
if [[ ${1+x} ]]; then
|
||||
tmp_dir=$(realpath -- "$1")
|
||||
if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then
|
||||
fatal "$tmp_dir is not a writable directory"
|
||||
fi
|
||||
else
|
||||
tmp_dir=/tmp
|
||||
fi
|
||||
|
||||
workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX"); readonly workdir
|
||||
info "Working directory: $workdir"
|
||||
|
||||
check_requirements() {
|
||||
local reqs=$1
|
||||
|
||||
info "$reqs: beginning check"
|
||||
pip --disable-pip-version-check install -qr "$reqs"
|
||||
info "$reqs: OK"
|
||||
}
|
||||
|
||||
check_convert_script() {
|
||||
local py=$1 # e.g. ./convert-hf-to-gguf.py
|
||||
local pyname=${py##*/} # e.g. convert-hf-to-gguf.py
|
||||
pyname=${pyname%.py} # e.g. convert-hf-to-gguf
|
||||
|
||||
info "$py: beginning check"
|
||||
|
||||
local reqs="$reqs_dir/requirements-$pyname.txt"
|
||||
if [[ ! -r $reqs ]]; then
|
||||
fatal "$py missing requirements. Expected: $reqs"
|
||||
fi
|
||||
|
||||
local venv="$workdir/$pyname-venv"
|
||||
python3 -m venv "$venv"
|
||||
|
||||
(
|
||||
# shellcheck source=/dev/null
|
||||
source "$venv/bin/activate"
|
||||
|
||||
check_requirements "$reqs"
|
||||
|
||||
python - "$py" "$pyname" <<'EOF'
|
||||
import sys
|
||||
from importlib.machinery import SourceFileLoader
|
||||
py, pyname = sys.argv[1:]
|
||||
SourceFileLoader(pyname, py).load_module()
|
||||
EOF
|
||||
)
|
||||
|
||||
if (( do_cleanup )); then
|
||||
rm -rf -- "$venv"
|
||||
fi
|
||||
|
||||
info "$py: imports OK"
|
||||
}
|
||||
|
||||
readonly ignore_eq_eq='check_requirements: ignore "=="'
|
||||
|
||||
for req in "$reqs_dir"/*; do
|
||||
# Check that all sub-requirements are added to top-level requirements.txt
|
||||
if ! grep -qF "$req" requirements.txt; then
|
||||
fatal "$req needs to be added to requirements.txt"
|
||||
fi
|
||||
|
||||
# Make sure exact release versions aren't being pinned in the requirements
|
||||
# Filters out the ignore string
|
||||
if grep -vF "$ignore_eq_eq" "$req" | grep -q '=='; then
|
||||
tab=$'\t'
|
||||
cat >&2 <<EOF
|
||||
FATAL: Avoid pinning exact package versions. Use '~=' instead.
|
||||
You can suppress this error by appending the following to the line:
|
||||
$tab# $ignore_eq_eq
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
all_venv="$workdir/all-venv"
|
||||
python3 -m venv "$all_venv"
|
||||
|
||||
(
|
||||
# shellcheck source=/dev/null
|
||||
source "$all_venv/bin/activate"
|
||||
check_requirements requirements.txt
|
||||
)
|
||||
|
||||
if (( do_cleanup )); then
|
||||
rm -rf -- "$all_venv"
|
||||
fi
|
||||
|
||||
check_convert_script convert.py
|
||||
for py in convert-*.py; do
|
||||
check_convert_script "$py"
|
||||
done
|
||||
|
||||
info 'Done! No issues found.'
|
|
@ -27,21 +27,36 @@ echo "Syncing ggml changes since commit $lc"
|
|||
cd $SRC_GGML
|
||||
|
||||
git log --oneline $lc..HEAD
|
||||
git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
|
||||
|
||||
git format-patch $lc --stdout -- \
|
||||
include/ggml/ggml*.h \
|
||||
src/ggml*.h \
|
||||
src/ggml*.c \
|
||||
src/ggml*.cpp \
|
||||
src/ggml*.m \
|
||||
src/ggml*.metal \
|
||||
src/ggml*.cu \
|
||||
tests/test-opt.cpp \
|
||||
tests/test-grad0.cpp \
|
||||
tests/test-quantize-fns.cpp \
|
||||
tests/test-quantize-perf.cpp \
|
||||
tests/test-backend-ops.cpp \
|
||||
> $SRC_LLAMA/ggml-src.patch
|
||||
if [ ! -s $SRC_LLAMA/ggml-commits ]; then
|
||||
rm -v $SRC_LLAMA/ggml-commits
|
||||
echo "No new commits"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
rm -v $SRC_LLAMA/ggml-src.patch
|
||||
fi
|
||||
|
||||
while read c; do
|
||||
git format-patch -k $c~1..$c --stdout -- \
|
||||
include/ggml/ggml*.h \
|
||||
src/ggml*.h \
|
||||
src/ggml*.c \
|
||||
src/ggml*.cpp \
|
||||
src/ggml*.m \
|
||||
src/ggml*.metal \
|
||||
src/ggml*.cu \
|
||||
tests/test-opt.cpp \
|
||||
tests/test-grad0.cpp \
|
||||
tests/test-quantize-fns.cpp \
|
||||
tests/test-quantize-perf.cpp \
|
||||
tests/test-backend-ops.cpp \
|
||||
>> $SRC_LLAMA/ggml-src.patch
|
||||
done < $SRC_LLAMA/ggml-commits
|
||||
|
||||
rm -v $SRC_LLAMA/ggml-commits
|
||||
|
||||
# delete files if empty
|
||||
if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then
|
||||
|
|
|
@ -1 +1 @@
|
|||
76e7f47b69e8334384dc718480c496dafbd47999
|
||||
df098ea908764cba4a4889a1cbe7b026b2d31a14
|
||||
|
|
|
@ -2,7 +2,7 @@ function(llama_build_executable source)
|
|||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama common)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||
endfunction()
|
||||
|
||||
function(llama_test_executable name source)
|
||||
|
@ -14,7 +14,7 @@ function(llama_build_and_test_executable source)
|
|||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama common)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
|
@ -46,6 +46,7 @@ llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cp
|
|||
llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_test_executable (test-tokenizer-1-gpt2 test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
|
||||
# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
||||
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
|
|
|
@ -350,13 +350,18 @@ struct test_case {
|
|||
fflush(stdout);
|
||||
|
||||
// check if backends support op
|
||||
bool supported = true;
|
||||
for (ggml_backend_t backend : {backend1, backend2}) {
|
||||
if (!ggml_backend_supports_op(backend, out)) {
|
||||
printf("not supported\n");
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
printf("not supported [%s] ", ggml_backend_name(backend));
|
||||
supported = false;
|
||||
}
|
||||
}
|
||||
if (!supported) {
|
||||
printf("\n");
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
// post-graph sentinel
|
||||
add_sentinel(ctx);
|
||||
|
@ -1505,8 +1510,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
}
|
||||
|
||||
for (ggml_type type_a : all_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
||||
// FIXME: CPU crashes on f16xf16
|
||||
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue