Merge branch 'ggerganov:master' into master

This commit is contained in:
Eve 2023-08-18 21:17:06 +00:00 committed by GitHub
commit a129a31457
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
37 changed files with 7206 additions and 3091 deletions

4
.gitignore vendored
View file

@ -1,6 +1,7 @@
*.o *.o
*.a *.a
*.so *.so
*.bin
.DS_Store .DS_Store
.build/ .build/
.cache/ .cache/
@ -39,6 +40,7 @@ models-mnt
/perplexity /perplexity
/embedding /embedding
/train-text-from-scratch /train-text-from-scratch
/convert-llama2c-to-ggml
/simple /simple
/benchmark-matmult /benchmark-matmult
/vdot /vdot
@ -46,6 +48,7 @@ models-mnt
/Pipfile /Pipfile
/embd-input-test /embd-input-test
/libllama.so /libllama.so
/llama-bench
build-info.h build-info.h
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json
@ -68,6 +71,7 @@ poetry.lock
poetry.toml poetry.toml
# Test binaries # Test binaries
tests/test-grammar-parser
tests/test-double-float tests/test-double-float
tests/test-grad0 tests/test-grad0
tests/test-opt tests/test-opt

View file

@ -69,7 +69,6 @@ option(LLAMA_BLAS "llama: use BLAS"
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@ -256,7 +255,6 @@ if (LLAMA_CUBLAS)
# if (LLAMA_CUDA_CUBLAS) # if (LLAMA_CUDA_CUBLAS)
# add_compile_definitions(GGML_CUDA_CUBLAS) # add_compile_definitions(GGML_CUDA_CUBLAS)
# endif() # endif()
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
if (LLAMA_CUDA_FORCE_DMMV) if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif() endif()
@ -298,7 +296,6 @@ if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED) find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
@ -315,7 +312,6 @@ if (LLAMA_METAL)
${FOUNDATION_LIBRARY} ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK} ${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
) )
endif() endif()
@ -573,6 +569,16 @@ install(
WORLD_READ WORLD_READ
WORLD_EXECUTE WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR}) DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL)
install(
FILES ggml-metal.metal
PERMISSIONS
OWNER_READ
OWNER_WRITE
GROUP_READ
WORLD_READ
DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
# #
# programs, examples and tests # programs, examples and tests

View file

@ -1,8 +1,8 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
@ -283,7 +283,7 @@ endif # LLAMA_CLBLAST
ifdef LLAMA_METAL ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o OBJS += ggml-metal.o
endif # LLAMA_METAL endif # LLAMA_METAL
@ -345,7 +345,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS) rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS)
# #
# Examples # Examples
@ -388,6 +388,12 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp @sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
@ -409,6 +415,12 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

View file

@ -9,13 +9,13 @@
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:** ### 🚧 Incoming breaking change + refactoring:
- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998 See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7 To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 ----
<details> <details>
<summary>Table of Contents</summary> <summary>Table of Contents</summary>
@ -96,8 +96,10 @@ as the main playground for developing new features for the [ggml](https://github
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
**UI:** **UI:**
@ -238,12 +240,17 @@ In order to build llama.cpp you have three different options.
cmake --build . --config Release cmake --build . --config Release
``` ```
- Using `Zig`: - Using `Zig` (version 0.11 or later):
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
it's also possible to cross compile for other operating systems and architectures:
```bash ```bash
zig build -Doptimize=ReleaseFast zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
``` ```
The `zig targets` command will give you valid options to use.
- Using `gmake` (FreeBSD): - Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@ -408,7 +415,7 @@ Building the program with BLAS support may lead to some performance improvements
|-------------------------|------------------------|---------|-------------| |-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

View file

@ -1,5 +1,6 @@
// Compatible with Zig Version 0.11.0 // Compatible with Zig Version 0.11.0
const std = @import("std"); const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile; const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader; const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode; const Mode = std.builtin.Mode;
@ -10,11 +11,31 @@ const Maker = struct {
target: CrossTarget, target: CrossTarget,
optimize: Mode, optimize: Mode,
config_header: *ConfigHeader, config_header: *ConfigHeader,
enable_lto: bool,
const cflags = .{"-std=c11"}; include_dirs: ArrayList([]const u8),
const cxxflags = .{"-std=c++11"}; cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn init(builder: *std.build.Builder) Maker { fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const commit_hash = @embedFile(".git/refs/heads/master"); const commit_hash = @embedFile(".git/refs/heads/master");
const config_header = builder.addConfigHeader( const config_header = builder.addConfigHeader(
.{ .style = .blank, .include_path = "build-info.h" }, .{ .style = .blank, .include_path = "build-info.h" },
@ -23,58 +44,71 @@ const Maker = struct {
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
}, },
); );
return Maker{ var m = Maker{
.builder = builder, .builder = builder,
.target = builder.standardTargetOptions(.{}), .target = builder.standardTargetOptions(.{}),
.optimize = builder.standardOptimizeOption(.{}), .optimize = builder.standardOptimizeOption(.{}),
.config_header = config_header, .config_header = config_header,
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
}; };
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"examples"});
return m;
} }
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (std.mem.endsWith(u8, src, ".c")) { if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, &cflags); o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC(); o.linkLibC();
} else { } else {
o.addCSourceFiles(&.{src}, &cxxflags); o.addCSourceFiles(&.{src}, m.cxxflags.items);
o.linkLibCpp(); o.linkLibCpp();
} }
o.addIncludePath(.{ .path = "." }); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.addIncludePath(.{ .path = "./examples" }); o.want_lto = m.enable_lto;
return o; return o;
} }
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addIncludePath(.{ .path = "." }); e.addCSourceFiles(&.{src}, m.cxxflags.items);
e.addIncludePath(.{ .path = "./examples" });
e.addCSourceFiles(&.{src}, &cxxflags);
for (deps) |d| e.addObject(d); for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
e.linkLibC(); e.linkLibC();
e.linkLibCpp(); e.linkLibCpp();
e.addConfigHeader(m.config_header); e.addConfigHeader(m.config_header);
m.builder.installArtifact(e); m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
// Currently a bug is preventing correct linking for optimized builds for Windows:
// https://github.com/ziglang/zig/issues/15958
if (e.target.isWindows()) {
e.want_lto = false;
}
return e; return e;
} }
}; };
pub fn build(b: *std.build.Builder) void { pub fn build(b: *std.build.Builder) !void {
const make = Maker.init(b); var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
try make.addFlag("-DGGML_USE_K_QUANTS");
const k_quants = make.obj("k_quants", "k_quants.c");
try make.objs.append(k_quants);
}
const ggml = make.obj("ggml", "ggml.c"); const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const llama = make.obj("llama", "llama.cpp"); const llama = make.obj("llama", "llama.cpp");
const common = make.obj("common", "examples/common.cpp"); const common = make.obj("common", "examples/common.cpp");
const console = make.obj("common", "examples/console.cpp");
const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp"); const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });

View file

@ -42,8 +42,10 @@ else()
add_subdirectory(benchmark) add_subdirectory(benchmark)
add_subdirectory(baby-llama) add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch) add_subdirectory(train-text-from-scratch)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(embd-input) add_subdirectory(embd-input)
add_subdirectory(llama-bench)
if (LLAMA_METAL) if (LLAMA_METAL)
add_subdirectory(metal) add_subdirectory(metal)
endif() endif()

View file

@ -283,6 +283,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.cfg_negative_prompt = argv[i]; params.cfg_negative_prompt = argv[i];
} else if (arg == "--cfg-negative-prompt-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
if (params.cfg_negative_prompt.back() == '\n') {
params.cfg_negative_prompt.pop_back();
}
} else if (arg == "--cfg-scale") { } else if (arg == "--cfg-scale") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -580,6 +595,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " --grammar-file FNAME file to read grammar from\n"); fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
fprintf(stdout, " --cfg-negative-prompt PROMPT\n"); fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n"); fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);

View file

@ -0,0 +1,5 @@
set(TARGET convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,26 @@
## Convert llama2.c model to ggml
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
`$ make -j`
After successful compilation, following usage options are available:
```
usage: ./convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
```
An example command is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
Now you can use the model with command like:
`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`

View file

@ -0,0 +1,825 @@
#include "ggml.h"
#include "llama.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <algorithm>
#include <string>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
int dim; // transformer dimension
int hidden_dim; // for ffn layers
int n_layers; // number of layers
int n_heads; // number of query heads
int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
int vocab_size; // vocabulary size, usually 256 (byte-level)
int seq_len; // max sequence length
} Config;
typedef struct {
// token embedding table
float* token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
float* rms_att_weight; // (layer, dim) rmsnorm weights
float* rms_ffn_weight; // (layer, dim)
// weights for matmuls
float* wq; // (layer, dim, dim)
float* wk; // (layer, dim, dim)
float* wv; // (layer, dim, dim)
float* wo; // (layer, dim, dim)
// weights for ffn
float* w1; // (layer, hidden_dim, dim)
float* w2; // (layer, dim, hidden_dim)
float* w3; // (layer, hidden_dim, dim)
// final rmsnorm
float* rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
//float* wcls;
} TransformerWeights;
void malloc_weights(TransformerWeights* w, Config* p) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wv = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wo = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
}
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
return 0;
}
void free_weights(TransformerWeights* w) {
delete w->token_embedding_table;
delete w->rms_att_weight;
delete w->rms_ffn_weight;
delete w->wq;
delete w->wk;
delete w->wv;
delete w->wo;
delete w->w1;
delete w->w2;
delete w->w3;
delete w->rms_final_weight;
}
void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
printf("%f\n", w->rms_ffn_weight[0]);
printf("%f\n", w->wq[0]);
printf("%f\n", w->wk[0]);
printf("%f\n", w->wv[0]);
printf("%f\n", w->wo[0]);
printf("%f\n", w->w1[0]);
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
struct llama_vocab {
using id = int32_t;
using token = std::string;
struct token_score {
token tok;
float score;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
};
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
};
struct my_llama_layer {
// normalization
struct ggml_tensor * attention_norm;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
// normalization
struct ggml_tensor * ffn_norm;
// ff
struct ggml_tensor * w1;
struct ggml_tensor * w2;
struct ggml_tensor * w3;
};
struct my_llama_model {
struct ggml_context * ctx = NULL;
my_llama_hparams hparams;
struct ggml_tensor * tok_embeddings;
struct ggml_tensor * norm;
struct ggml_tensor * output;
std::vector<my_llama_layer> layers;
uint32_t train_its = 0;
uint32_t train_samples = 0;
uint32_t train_tokens = 0;
};
struct train_params {
const char * fn_vocab_model;
const char * fn_llama2c_model;
const char * fn_llama2c_output_model;
const char * fn_train_data;
const char * fn_checkpoint_in;
const char * fn_checkpoint_out;
const char * fn_model_out;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;
int n_head;
int n_layer;
int n_rotmax;
int n_threads;
int n_batch;
int n_examples;
int n_predict;
int print_info_interval;
int print_details_interval;
bool samples_start_after_nl;
bool use_adam;
bool use_flash;
bool use_scratch;
// only adam
int warmup;
int cos_decay_steps;
float cos_decay_restart;
float cos_decay_alpha;
int lbfgs_n_iter;
int adam_n_iter;
float adam_alpha;
float adam_decay;
int mem_model_gb;
int mem_compute_gb;
int mem_compute0_gb;
int mem_compute1_gb;
};
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}
void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd);
printf("%s: n_mult: %d\n", __func__, params->n_mult);
printf("%s: n_head: %d\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, get_n_ff(params));
printf("%s: n_layer: %d\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot);
}
void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_ff = get_n_ff(&hparams);
struct ggml_context * ctx = model->ctx;
model->train_its = 0;
model->train_samples = 0;
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
// printing the per-layer allocations here so we dont print in the for loop.
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
ggml_set_name(model->output, "output.weight");
model->layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = model->layers[i];
std::string layers_i = "layers." + std::to_string(i);
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
}
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %f", p);
}
printf("\n");
}
void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %.2f", p);
}
printf("\n");
}
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
} else {
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::float_t read_f32() {
std::float_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
if (tensor == NULL) {
file->write_u32(0);
file->write_u32(0);
file->write_u32(GGML_TYPE_F32);
file->seek((0-file->tell()) & 31, SEEK_CUR);
return;
}
const char * name = ggml_get_name(tensor);
uint32_t name_len = strlen(name);
uint32_t nd = tensor->n_dims;
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
(uint32_t)tensor->ne[1],
(uint32_t)tensor->ne[2],
(uint32_t)tensor->ne[3] };
file->write_u32(nd);
file->write_u32(name_len);
file->write_u32(tensor->type);
file->write_raw(ne, sizeof(ne[0]) * nd);
file->write_raw(name, name_len);
file->seek((0-file->tell()) & 31, SEEK_CUR);
file->write_raw(tensor->data, ggml_nbytes(tensor));
}
bool is_ggml_file(const char *filename) {
llama_file file(filename, "rb");
if (file.size < 4) {
return false;
}
uint32_t magic = file.read_u32();
return magic == LLAMA_FILE_MAGIC;
}
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
// heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
if (is_ggml_file(filename)) {
struct llama_context_params llama_params = llama_context_default_params();
llama_params.vocab_only = true;
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
std::vector<const char *> strings;
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
vocab->id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]);
float score = scores[i];
vocab->id_to_token[i].tok = tok;
vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i);
}
llama_free(lctx);
llama_free_model(lmodel);
} else { // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
llama_file file(filename, "rb");
uint32_t n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (uint32_t i=0; i<n_vocab; ++i) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string tok = file.read_string(len);
vocab->id_to_token[i].tok = tok;
vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i);
}
}
}
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
int ct;
switch (gg_weights->n_dims){
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
}
}
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
struct llama_file file(filename, "wb");
if (file.fp == NULL) {
return;
}
// write_magic
file.write_u32(LLAMA_FILE_MAGIC); // magic
file.write_u32(LLAMA_FILE_VERSION); // version
// write_hparams
file.write_u32(model->hparams.n_vocab);
file.write_u32(model->hparams.n_embd);
file.write_u32(model->hparams.n_mult);
file.write_u32(model->hparams.n_head);
file.write_u32(model->hparams.n_layer);
file.write_u32(model->hparams.n_rot);
file.write_u32(LLAMA_FTYPE_ALL_F32);
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
uint32_t n_vocab = model->hparams.n_vocab;
for (uint32_t i = 0; i < n_vocab; i++) {
const auto & token_score = vocab->id_to_token.at(i);
file.write_u32((uint32_t) token_score.tok.size());
file.write_raw(token_score.tok.data(), token_score.tok.size());
file.write_raw(&token_score.score, sizeof(token_score.score));
}
// stuff AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
const auto & hparams = model->hparams;
//int n_ff = model->hparams.n_embd;
int n_ff = get_n_ff(&hparams);
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
}
// write tensors
write_tensor(&file, model->tok_embeddings);
write_tensor(&file, model->norm);
write_tensor(&file, model->output); // ?
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
auto & layer = model->layers[i];
write_tensor(&file, layer.attention_norm);
write_tensor(&file, layer.wq);
write_tensor(&file, layer.wk);
write_tensor(&file, layer.wv);
write_tensor(&file, layer.wo);
write_tensor(&file, layer.ffn_norm);
write_tensor(&file, layer.w1);
write_tensor(&file, layer.w2);
write_tensor(&file, layer.w3);
}
}
struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/ggml-vocab.bin";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.seed = -1;
params.n_ctx = 128;
params.n_embd = 256;
params.n_mult = 256;
params.n_head = 8;
params.n_layer = 16;
params.n_rotmax = 64;
params.n_threads = 6;
params.n_batch = 8;
params.n_examples = 8;
params.n_predict = 1024;
params.print_info_interval = 1;
params.print_details_interval = 2;
params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = true;
params.use_scratch = true;
// only adam
params.warmup = 100;
params.cos_decay_steps = 1000;
params.cos_decay_restart = 1.1f;
params.cos_decay_alpha = 0.0f;
params.lbfgs_n_iter = 16;
params.adam_n_iter = 16;
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;
return params;
}
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n");
}
bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false;
bool reqd_param_found = false;
std::string arg;
struct train_params default_params = get_default_train_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "--copy-vocab-from-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_vocab_model = argv[i];
} else if (arg == "--llama2c-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
reqd_param_found = true;
params->fn_llama2c_model = argv[i];
} else if (arg == "--llama2c-output-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_llama2c_output_model = argv[i];
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, &default_params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
if (!reqd_param_found){
fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
print_usage(argc, argv, &default_params);
exit(1);
}
return true;
}
int main(int argc, char ** argv) {
struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
return 1;
}
Config config;
TransformerWeights weights;
{
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
// read in the Transformer weights
malloc_weights(&weights, &config);
if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
fclose(file);
}
struct llama_vocab vocab;
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;
model.ctx = ggml_init(lcparams);
init_model(&model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
free_weights(&weights);
return 0;
}

View file

@ -0,0 +1,8 @@
set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -0,0 +1,967 @@
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstring>
#include <ctime>
#include <iterator>
#include <map>
#include <numeric>
#include <regex>
#include <sstream>
#include <stdio.h>
#include <string>
#include <vector>
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "build-info.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
// utils
static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock;
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
template<class T>
static std::string join(const std::vector<T> & values, const std::string & delim) {
std::ostringstream str;
for (size_t i = 0; i < values.size(); i++) {
str << values[i];
if (i < values.size() - 1) {
str << delim;
}
}
return str.str();
}
template<class T>
static std::vector<T> split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
template<typename T>
static T avg(const std::vector<T> & v) {
if (v.empty()) {
return 0;
}
T sum = std::accumulate(v.begin(), v.end(), T(0));
return sum / (T)v.size();
}
template<typename T>
static T stdev(const std::vector<T> & v) {
if (v.size() <= 1) {
return 0;
}
T mean = avg(v);
T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
return stdev;
}
static bool ggml_cpu_has_metal() {
#if defined(GGML_USE_METAL)
return true;
#else
return false;
#endif
}
static std::string get_cpu_info() {
std::string id;
#ifdef __linux__
FILE * f = fopen("/proc/cpuinfo", "r");
if (f) {
char buf[1024];
while (fgets(buf, sizeof(buf), f)) {
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
id = p;
break;
}
}
}
}
#endif
// TODO: other platforms
return id;
}
static std::string get_gpu_info() {
std::string id;
#ifdef GGML_USE_CUBLAS
int count = ggml_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_cuda_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;
}
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<int> n_batch;
std::vector<bool> f32_kv;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<int> main_gpu;
std::vector<bool> mul_mat_q;
std::vector<bool> low_vram;
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
int reps;
bool verbose;
output_formats output_format;
};
static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.bin"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {512},
/* f32_kv */ {false},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* main_gpu */ {0},
/* mul_mat_q */ {true},
/* low_vram */ {false},
/* tensor_split */ {{}},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
};
static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help\n");
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : "md");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by repeating the parameter.\n");
}
static cmd_params parse_cmd_params(int argc, char ** argv) {
cmd_params params;
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
const char split_delim = ',';
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "--memory-f32") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.main_gpu = split<int>(argv[i], split_delim);
} else if (arg == "-lv" || arg == "--low-vram") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
break;
}
for (auto ts : split<std::string>(argv[i], split_delim)) {
// split string by ; and /
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
tensor_split[i] = std::stof(split_arg[i]);
} else {
tensor_split[i] = 0.0f;
}
}
params.tensor_split.push_back(tensor_split);
}
} else if (arg == "-r" || arg == "--repetitions") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.reps = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
break;
}
if (argv[i] == std::string("csv")) {
params.output_format = CSV;
} else if (argv[i] == std::string("json")) {
params.output_format = JSON;
} else if (argv[i] == std::string("md")) {
params.output_format = MARKDOWN;
} else if (argv[i] == std::string("sql")) {
params.output_format = SQL;
} else {
invalid_param = true;
break;
}
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
invalid_param = true;
break;
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
// set defaults
if (params.model.empty()) { params.model = cmd_params_defaults.model; }
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params;
}
struct cmd_params_instance {
std::string model;
int n_prompt;
int n_gen;
int n_batch;
bool f32_kv;
int n_threads;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
bool low_vram;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
llama_context_params to_llama_params() const {
llama_context_params lparams = llama_context_default_params();
lparams.n_ctx = n_prompt + n_gen;
lparams.n_batch = n_batch;
lparams.f16_kv = !f32_kv;
lparams.n_gpu_layers = n_gpu_layers;
lparams.main_gpu = main_gpu;
lparams.mul_mat_q = mul_mat_q;
lparams.low_vram = low_vram;
lparams.tensor_split = tensor_split.data();
return lparams;
}
};
static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
std::vector<cmd_params_instance> instances;
for (const auto & m : params.model)
for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv)
for (const auto & nl : params.n_gpu_layers)
for (const auto & mg : params.main_gpu)
for (const auto & mmq : params.mul_mat_q)
for (const auto & lv : params.low_vram)
for (const auto & ts : params.tensor_split)
for (const auto & nt : params.n_threads) {
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .f32_kv = */ fk,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
/* .mul_mat_q = */ mmq,
/* .low_vram = */ lv,
/* .tensor_split = */ ts,
};
instances.push_back(instance);
}
return instances;
}
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
std::vector<cmd_params_instance> instances;
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
}
auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
}
for (const auto & n_gen : params.n_gen) {
if (n_gen == 0) {
continue;
}
auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
}
return instances;
}
struct test {
static const std::string build_commit;
static const int build_number;
static const bool cuda;
static const bool opencl;
static const bool metal;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
static const std::string gpu_info;
std::string model_filename;
std::string model_type;
int n_batch;
int n_threads;
bool f32_kv;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
bool low_vram;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
int n_prompt;
int n_gen;
std::string test_time;
std::vector<uint64_t> samples_ns;
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
model_filename = inst.model;
char buf[128];
llama_model_type(lmodel, buf, sizeof(buf));
model_type = buf;
n_batch = inst.n_batch;
n_threads = inst.n_threads;
f32_kv = inst.f32_kv;
n_gpu_layers = inst.n_gpu_layers;
main_gpu = inst.main_gpu;
mul_mat_q = inst.mul_mat_q;
low_vram = inst.low_vram;
tensor_split = inst.tensor_split;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
time_t t = time(NULL);
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
test_time = buf;
(void) ctx;
}
uint64_t avg_ns() const {
return ::avg(samples_ns);
}
uint64_t stdev_ns() const {
return ::stdev(samples_ns);
}
std::vector<double> get_ts() const {
int n_tokens = n_prompt + n_gen;
std::vector<double> ts;
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
return ts;
}
double avg_ts() const {
return ::avg(get_ts());
}
double stdev_ts() const {
return ::stdev(get_ts());
}
static std::string get_backend() {
if (cuda) {
return "CUDA";
}
if (opencl) {
return "OpenCL";
}
if (metal) {
return "Metal";
}
if (gpu_blas) {
return "GPU BLAS";
}
if (blas) {
return "BLAS";
}
return "CPU";
}
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type",
"n_batch", "n_threads", "f16_kv",
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
};
return fields;
}
enum field_type {STRING, BOOL, INT, FLOAT};
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
return FLOAT;
}
return STRING;
}
std::vector<std::string> get_values() const {
std::string tensor_split_str;
int max_nonzero = 0;
for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
if (tensor_split[i] > 0) {
max_nonzero = i;
}
}
for (int i = 0; i <= max_nonzero; i++) {
char buf[32];
snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
tensor_split_str += buf;
if (i < max_nonzero) {
tensor_split_str += "/";
}
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type,
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
};
return values;
}
std::map<std::string, std::string> get_map() const {
std::map<std::string, std::string> map;
auto fields = get_fields();
auto values = get_values();
std::transform(fields.begin(), fields.end(), values.begin(),
std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
return map;
}
};
const std::string test::build_commit = BUILD_COMMIT;
const int test::build_number = BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
struct printer {
FILE * fout;
virtual void print_header(const cmd_params & params) { (void) params; };
virtual void print_test(const test & t) = 0;
virtual void print_footer() { };
};
struct csv_printer : public printer {
static std::string escape_csv(const std::string & field) {
std::string escaped = "\"";
for (auto c : field) {
if (c == '"') {
escaped += "\"";
}
escaped += c;
}
escaped += "\"";
return escaped;
}
void print_header(const cmd_params & params) override {
std::vector<std::string> fields = test::get_fields();
fprintf(fout, "%s\n", join(fields, ",").c_str());
(void) params;
}
void print_test(const test & t) override {
std::vector<std::string> values = t.get_values();
std::transform(values.begin(), values.end(), values.begin(), escape_csv);
fprintf(fout, "%s\n", join(values, ",").c_str());
}
};
struct json_printer : public printer {
bool first = true;
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
escaped += "\\\"";
} else if (c == '\\') {
escaped += "\\\\";
} else if (c <= 0x1f) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", c);
escaped += buf;
} else {
escaped += c;
}
}
return escaped;
}
static std::string format_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
case test::BOOL:
return value == "0" ? "false" : "true";
default:
return value;
}
}
void print_header(const cmd_params & params) override {
fprintf(fout, "[\n");
(void) params;
}
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
}
}
void print_test(const test & t) override {
if (first) {
first = false;
} else {
fprintf(fout, ",\n");
}
fprintf(fout, " {\n");
print_fields(test::get_fields(), t.get_values());
fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
fprintf(fout, " }");
fflush(fout);
}
void print_footer() override {
fprintf(fout, "\n]\n");
}
};
struct markdown_printer : public printer {
std::vector<std::string> fields;
static int get_field_width(const std::string & field) {
if (field == "model") {
return -30;
}
if (field == "t/s") {
return 15;
}
int width = std::max((int)field.length(), 10);
if (test::get_field_type(field) == test::STRING) {
return -width;
}
return width;
}
void print_header(const cmd_params & params) override {
// select fields to print
fields = { "model", "backend" };
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
if (!is_cpu_backend) {
fields.push_back("n_gpu_layers");
}
if (params.n_batch.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.push_back("n_threads");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.push_back("n_batch");
}
if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
fields.push_back("f16_kv");
}
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");
}
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.push_back("mul_mat_q");
}
if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
fields.push_back("low_vram");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.push_back("tensor_split");
}
fields.push_back("test");
fields.push_back("t/s");
fprintf(fout, "|");
for (const auto & field : fields) {
fprintf(fout, " %*s |", get_field_width(field), field.c_str());
}
fprintf(fout, "\n");
fprintf(fout, "|");
for (const auto & field : fields) {
int width = get_field_width(field);
fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
}
fprintf(fout, "\n");
}
void print_test(const test & t) override {
std::map<std::string, std::string> vmap = t.get_map();
fprintf(fout, "|");
for (const auto & field : fields) {
std::string value;
if (field == "model") {
value = t.model_type;
} else if (field == "backend") {
value = test::get_backend();
} else if (field == "test") {
char buf[128];
if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) {
snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
} else {
assert(false);
exit(1);
}
value = buf;
} else if (field == "t/s") {
char buf[128];
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
value = buf;
} else if (vmap.find(field) != vmap.end()) {
value = vmap.at(field);
} else {
assert(false);
exit(1);
}
int width = get_field_width(field);
if (field == "t/s") {
// HACK: the utf-8 character is 2 bytes
width += 1;
}
fprintf(fout, " %*s |", width, value.c_str());
}
fprintf(fout, "\n");
}
void print_footer() override {
fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
}
};
struct sql_printer : public printer {
static std::string get_sql_field_type(const std::string & field) {
switch (test::get_field_type(field)) {
case test::STRING:
return "TEXT";
case test::BOOL:
case test::INT:
return "INTEGER";
case test::FLOAT:
return "REAL";
default:
assert(false);
exit(1);
}
}
void print_header(const cmd_params & params) override {
std::vector<std::string> fields = test::get_fields();
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
}
fprintf(fout, ");\n");
fprintf(fout, "\n");
(void) params;
}
void print_test(const test & t) override {
fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
fprintf(fout, "VALUES (");
std::vector<std::string> values = t.get_values();
for (size_t i = 0; i < values.size(); i++) {
fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
}
fprintf(fout, ");\n");
}
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
std::vector<llama_token> tokens(n_batch, llama_token_bos());
int n_processed = 0;
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
n_processed += n_tokens;
}
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos();
for (int i = 0; i < n_gen; i++) {
llama_eval(ctx, &token, 1, n_past + i, n_threads);
}
}
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
(void) level;
(void) text;
(void) user_data;
}
int main(int argc, char ** argv) {
#if !defined(NDEBUG)
fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif
#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif
#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif
cmd_params params = parse_cmd_params(argc, argv);
// initialize llama.cpp
if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL);
}
bool numa = false;
llama_backend_init(numa);
// initialize printer
std::unique_ptr<printer> p;
switch (params.output_format) {
case CSV:
p.reset(new csv_printer());
break;
case JSON:
p.reset(new json_printer());
break;
case MARKDOWN:
p.reset(new markdown_printer());
break;
case SQL:
p.reset(new sql_printer());
break;
default:
assert(false);
exit(1);
}
p->fout = stdout;
p->print_header(params);
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
for (const auto & inst : params_instances) {
// TODO: keep the model between tests when possible
llama_context_params lparams = inst.to_llama_params();
llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1;
}
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_free_model(lmodel);
return 1;
}
test t(inst, lmodel, ctx);
// warmup run
test_gen(ctx, 1, 0, t.n_threads);
for (int i = 0; i < params.reps; i++) {
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
}
uint64_t t_ns = get_time_ns() - t_start;
t.samples_ns.push_back(t_ns);
}
p->print_test(t);
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(lmodel);
}
p->print_footer();
llama_backend_free();
return 0;
}

View file

@ -88,7 +88,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%d minutes\n", total_seconds / 60); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
// We get the logits for all the tokens in the context window (params.n_ctx) // We get the logits for all the tokens in the context window (params.n_ctx)

View file

@ -17,6 +17,7 @@ Command line options:
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.

View file

@ -1,5 +1,34 @@
import * as readline from 'node:readline' import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process' import { stdin, stdout } from 'node:process'
import { readFileSync } from 'node:fs'
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema"
);
const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
// Example usage: function,arguments
const grammarJsonSchemaPropOrder = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
);
const propOrder = grammarJsonSchemaPropOrder
? grammarJsonSchemaPropOrder
.split(",")
.reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
: {};
let grammar = null
if (grammarJsonSchemaFile) {
const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
const converter = new SchemaConverter(propOrder)
converter.visit(schema, '')
grammar = converter.formatGrammar()
}
if (grammarFile) {
grammar = readFileSync(grammarFile, 'utf-8')
}
const API_URL = 'http://127.0.0.1:8080' const API_URL = 'http://127.0.0.1:8080'
@ -48,6 +77,7 @@ async function chat_completion(question) {
n_keep: n_keep, n_keep: n_keep,
n_predict: 256, n_predict: 256,
stop: ["\n### Human:"], // stop completion after generating this stop: ["\n### Human:"], // stop completion after generating this
grammar,
stream: true, stream: true,
}) })
}) })

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,311 @@
unsigned char json_schema_to_grammar_mjs[] = {
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f,
0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f,
0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52,
0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65,
0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29,
0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e,
0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22,
0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b,
0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29,
0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d,
0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d,
0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f,
0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69,
0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d,
0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20,
0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a,
0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20,
0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22,
0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20,
0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c,
0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c,
0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75,
0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20,
0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22,
0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45,
0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20,
0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d,
0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45,
0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52,
0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d,
0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52,
0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41,
0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20,
0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27,
0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e,
0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22,
0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f,
0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d,
0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c,
0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20,
0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c,
0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63,
0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55,
0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
0x61, 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53,
0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79,
0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65,
0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54,
0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f,
0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20,
0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c,
0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50,
0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d,
0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f,
0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65,
0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d,
0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70,
0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44,
0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f,
0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20,
0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73,
0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28,
0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d,
0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28,
0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b,
0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d,
0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65,
0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72,
0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20,
0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73,
0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20,
0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70,
0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20,
0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f,
0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f,
0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c,
0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68,
0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d,
0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69,
0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e,
0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65,
0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72,
0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f,
0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c,
0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f,
0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65,
0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d,
0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68,
0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69,
0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f,
0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c,
0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63,
0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26,
0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73,
0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69,
0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72,
0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70,
0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f,
0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65,
0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f,
0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72,
0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20,
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73,
0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f,
0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20,
0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74,
0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79,
0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d,
0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20,
0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b,
0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69,
0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42,
0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72,
0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d,
0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65,
0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49,
0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64,
0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e,
0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72,
0x65, 0x28, 0x62, 0x5b, 0x30, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70,
0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63,
0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65,
0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73,
0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d,
0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e,
0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20,
0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27,
0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20,
0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29,
0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20,
0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70,
0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20,
0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64,
0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61,
0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66,
0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27,
0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20,
0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f,
0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74,
0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75,
0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69,
0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65,
0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24,
0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65,
0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d,
0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65,
0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63,
0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c,
0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20,
0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d,
0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29,
0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64,
0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52,
0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f,
0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d,
0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65,
0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61,
0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65,
0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20,
0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73,
0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49,
0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73,
0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72,
0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29,
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67,
0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72,
0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20,
0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d,
0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d,
0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a
};
unsigned int json_schema_to_grammar_mjs_len = 3695;

View file

@ -141,6 +141,7 @@
} from '/index.js'; } from '/index.js';
import { llama } from '/completion.js'; import { llama } from '/completion.js';
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
const session = signal({ const session = signal({
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.", prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
@ -166,8 +167,139 @@
mirostat: 0, // 0/1/2 mirostat: 0, // 0/1/2
mirostat_tau: 5, // target entropy mirostat_tau: 5, // target entropy
mirostat_eta: 0.1, // learning rate mirostat_eta: 0.1, // learning rate
grammar: '',
}) })
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
const local_storage_storageKey = "llamacpp_server_local_storage";
function local_storage_setDataFromObject(tag, content) {
localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
}
function local_storage_setDataFromRawText(tag, content) {
localStorage.setItem(local_storage_storageKey + '/' + tag, content);
}
function local_storage_getDataAsObject(tag) {
const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
if (!item) {
return null;
} else {
return JSON.parse(item);
}
}
function local_storage_getDataAsRawText(tag) {
const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
if (!item) {
return null;
} else {
return item;
}
}
// create a container for user templates and settings
const savedUserTemplates = signal({})
const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
// let's import locally saved templates and settings if there are any
// user templates and settings are stored in one object
// in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
console.log('Importing saved templates')
let importedTemplates = local_storage_getDataAsObject('user_templates')
if (importedTemplates) {
// saved templates were successfuly imported.
console.log('Processing saved templates and updating default template')
//console.log(importedTemplates);
savedUserTemplates.value = importedTemplates;
//override default template
savedUserTemplates.value.default = { session: session.value, params: params.value }
local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
} else {
// no saved templates detected.
console.log('Initializing LocalStorage and saving default template')
savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
}
function userTemplateResetToDefault() {
console.log('Reseting themplate to default')
selectedUserTemplate.value.name = 'default';
selectedUserTemplate.value.data = savedUserTemplates.value['default'];
}
function userTemplateApply(t) {
session.value = t.data.session;
params.value = t.data.params;
}
function userTemplateResetToDefaultAndApply() {
userTemplateResetToDefault()
userTemplateApply(selectedUserTemplate.value)
}
function userTemplateLoadAndApplyAutosaved() {
// get autosaved last used template
let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
if (lastUsedTemplate) {
console.log('Autosaved template found, restoring')
selectedUserTemplate.value = lastUsedTemplate
}
else {
console.log('No autosaved template found, using default template')
// no autosaved last used template was found, so load from default.
userTemplateResetToDefault()
}
console.log('Applying template')
// and update internal data from templates
userTemplateApply(selectedUserTemplate.value)
}
//console.log(savedUserTemplates.value)
//console.log(selectedUserTemplate.value)
function userTemplateAutosave() {
console.log('Template Autosave...')
if (selectedUserTemplate.value.name == 'default') {
// we don't want to save over default template, so let's create a new one
let newTemplateName = 'UserTemplate-' + Date.now().toString()
let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
console.log('Saving as ' + newTemplateName)
// save in the autosave slot
local_storage_setDataFromObject('user_templates_last', newTemplate)
// and load it back and apply
userTemplateLoadAndApplyAutosaved()
} else {
local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
}
}
console.log('Checking for autosaved last used template')
userTemplateLoadAndApplyAutosaved()
/* END: Support for storing prompt templates and parameters in browsers LocalStorage */
const llamaStats = signal(null) const llamaStats = signal(null)
const controller = signal(null) const controller = signal(null)
@ -304,6 +436,26 @@
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) } const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) } const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
const grammarJsonSchemaPropOrder = signal('')
const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
const convertJSONSchemaGrammar = () => {
try {
const schema = JSON.parse(params.value.grammar)
const converter = new SchemaConverter(
grammarJsonSchemaPropOrder.value
.split(',')
.reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
)
converter.visit(schema, '')
params.value = {
...params.value,
grammar: converter.formatGrammar(),
}
} catch (e) {
alert(`Convert failed: ${e.message}`)
}
}
const FloatField = ({label, max, min, name, step, value}) => { const FloatField = ({label, max, min, name, step, value}) => {
return html` return html`
<div> <div>
@ -324,8 +476,34 @@
` `
}; };
const userTemplateReset = (e) => {
e.preventDefault();
userTemplateResetToDefaultAndApply()
}
const UserTemplateResetButton = () => {
if (selectedUserTemplate.value.name == 'default') {
return html`
<button disabled>Using default template</button>
`
}
return html`
<button onclick=${userTemplateReset}>Reset all to default</button>
`
};
useEffect(() => {
// autosave template on every change
userTemplateAutosave()
}, [session.value, params.value])
return html` return html`
<form> <form>
<fieldset>
<${UserTemplateResetButton}/>
</fieldset>
<fieldset> <fieldset>
<div> <div>
<label for="prompt">Prompt</label> <label for="prompt">Prompt</label>
@ -355,6 +533,13 @@
<label for="template">Chat history template</label> <label for="template">Chat history template</label>
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/> <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
</div> </div>
<div>
<label for="template">Grammar</label>
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
</div>
</fieldset> </fieldset>
<fieldset class="two"> <fieldset class="two">

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,112 @@
const SPACE_RULE = '" "?';
const PRIMITIVE_RULES = {
boolean: '("true" | "false") space',
number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
string: ` "\\"" (
[^"\\\\] |
"\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\\"" space`,
null: '"null" space',
};
const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
export class SchemaConverter {
constructor(propOrder) {
this._propOrder = propOrder || {};
this._rules = new Map();
this._rules.set('space', SPACE_RULE);
}
_formatLiteral(literal) {
const escaped = JSON.stringify(literal).replace(
GRAMMAR_LITERAL_ESCAPE_RE,
m => GRAMMAR_LITERAL_ESCAPES[m]
);
return `"${escaped}"`;
}
_addRule(name, rule) {
let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
let key = escName;
if (this._rules.has(escName)) {
if (this._rules.get(escName) === rule) {
return key;
}
let i = 0;
while (this._rules.has(`${escName}${i}`)) {
i += 1;
}
key = `${escName}${i}`;
}
this._rules.set(key, rule);
return key;
}
visit(schema, name) {
const schemaType = schema.type;
const ruleName = name || 'root';
if (schema.oneOf || schema.anyOf) {
const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
).join(' | ');
return this._addRule(ruleName, rule);
} else if ('const' in schema) {
return this._addRule(ruleName, this._formatLiteral(schema.const));
} else if ('enum' in schema) {
const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
return this._addRule(ruleName, rule);
} else if (schemaType === 'object' && 'properties' in schema) {
// TODO: `required` keyword (from python implementation)
const propOrder = this._propOrder;
const propPairs = Object.entries(schema.properties).sort((a, b) => {
// sort by position in prop_order (if specified) then by key
const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
return orderA - orderB || a[0].localeCompare(b[0]);
});
let rule = '"{" space';
propPairs.forEach(([propName, propSchema], i) => {
const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
if (i > 0) {
rule += ' "," space';
}
rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
});
rule += ' "}" space';
return this._addRule(ruleName, rule);
} else if (schemaType === 'array' && 'items' in schema) {
// TODO `prefixItems` keyword (from python implementation)
const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
return this._addRule(ruleName, rule);
} else {
if (!PRIMITIVE_RULES[schemaType]) {
throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
}
return this._addRule(
ruleName === 'root' ? 'root' : schemaType,
PRIMITIVE_RULES[schemaType]
);
}
}
formatGrammar() {
let grammar = '';
this._rules.forEach((rule, name) => {
grammar += `${name} ::= ${rule}\n`;
});
return grammar;
}
}

View file

@ -15,6 +15,7 @@
#include "index.html.hpp" #include "index.html.hpp"
#include "index.js.hpp" #include "index.js.hpp"
#include "completion.js.hpp" #include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
#ifndef SERVER_VERBOSE #ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1 #define SERVER_VERBOSE 1
@ -668,6 +669,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
{ {
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n"); fprintf(stdout, " number of layers to store in VRAM\n");
@ -951,6 +953,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{ {
params.use_mmap = false; params.use_mmap = false;
} }
else if (arg == "--numa")
{
params.numa = true;
}
else if (arg == "--embedding") else if (arg == "--embedding")
{ {
params.embedding = true; params.embedding = true;
@ -1019,7 +1025,7 @@ static json format_timings(llama_server_context &llama)
assert(timings.n_eval == llama.num_tokens_predicted); assert(timings.n_eval == llama.num_tokens_predicted);
return json{ return json{
{"prompt_n", timings.n_eval}, {"prompt_n", timings.n_p_eval},
{"prompt_ms", timings.t_p_eval_ms}, {"prompt_ms", timings.t_p_eval_ms},
{"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval}, {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval},
{"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval}, {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval},
@ -1048,7 +1054,6 @@ static json format_final_response(llama_server_context &llama, const std::string
{"stopped_limit", llama.stopped_limit}, {"stopped_limit", llama.stopped_limit},
{"stopping_word", llama.stopping_word}, {"stopping_word", llama.stopping_word},
{"tokens_cached", llama.n_past}, {"tokens_cached", llama.n_past},
{"tokens_predicted", llama.num_tokens_predicted},
{"timings", format_timings(llama)}, {"timings", format_timings(llama)},
}; };
@ -1226,6 +1231,12 @@ int main(int argc, char **argv)
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript"); res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
return false; }); return false; });
// this is only called if no index.html is found in the public --path
svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
{
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
return false; });
svr.Post("/completion", [&llama](const Request &req, Response &res) svr.Post("/completion", [&llama](const Request &req, Response &res)
{ {
auto lock = llama.lock(); auto lock = llama.lock();

View file

@ -14,8 +14,6 @@
with pkgs.darwin.apple_sdk_11_0.frameworks; [ with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate Accelerate
MetalKit MetalKit
MetalPerformanceShaders
MetalPerformanceShadersGraph
] ]
else if isAarch32 && isDarwin then else if isAarch32 && isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [ with pkgs.darwin.apple_sdk.frameworks; [

View file

@ -67,6 +67,8 @@ struct ggml_allocr {
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size; size_t max_size;
bool measure; bool measure;
int parse_seq[GGML_MAX_NODES];
bool has_parse_seq;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024]; struct ggml_tensor * allocated_tensors[1024];
@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
size_t max_avail = 0; size_t max_avail = 0;
// find the best fitting free block // find the best fitting free block besides the last block
int best_fit_block = -1; int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX; size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks; i++) { for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i]; struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size); max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) { if (block->size >= size && block->size <= best_fit_size) {
@ -126,11 +128,18 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
AT_PRINTF("block %d\n", best_fit_block); AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) { if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail); __func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
return; return;
} }
}
struct free_block * block = &alloc->free_blocks[best_fit_block]; struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr; void * addr = block->addr;
block->addr = (char*)block->addr + size; block->addr = (char*)block->addr + size;
@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
alloc->n_free_blocks++; alloc->n_free_blocks++;
} }
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
int pos = 0;
for (int i = 0; i < n; i++) {
if (list[i] != -1) {
alloc->parse_seq[pos] = list[i];
pos++;
}
}
alloc->has_parse_seq = true;
}
void ggml_allocr_reset(struct ggml_allocr * alloc) { void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1; alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
/*.hash_table = */ {{0}}, /*.hash_table = */ {{0}},
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ false, /*.measure = */ false,
/*.parse_seq = */ {0},
/*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
/*.hash_table = */ {{0}}, /*.hash_table = */ {{0}},
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ true, /*.measure = */ true,
/*.parse_seq = */ {0},
/*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -473,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
allocate_node(alloc, input); allocate_node(alloc, input);
} }
} }
for (int i = 0; i < gf->n_nodes; i++) { for (int ind = 0; ind < gf->n_nodes; ind++) {
int i;
if (alloc->has_parse_seq) {
i = alloc->parse_seq[ind];
} else {
i = ind;
}
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs) // allocate parents (leafs)

View file

@ -10,6 +10,10 @@ extern "C" {
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph are optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);

File diff suppressed because it is too large Load diff

View file

@ -8,29 +8,25 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
void ggml_init_cublas(void); GGML_API void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split); GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
GGML_API void ggml_cuda_free_scratch(void);
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
// TODO: export these with GGML_API GGML_API int ggml_cuda_get_device_count(void);
void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
// try to find operations that can be run concurrently in the graph // try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes // you should run it again if the topology of your graph changes
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
// if the graph has been optimized for concurrently dispatch // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx); int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
// output the concur_list for ggml_alloc
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal // same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel // creates gf->n_threads command buffers in parallel

View file

@ -5,7 +5,6 @@
#import <Foundation/Foundation.h> #import <Foundation/Foundation.h>
#import <Metal/Metal.h> #import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#undef MIN #undef MIN
#undef MAX #undef MAX
@ -79,6 +78,14 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@ -110,13 +117,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->concur_list_len = 0; ctx->concur_list_len = 0;
// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {
fprintf(stderr, "%s: using MPS\n", __func__);
} else {
fprintf(stderr, "%s: not using MPS\n", __func__);
GGML_ASSERT(false && "MPS not supported");
}
#if 0 #if 0
// compile from source string and show compile log // compile from source string and show compile log
@ -126,7 +126,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
exit(1); return NULL;
} }
} }
#else #else
@ -144,7 +144,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
exit(1); return NULL;
} }
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
@ -156,17 +156,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#endif #endif
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
exit(1); return NULL;
} }
} }
#endif #endif
// load kernels // load kernels
{ {
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
if (error) { \
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}
GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add);
GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(add_row);
@ -196,6 +201,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@ -228,11 +241,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb; ctx->n_cb = n_cb;
} }
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) { int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
if (ctx->concur_list_len) { return ctx->concur_list_len;
return true;
} }
return false;
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
return ctx->concur_list;
} }
// finds the Metal buffer that contains the tensor data on the GPU device // finds the Metal buffer that contains the tensor data on the GPU device
@ -375,7 +389,7 @@ void ggml_metal_get_tensor(
void ggml_metal_graph_find_concurrency( void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf, bool check_mem) {
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
int nodes_unused[GGML_MAX_CONCUR]; int nodes_unused[GGML_MAX_CONCUR];
@ -422,7 +436,7 @@ void ggml_metal_graph_find_concurrency(
} }
} }
} }
if (exe_flag) { if (exe_flag && check_mem) {
// check if nodes[i]'s data will be overwritten by a node before nodes[i]. // check if nodes[i]'s data will be overwritten by a node before nodes[i].
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
int64_t data_start = (int64_t) gf->nodes[i]->data; int64_t data_start = (int64_t) gf->nodes[i]->data;
@ -506,7 +520,7 @@ void ggml_metal_graph_compute(
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx]; id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = nil; id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb; const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
@ -515,10 +529,6 @@ void ggml_metal_graph_compute(
const int i = has_concur ? ctx->concur_list[ind] : ind; const int i = has_concur ? ctx->concur_list[ind] : ind;
if (i == -1) { if (i == -1) {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
continue;
}
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
continue; continue;
} }
@ -592,10 +602,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row]; [encoder setComputePipelineState:ctx->pipeline_add_row];
@ -613,10 +619,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row]; [encoder setComputePipelineState:ctx->pipeline_mul_row];
@ -634,10 +636,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float scale = *(const float *) src1->data; const float scale = *(const float *) src1->data;
[encoder setComputePipelineState:ctx->pipeline_scale]; [encoder setComputePipelineState:ctx->pipeline_scale];
@ -653,10 +651,6 @@ void ggml_metal_graph_compute(
switch (ggml_get_unary_op(gf->nodes[i])) { switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -667,10 +661,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_relu]; [encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -681,10 +671,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_gelu]; [encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -701,10 +687,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32; const int nth = 32;
[encoder setComputePipelineState:ctx->pipeline_soft_max]; [encoder setComputePipelineState:ctx->pipeline_soft_max];
@ -719,10 +701,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *)(dst->op_params))[0]; const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
@ -740,53 +718,43 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne00 == ne10);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
uint gqa = ne12/ne02;
GGML_ASSERT(ne03 == ne13); GGML_ASSERT(ne03 == ne13);
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if (ggml_is_contiguous(src0) && if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) && ggml_is_contiguous(src1) &&
(src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { src1t == GGML_TYPE_F32 &&
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
if (encoder != nil) { ne00%32 == 0 &&
[encoder endEncoding]; ne11 > 1) {
encoder = nil; switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
} }
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
// for F32 x F32 we use MPS [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
MPSMatrixDescriptor * desc = [MPSMatrixDescriptor [encoder setThreadgroupMemoryLength:8192 atIndex:0];
matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
initWithDevice:ctx->device transposeLeft:false transposeRight:true
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
// we need to do ne12 multiplications
// TODO: is there a way to do this in parallel - currently very slow ..
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
for (int64_t i02 = 0; i02 < ne12; ++i02) {
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
size_t offs_src1_cur = offs_src1 + i02*nb12;
size_t offs_dst_cur = offs_dst + i02*nb2;
MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ];
[mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
} }
} else { else {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
@ -885,23 +853,24 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else #else
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif #endif
} }
else if (src0t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -910,10 +879,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
@ -939,10 +904,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -962,10 +923,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_NORM: case GGML_OP_NORM:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float eps = 1e-5f; const float eps = 1e-5f;
const int nth = 256; const int nth = 256;
@ -984,10 +941,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
GGML_ASSERT((src0t == GGML_TYPE_F32)); GGML_ASSERT((src0t == GGML_TYPE_F32));
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
@ -1027,10 +980,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *) dst->op_params)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
@ -1071,10 +1020,6 @@ void ggml_metal_graph_compute(
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32; const int nth = 32;
switch (src0t) { switch (src0t) {

File diff suppressed because it is too large Load diff

View file

@ -271,20 +271,29 @@ struct llama_mmap {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
} }
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) { if (prefetch) {
// Advise the kernel to preload the mapped memory // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
// will dynamically load it using GetProcAddress.
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32;
// This call is guaranteed to succeed.
hKernel32 = GetModuleHandleW(L"kernel32.dll");
// This call may fail if on a pre-Win8 system.
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
if (pPrefetchVirtualMemory) {
// Advise the kernel to preload the mapped memory.
WIN32_MEMORY_RANGE_ENTRY range; WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr; range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size; range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str()); llama_format_win_err(GetLastError()).c_str());
} }
} }
#else }
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
} }
~llama_mmap() { ~llama_mmap() {

232
llama.cpp
View file

@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) #if !defined(GGML_USE_CUBLAS)
#include "ggml-alloc.h" #include "ggml-alloc.h"
#define LLAMA_USE_ALLOCATOR #define LLAMA_USE_ALLOCATOR
#else #else
@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
// memory sizes (calculated for n_batch == 512) // memory sizes (calculated for n_batch == 512)
// //
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx) static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
{ {
static std::map<e_model, size_t> k_sizes = { std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB }, { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB }, { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB }, { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
@ -996,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
return "unknown"; return "unknown";
} }
static const char *llama_ftype_name(enum llama_ftype ftype) { const char * llama_ftype_name(enum llama_ftype ftype) {
switch (ftype) { switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@ -1609,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
ggml_set_name(Q, "Q"); ggml_set_name(Q, "Q");
struct ggml_tensor * K = struct ggml_tensor * K =
ggml_permute(ctx0, ggml_view_3d(ctx0, kv_self.k,
ggml_reshape_3d(ctx0, n_embd_head, n_past + N, n_head_kv,
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa), ggml_element_size(kv_self.k)*n_embd_gqa,
n_embd_head, n_head_kv, n_past + N), ggml_element_size(kv_self.k)*n_embd_head,
0, 2, 1, 3); ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
offload_func_kq(K); offload_func_kq(K);
ggml_set_name(K, "K"); ggml_set_name(K, "K");
@ -1642,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
struct ggml_tensor * V = struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v, ggml_view_3d(ctx0, kv_self.v,
n_past + N, n_embd_head, n_head_kv, n_past + N, n_embd_head, n_head_kv,
n_ctx*ggml_element_size(kv_self.v), ggml_element_size(kv_self.v)*n_ctx,
n_ctx*ggml_element_size(kv_self.v)*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il); ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
offload_func_v(V); offload_func_v(V);
ggml_set_name(V, "V"); ggml_set_name(V, "V");
@ -1801,6 +1801,13 @@ static bool llama_eval_internal(
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
LLAMA_ASSERT(n_tokens > 0);
LLAMA_ASSERT(n_past >= 0);
LLAMA_ASSERT(n_threads > 0);
// TODO: keep the values of n_batch and n_ctx
// LLAMA_ASSERT(n_tokens <= n_batch);
// LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_time_us();
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
@ -1848,11 +1855,7 @@ static bool llama_eval_internal(
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) { if (lctx.ctx_metal) {
// TODO: disabled until #2413 is resolved
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
//}
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
ggml_metal_graph_compute(lctx.ctx_metal, gf); ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, res); ggml_metal_get_tensor (lctx.ctx_metal, res);
@ -1860,22 +1863,6 @@ static bool llama_eval_internal(
ggml_metal_get_tensor(lctx.ctx_metal, embeddings); ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
} }
} else { } else {
// IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
// ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
// coprocessor.
//
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
// But for now, we have focused only on Matrix x Vector Metal multiplication.
//
// TODO: avoid these syncs via shared memory (ref #1696)
//
if (lctx.ctx_metal) {
// We need to sync the GPU KV cache with the CPU KV cache
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
} }
#else #else
@ -2100,37 +2087,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
// grammar - internal // grammar - internal
// //
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar { struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules; const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks; std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
}; };
struct llama_grammar_candidate { struct llama_grammar_candidate {
size_t index; size_t index;
const uint32_t * code_points; const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
}; };
// NOTE: assumes valid utf8 (but checks for overrun) // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// adds a terminating 0 for use as pointer // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
std::vector<uint32_t> decode_utf8(const char * src) { std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; const char * src,
llama_partial_utf8 partial_start) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
const char * pos = src; const char * pos = src;
std::vector<uint32_t> code_points; std::vector<uint32_t> code_points;
uint32_t value = partial_start.value;
int n_remain = partial_start.n_remain;
// continue previous decode, if applicable
while (*pos != 0 && n_remain > 0) {
uint8_t next_byte = static_cast<uint8_t>(*pos);
if ((next_byte >> 6) != 2) {
// invalid sequence, abort
code_points.push_back(0);
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
}
value = (value << 6) + (next_byte & 0x3F);
++pos;
--n_remain;
}
if (partial_start.n_remain > 0 && n_remain == 0) {
code_points.push_back(value);
}
// decode any subsequent utf-8 sequences, which may end in an incomplete one
while (*pos != 0) { while (*pos != 0) {
uint8_t first_byte = static_cast<uint8_t>(*pos); uint8_t first_byte = static_cast<uint8_t>(*pos);
uint8_t highbits = first_byte >> 4; uint8_t highbits = first_byte >> 4;
int len = lookup[highbits]; n_remain = lookup[highbits] - 1;
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask; if (n_remain < 0) {
const char * end = pos + len; // may overrun! // invalid sequence, abort
++pos; code_points.clear();
for ( ; pos < end && *pos != 0; ++pos) { code_points.push_back(0);
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F); return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
} }
uint8_t mask = (1 << (7 - n_remain)) - 1;
value = first_byte & mask;
++pos;
while (*pos != 0 && n_remain > 0) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
++pos;
--n_remain;
}
if (n_remain == 0) {
code_points.push_back(value); code_points.push_back(value);
} }
}
code_points.push_back(0); code_points.push_back(0);
return code_points;
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
} }
// returns true iff pos points to the end of one of the definitions of a rule // returns true iff pos points to the end of one of the definitions of a rule
@ -2167,6 +2198,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
return std::make_pair(found == is_positive_char, pos); return std::make_pair(found == is_positive_char, pos);
} }
// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
// range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static bool llama_grammar_match_partial_char(
const llama_grammar_element * pos,
const llama_partial_utf8 partial_utf8) {
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
uint32_t partial_value = partial_utf8.value;
int n_remain = partial_utf8.n_remain;
// invalid sequence or 7-bit char split across 2 bytes (overlong)
if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
return false;
}
// range of possible code points this partial UTF-8 sequence could complete to
uint32_t low = partial_value << (n_remain * 6);
uint32_t high = low | ((1 << (n_remain * 6)) - 1);
if (low == 0) {
if (n_remain == 2) {
low = 1 << 11;
} else if (n_remain == 3) {
low = 1 << 16;
}
}
do {
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
// inclusive range, e.g. [a-z]
if (pos->value <= high && low <= pos[1].value) {
return is_positive_char;
}
pos += 2;
} else {
// exact char match, e.g. [a] or "a"
if (low <= pos->value && pos->value <= high) {
return is_positive_char;
}
pos += 1;
}
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
return !is_positive_char;
}
// transforms a grammar pushdown stack into N possible stacks, all ending // transforms a grammar pushdown stack into N possible stacks, all ending
// at a character range (terminal element) // at a character range (terminal element)
static void llama_grammar_advance_stack( static void llama_grammar_advance_stack(
@ -2267,8 +2348,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
std::vector<llama_grammar_candidate> rejects; std::vector<llama_grammar_candidate> rejects;
if (stack.empty()) { if (stack.empty()) {
// accept nothing; EOS is handled elsewhere for (auto tok : candidates) {
rejects.insert(rejects.end(), candidates.begin(), candidates.end()); if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
rejects.push_back(tok);
}
}
return rejects; return rejects;
} }
@ -2276,10 +2360,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
std::vector<llama_grammar_candidate> next_candidates; std::vector<llama_grammar_candidate> next_candidates;
for (auto tok : candidates) { for (auto tok : candidates) {
if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) { if (*tok.code_points == 0) {
if (tok.code_points[1] != 0) { // reached end of full codepoints in token, reject iff it ended in a partial sequence
next_candidates.push_back({ tok.index, tok.code_points + 1 }); // that cannot satisfy this position in grammar
if (tok.partial_utf8.n_remain != 0 &&
!llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
rejects.push_back(tok);
} }
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
} else { } else {
rejects.push_back(tok); rejects.push_back(tok);
} }
@ -2297,7 +2386,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates); auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
for (auto tok : next_rejects) { for (auto tok : next_rejects) {
rejects.push_back({ tok.index, tok.code_points - 1 }); rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
} }
return rejects; return rejects;
@ -2362,7 +2451,7 @@ struct llama_grammar * llama_grammar_init(
} }
} while (true); } while (true);
return new llama_grammar{ std::move(vec_rules), std::move(stacks) }; return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
} }
void llama_grammar_free(struct llama_grammar * grammar) { void llama_grammar_free(struct llama_grammar * grammar) {
@ -2668,7 +2757,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
const llama_token eos = llama_token_eos(); const llama_token eos = llama_token_eos();
std::vector<std::vector<uint32_t>> candidates_decoded; std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
std::vector<llama_grammar_candidate> candidates_grammar; std::vector<llama_grammar_candidate> candidates_grammar;
for (size_t i = 0; i < candidates->size; ++i) { for (size_t i = 0; i < candidates->size; ++i) {
@ -2681,8 +2770,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
} else if (*str == 0) { } else if (*str == 0) {
candidates->data[i].logit = -INFINITY; candidates->data[i].logit = -INFINITY;
} else { } else {
candidates_decoded.push_back(decode_utf8(str)); candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
candidates_grammar.push_back({ i, candidates_decoded.back().data() }); candidates_grammar.push_back({
i, candidates_decoded.back().first.data(), candidates_decoded.back().second
});
} }
} }
@ -2883,11 +2974,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
} }
const char * str = llama_token_to_str(ctx, token); const char * str = llama_token_to_str(ctx, token);
// Note terminating 0 in decoded string // Note terminating 0 in decoded string
auto code_points = decode_utf8(str); const auto decoded = decode_utf8(str, grammar->partial_utf8);
const auto & code_points = decoded.first;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
} }
grammar->partial_utf8 = decoded.second;
LLAMA_ASSERT(!grammar->stacks.empty()); LLAMA_ASSERT(!grammar->stacks.empty());
ctx->t_sample_us += ggml_time_us() - t_start_sample_us; ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@ -3306,7 +3400,18 @@ struct llama_context * llama_new_context_with_model(
int n_past = hparams.n_ctx - n_tokens; int n_past = hparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
ctx->ctx_metal = ggml_metal_init(1);
if (!ctx->ctx_metal) {
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
llama_free(ctx);
return NULL;
}
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
}
#endif
// measure memory requirements for the graph // measure memory requirements for the graph
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
@ -3324,6 +3429,11 @@ struct llama_context * llama_new_context_with_model(
ctx->buf_alloc.resize(alloc_size); ctx->buf_alloc.resize(alloc_size);
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
#ifdef GGML_USE_METAL
if (ctx->ctx_metal) {
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
}
#endif
} }
#else #else
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@ -3338,7 +3448,6 @@ struct llama_context * llama_new_context_with_model(
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) { if (params.n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers // this allocates all Metal resources and memory buffers
ctx->ctx_metal = ggml_metal_init(1);
void * data_ptr = NULL; void * data_ptr = NULL;
size_t data_size = 0; size_t data_size = 0;
@ -3367,8 +3476,7 @@ struct llama_context * llama_new_context_with_model(
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF #undef LLAMA_METAL_CHECK_BUF
} }
#endif #endif
@ -4178,6 +4286,10 @@ int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd; return ctx->model.hparams.n_embd;
} }
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
}
int llama_get_vocab_from_model( int llama_get_vocab_from_model(
const struct llama_model * model, const struct llama_model * model,
const char * * strings, const char * * strings,

View file

@ -97,7 +97,7 @@ extern "C" {
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists. // if it exists.
// It might not exist for progress report where '.' is output repeatedly. // It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data); typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
@ -353,6 +353,8 @@ extern "C" {
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
// Get the vocabulary as output parameters. // Get the vocabulary as output parameters.
// Returns number of results. // Returns number of results.
LLAMA_API int llama_get_vocab( LLAMA_API int llama_get_vocab(

View file

@ -0,0 +1,3 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip

View file

@ -11,5 +11,7 @@ llama_add_test(test-quantize-fns.cpp)
llama_add_test(test-quantize-perf.cpp) llama_add_test(test-quantize-perf.cpp)
llama_add_test(test-sampling.cpp) llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
llama_add_test(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
llama_add_test(test-llama-grammar.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/common.cpp)
llama_add_test(test-grad0.cpp) # SLOW llama_add_test(test-grad0.cpp) # SLOW
# llama_add_test(test-opt.cpp) # SLOW # llama_add_test(test-opt.cpp) # SLOW

View file

@ -0,0 +1,249 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.h"
#include "examples/grammar-parser.cpp"
#include <cassert>
int main()
{
grammar_parser::parse_state parsed_grammar;
const char *grammar_bytes = R"""(root ::= (expr "=" term "\n")+
expr ::= term ([-+*/] term)*
term ::= [0-9]+)""";
parsed_grammar = grammar_parser::parse(grammar_bytes);
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_5", 5},
{"expr_6", 6},
{"root", 0},
{"root_1", 1},
{"root_4", 4},
{"term", 3},
{"term_7", 7},
};
uint32_t index = 0;
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
{
std::string key = it->first;
uint32_t value = it->second;
std::pair<std::string, uint32_t> expected_pair = expected[index];
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
}
assert(expected_pair.first == key && expected_pair.second == value);
index++;
}
std::vector<llama_grammar_element> expected_rules = {
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 6},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_RULE_REF, 6},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
};
index = 0;
for (auto rule : parsed_grammar.rules)
{
// compare rule to expected rule
for (uint32_t i = 0; i < rule.size(); i++)
{
llama_grammar_element element = rule[i];
llama_grammar_element expected_element = expected_rules[index];
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element.type && expected_element.value == element.value);
index++;
}
}
const char *longer_grammar_bytes = R"""(
root ::= (expr "=" ws term "\n")+
expr ::= term ([-+*/] term)*
term ::= ident | num | "(" ws expr ")" ws
ident ::= [a-z] [a-z0-9_]* ws
num ::= [0-9]+ ws
ws ::= [ \t\n]*
)""";
parsed_grammar = grammar_parser::parse(longer_grammar_bytes);
expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
index = 0;
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
{
std::string key = it->first;
uint32_t value = it->second;
std::pair<std::string, uint32_t> expected_pair = expected[index];
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
}
assert(expected_pair.first == key && expected_pair.second == value);
index++;
}
expected_rules = {
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 6},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
};
index = 0;
for (auto rule : parsed_grammar.rules)
{
// compare rule to expected rule
for (uint32_t i = 0; i < rule.size(); i++)
{
llama_grammar_element element = rule[i];
llama_grammar_element expected_element = expected_rules[index];
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element.type && expected_element.value == element.value);
index++;
}
}
return 0;
}

View file

@ -0,0 +1,403 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.cpp"
#include "examples/common.cpp"
#include "examples/grammar-parser.cpp"
#include <cassert>
int main()
{
grammar_parser::parse_state parsed_grammar;
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
std::vector<std::vector<llama_grammar_element>> expected_rules = {
{{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
{{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
};
for (auto pair : expected)
{
parsed_grammar.symbol_ids[pair.first] = pair.second;
}
for (auto rule : expected_rules)
{
parsed_grammar.rules.push_back({});
for (auto element : rule)
{
parsed_grammar.rules.back().push_back(element);
}
}
llama_grammar *grammar = NULL;
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}};
auto index = 0;
for (auto stack : grammar->stacks)
{
// compare stack to expected_stack
for (uint32_t i = 0; i < stack.size(); i++)
{
auto element = stack[i];
auto expected_element = expected_stacks[index][i];
// pretty print error message before asserting
if (expected_element.type != element->type || expected_element.value != element->value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element->type && expected_element.value == element->value);
}
index++;
}
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24);
for (size_t i = 0; i < 24; ++i)
{
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i;
cp[1] = 0;
next_candidates[i] = {i, cp, {}};
}
std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
};
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
std::vector<std::vector<llama_grammar_candidate>> all_rejects;
for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
{
rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
all_rejects.push_back(rejects);
}
index = 0;
for (auto rej : all_rejects)
{
for (uint32_t i = 0; i < rej.size(); i++)
{
auto element = rej[i];
auto expected_element = expected_reject[index][i];
assert(element.index == expected_element.first && *element.code_points == expected_element.second);
}
index++;
}
for (auto &candidate : next_candidates)
{
delete[] candidate.code_points;
candidate.code_points = nullptr;
}
delete grammar;
return 0;
}