Merge branch 'ggerganov:master' into master

commit bf7eeb731f
Author: Marc
Date:   2023-09-07 15:09:52 -05:00 (committed via GitHub)
28 changed files with 262 additions and 195 deletions


@@ -3,6 +3,7 @@ Checks: >
     bugprone-*,
     -bugprone-easily-swappable-parameters,
     -bugprone-implicit-widening-of-multiplication-result,
+    -bugprone-misplaced-widening-cast,
     -bugprone-narrowing-conversions,
     readability-*,
     -readability-avoid-unconditional-preprocessor-if,
@@ -15,4 +16,8 @@ Checks: >
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    misc-*,
+    -misc-const-correctness,
+    -misc-non-private-member-variables-in-classes,
+    -misc-no-recursion,
 FormatStyle: none


@@ -83,6 +83,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
+option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -174,7 +175,9 @@ if (LLAMA_METAL)
     set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

     add_compile_definitions(GGML_USE_METAL)
-    #add_compile_definitions(GGML_METAL_NDEBUG)
+    if (LLAMA_METAL_NDEBUG)
+        add_compile_definitions(GGML_METAL_NDEBUG)
+    endif()

     # get full path to the file
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

@@ -423,7 +426,7 @@ if (LLAMA_ALL_WARNINGS)
     )
     if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
         # g++ only
-        set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+        set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
     endif()
 else()
     # todo : msvc


@@ -42,9 +42,9 @@ endif

 default: $(BUILD_TARGETS)

-test:
-    @echo "Running tests..."
-    @for test_target in $(TEST_TARGETS); do \
+test: $(TEST_TARGETS)
+    @failures=0; \
+    for test_target in $(TEST_TARGETS); do \
         if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
             ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
         elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \

@@ -52,10 +52,21 @@ test:
         elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
             continue; \
         else \
+            echo "Running test $$test_target..."; \
             ./$$test_target; \
         fi; \
-    done
-    @echo "All tests have been run."
+        if [ $$? -ne 0 ]; then \
+            printf 'Test $$test_target FAILED!\n\n' $$test_target; \
+            failures=$$(( failures + 1 )); \
+        else \
+            printf 'Test %s passed.\n\n' $$test_target; \
+        fi; \
+    done; \
+    if [ $$failures -gt 0 ]; then \
+        printf '\n%s tests failed.\n' $$failures; \
+        exit 1; \
+    fi
+    @echo 'All tests passed.'

 all: $(BUILD_TARGETS) $(TEST_TARGETS)

@@ -91,8 +102,8 @@ else
 OPT = -O3
 endif
 MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
-MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
+MK_CFLAGS = $(OPT) -std=c11 -fPIC
+MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
 MK_LDFLAGS =

 ifdef LLAMA_DEBUG

@@ -123,7 +134,7 @@ MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-m
 ifeq '' '$(findstring clang++,$(CXX))'
     # g++ only
-    MK_CXXFLAGS += -Wno-format-truncation
+    MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif

 # OS specific

@@ -360,6 +371,9 @@ ifdef LLAMA_METAL
     MK_CPPFLAGS += -DGGML_USE_METAL
     MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
     OBJS += ggml-metal.o
+ifdef LLAMA_METAL_NDEBUG
+    MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL

@@ -378,9 +392,8 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS

 # combine build flags with cmdline overrides
-override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
-override CFLAGS := $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

 #


@@ -57,7 +57,7 @@ int32_t get_num_physical_cores() {
             siblings.insert(line);
         }
     }
-    if (siblings.size() > 0) {
+    if (!siblings.empty()) {
         return static_cast<int32_t>(siblings.size());
     }
 #elif defined(__APPLE__) && defined(__MACH__)

@@ -773,7 +773,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         LOG("warming up the model with an empty run\n");

         const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
         llama_reset_timings(lctx);
     }


@@ -20,6 +20,9 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
+
 //
 // CLI argument parsing
 //
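Note: a minimal sketch of how the new die()/die_fmt() helpers are meant to be used at a call site. The open_or_die() wrapper below is hypothetical and not part of this commit; the macro bodies mirror the lines added to common.h above.

    #include <cerrno>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Same definitions as added to common.h: print "error: ..." to stderr and exit(1).
    #define die(msg)          do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
    #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)

    // Hypothetical call site: abort with a formatted message instead of throwing.
    static FILE * open_or_die(const char * path) {
        FILE * fp = std::fopen(path, "rb");
        if (fp == NULL) {
            die_fmt("%s: %s", strerror(errno), path);
        }
        return fp;
    }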


@@ -415,6 +415,7 @@ namespace grammar_parser {
     std::vector<const llama_grammar_element *> parse_state::c_rules() {
         std::vector<const llama_grammar_element *> ret;
+        ret.reserve(rules.size());
         for (const auto & rule : rules) {
             ret.push_back(rule.data());
         }


@@ -5,6 +5,7 @@ import argparse
 import math
 import struct
 import sys
+from enum import IntEnum
 from pathlib import Path

 import numpy as np

@@ -34,10 +35,35 @@ GGML_QUANT_SIZES = {
     gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
 }

+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32 = 0
+    MOSTLY_F16 = 1
+    MOSTLY_Q4_0 = 2
+    MOSTLY_Q4_1 = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0 = 7
+    MOSTLY_Q5_0 = 8
+    MOSTLY_Q5_1 = 9
+    MOSTLY_Q2_K = 10
+    MOSTLY_Q3_K_S = 11
+    MOSTLY_Q3_K_M = 12
+    MOSTLY_Q3_K_L = 13
+    MOSTLY_Q4_K_S = 14
+    MOSTLY_Q4_K_M = 15
+    MOSTLY_Q5_K_S = 16
+    MOSTLY_Q5_K_M = 17
+    MOSTLY_Q6_K = 18
+
 class Hyperparameters:
     def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
-        self.n_ff = 0
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32

     def set_n_ff(self, model):
         ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')

@@ -53,16 +79,21 @@ class Hyperparameters:
             self.n_head,
             self.n_layer,
             self.n_rot,
-            self.ftype,
+            ftype,
         ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
         return 4 * 7

     def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

 class Vocab:
-    def __init__(self):
+    def __init__(self, load_scores = True):
         self.items = []
+        self.load_scores = load_scores

     def load(self, data, offset, n_vocab):
         orig_offset = offset

@@ -70,20 +101,24 @@ class Vocab:
             itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
             assert itemlen < 4096, 'Absurd vocab item length'
             offset += 4
-            vocab = bytes(data[offset:offset + itemlen])
+            item_text = bytes(data[offset:offset + itemlen])
             offset += itemlen
-            score = struct.unpack('<f', data[offset:offset + 4])[0]
-            offset += 4
-            self.items.append((vocab, score))
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
         return offset - orig_offset

 class Tensor:
-    def __init__(self):
+    def __init__(self, use_padding = True):
         self.name = None
         self.dims: tuple[int, ...] = ()
         self.dtype = None
         self.start_offset = 0
         self.len_bytes = np.int64(0)
+        self.use_padding = use_padding

     def load(self, data, offset):
         orig_offset = offset

@@ -99,7 +134,7 @@ class Tensor:
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
         offset += name_len
-        pad = ((offset + 31) & ~31) - offset
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
         offset += pad
         n_elems = np.prod(self.dims)
         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)

@@ -109,7 +144,7 @@ class Tensor:
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset

-class GGMLV3Model:
+class GGMLModel:
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None

@@ -117,20 +152,52 @@ class GGMLV3Model:
         self.tensors = []

     def validate_header(self, data, offset):
-        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
-            raise ValueError('Only GGJTv3 supported')
-        return 8
+        magic = bytes(data[offset:offset + 4])
+        if magic == b'GGUF':
+            raise ValueError('File is already in GGUF format.')
+        if magic == b'lmgg':
+            self.file_format = GGMLFormat.GGML
+            self.format_version = 1
+            return 4
+        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+        if magic == b'fmgg':
+            if version != 1:
+                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+            self.file_format = GGMLFormat.GGMF
+            self.format_version = version
+            return 8
+        if magic == b'tjgg':
+            if version < 1 or version > 3:
+                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+            self.file_format = GGMLFormat.GGJT
+            self.format_version = version
+            return 8
+        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+    def validate_conversion(self, ftype):
+        err = ''
+        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
+        if len(err) > 0:
+            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

     def load(self, data, offset):
         offset += self.validate_header(data, offset)
         hp = Hyperparameters()
         offset += hp.load(data, offset)
-        vocab = Vocab()
+        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        self.validate_conversion(hp.ftype)
+        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
         offset += vocab.load(data, offset, hp.n_vocab)
         tensors: list[Tensor] = []
         tensor_map = {}
         while offset < len(data):
-            tensor = Tensor()
+            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
             offset += tensor.load(data, offset)
             tensor_map[tensor.name] = len(tensors)
             tensors.append(tensor)

@@ -168,7 +235,10 @@ class GGMLToGGUF:
     def save(self):
         print('* Preparing to save GGUF file')
-        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        gguf_writer = gguf.GGUFWriter(
+            self.cfg.output,
+            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
+            use_temp_file = False )
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
         if self.special_vocab is not None:

@@ -185,7 +255,10 @@ class GGMLToGGUF:
     def add_params(self, gguf_writer):
         hp = self.model.hyperparameters
         cfg = self.cfg
-        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        if cfg.desc is not None:
+            desc = cfg.desc
+        else:
+            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
         try:
             # Filenames aren't necessarily valid UTF8.
             name = cfg.name if cfg.name is not None else cfg.input.name

@@ -195,6 +268,7 @@ class GGMLToGGUF:
         if name is not None:
             gguf_writer.add_name(name)
         gguf_writer.add_description(desc)
+        gguf_writer.add_file_type(int(hp.ftype))
         if self.params_override is not None:
             po = self.params_override
             assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'

@@ -231,7 +305,8 @@ class GGMLToGGUF:
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
-            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            assert len(tokens) == hp.n_vocab, \
+                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
             gguf_writer.add_token_list(tokens)
             gguf_writer.add_token_scores(scores)
             if len(toktypes) > 0:

@@ -283,7 +358,11 @@ class GGMLToGGUF:
                 tempdims[1] = tempdims[0]
                 tempdims[0] = temp
             # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+            gguf_writer.add_tensor(
+                mapped_name,
+                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+                raw_shape = tempdims,
+                raw_dtype = tensor.dtype )

 def handle_metadata(cfg, hp):
     import convert

@@ -305,32 +384,46 @@ def handle_metadata(cfg, hp):
         params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
     else:
         raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    vocab = convert.load_vocab(
+        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+        cfg.vocabtype )
     # FIXME: Respect cfg.vocab_dir?
     svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
     return (params, vocab, svocab)

 def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
-    parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
-    parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
-    parser.add_argument('--name', help = 'Set model name')
-    parser.add_argument('--desc', help = 'Set model description')
-    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, required = True,
+        help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, required = True,
+        help ='Output GGUF filename')
+    parser.add_argument('--name',
+        help = 'Set model name')
+    parser.add_argument('--desc',
+        help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1,
+        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06',
+        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048,
+        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path,
+        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path,
+        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
     return parser.parse_args()

 def main():
     cfg = handle_args()
     print(f'* Using config: {cfg}')
     print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
+        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
     data = np.memmap(cfg.input, mode = 'r')
-    model = GGMLV3Model()
+    model = GGMLModel()
     print('* Scanning GGML input file')
     offset = model.load(data, 0)
     print(f'* GGML model hyperparameters: {model.hyperparameters}')

@@ -345,7 +438,12 @@ def main():
         print(f'* Special vocab: {special_vocab}')
     else:
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
+    if model.file_format == GGMLFormat.GGML:
+        print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+    converter = GGMLToGGUF(model, data, cfg,
+        params_override = params_override,
+        vocab_override = vocab_override,
+        special_vocab = special_vocab )
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')


@@ -266,7 +266,7 @@ class Params:
         f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None

         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base and f_rope_freq_base == 1000000:
+        if f_rope_freq_base == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:

@@ -841,9 +841,9 @@ class OutputFile:
         name = "LLaMA"

         # TODO: better logic to determine model name
-        if (params.n_ctx == 4096):
+        if params.n_ctx == 4096:
             name = "LLaMA v2"
-        elif params.path_model:
+        elif params.path_model is not None:
             name = str(params.path_model.parent).split('/')[-1]

         self.gguf.add_name (name)

@@ -856,13 +856,13 @@ class OutputFile:
         self.gguf.add_head_count_kv (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)

-        if params.f_rope_freq_base:
+        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

-        if params.f_rope_scale:
+        if params.f_rope_scale is not None:
            self.gguf.add_rope_scale_linear(params.f_rope_scale)

-        if params.ftype:
+        if params.ftype is not None:
            self.gguf.add_file_type(params.ftype)

     def add_meta_vocab(self, vocab: Vocab) -> None:


@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+#include "common.h"

 #include <unordered_map>
 #include <vector>

@@ -499,10 +500,10 @@ struct llama_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
+            die_fmt("fread failed: %s", strerror(errno));
         }
         if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+            die("unexpectedly reached end of file");
         }
     }

@@ -597,8 +598,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
         llama_file file(filename, "rb");
         if (!file.fp) {
-            fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
-            exit(1);
+            die_fmt("%s: %s", strerror(errno), filename);
         }
         const int n_vocab = config->vocab_size;
         /* uint32_t max_token_length = */ file.read_u32(); // unused


@@ -23,7 +23,7 @@ extern "C" {
 struct MyModel* create_mymodel(int argc, char ** argv) {
     gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return nullptr;
     }


@@ -11,7 +11,7 @@
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }


@@ -953,7 +953,7 @@ int main(int argc, char ** argv) {
     gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }


@@ -925,7 +925,7 @@ int main(int argc, char ** argv) {
     gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }


@@ -986,7 +986,12 @@ int main(int argc, char ** argv) {
         test t(inst, lmodel, ctx);

         // warmup run
-        test_gen(ctx, 1, 0, t.n_threads);
+        if (t.n_prompt > 0) {
+            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+        }
+        if (t.n_gen > 0) {
+            test_gen(ctx, 1, 0, t.n_threads);
+        }

         for (int i = 0; i < params.reps; i++) {
             uint64_t t_start = get_time_ns();


@@ -48,8 +48,9 @@ static bool is_interacting = false;
 void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
+    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & output_tokens
+) {
     if (params.logdir.empty()) {
         return;
     }

@@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }

@@ -303,7 +304,7 @@ int main(int argc, char ** argv) {
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
-    if (session_tokens.size() > 0) {
+    if (!session_tokens.empty()) {
         for (llama_token id : session_tokens) {
             if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                 break;

@@ -401,7 +402,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: interactive mode on.\n", __func__);

-        if (params.antiprompt.size()) {
+        if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
             }

@@ -499,7 +500,7 @@ int main(int argc, char ** argv) {
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
-        if (embd.size() > 0) {
+        if (!embd.empty()) {
             // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
             // --prompt or --file which uses the same value.
             int max_embd_size = n_ctx - 4;

@@ -624,7 +625,7 @@ int main(int argc, char ** argv) {
                 LOG("n_past = %d\n", n_past);
             }

-            if (embd.size() > 0 && !path_session.empty()) {
+            if (!embd.empty() && !path_session.empty()) {
                 session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                 n_session_consumed = session_tokens.size();
             }

@@ -695,7 +696,7 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // check for reverse prompt
-            if (params.antiprompt.size()) {
+            if (!params.antiprompt.empty()) {
                 std::string last_output;
                 for (auto id : last_tokens) {
                     last_output += llama_token_to_piece(ctx, id);

@@ -732,7 +733,7 @@ int main(int argc, char ** argv) {
                 LOG("found EOS token\n");

                 if (params.interactive) {
-                    if (params.antiprompt.size() != 0) {
+                    if (!params.antiprompt.empty()) {
                         // tokenize and inject first reverse prompt
                         const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
                         embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());


@@ -655,7 +655,7 @@ int main(int argc, char ** argv) {
     gpt_params params;

     params.n_batch = 512;
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }


@@ -71,7 +71,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
 }

 // Check if a layer is included/excluded by command line
-bool layer_included(const quantize_stats_params params, const std::string & layer) {
+bool layer_included(const quantize_stats_params & params, const std::string & layer) {
     for (const auto& excluded : params.exclude_layers) {
         if (std::regex_search(layer, std::regex(excluded))) {
             return false;


@@ -143,10 +143,9 @@ int main(int argc, char ** argv) {
         if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
-        } else {
-            if (ftype_str == "COPY") {
-                params.only_copy = true;
-            }
+        }
+        if (ftype_str == "COPY") {
+            params.only_copy = true;
         }
         arg_idx++;
     }


@@ -13,7 +13,7 @@ int main(int argc, char ** argv) {
     params.repeat_last_n = 64;
     params.prompt = "The quick brown fox";

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }

@@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto tokens = llama_tokenize(ctx, params.prompt, true);
     auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);


@@ -139,7 +139,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }

 // convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> probs)
+static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
 {
     json out = json::array();
     for (const auto &prob : probs)

@@ -271,7 +271,7 @@ struct llama_server_context
         return true;
     }

-    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.

@@ -611,7 +611,7 @@ struct llama_server_context
     completion_token_output doCompletion()
     {
-        const completion_token_output token_with_probs = nextToken();
+        auto token_with_probs = nextToken();

         const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;

@@ -1255,7 +1255,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
 struct token_translator {
     llama_context * ctx;
     std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
+    std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
 };

 void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
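Note: several of the signature changes in this file swap pass-by-value for pass-by-const-reference. A minimal illustration of the pattern with a stand-in type (not the server's actual structs):

    #include <string>
    #include <vector>

    struct token_output { int tok; std::string text; }; // stand-in for completion_token_output

    // Taking the vector by const reference avoids copying every element on each call;
    // the same reasoning applies to probs_vector_to_json() and tokenize() above.
    static size_t total_text_size(const std::vector<token_output> & probs) {
        size_t n = 0;
        for (const auto & p : probs) {
            n += p.text.size();
        }
        return n;
    }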


@@ -169,10 +169,6 @@ struct my_llama_hparams {
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;

-    bool operator!=(const my_llama_hparams& other) const {
-        return memcmp(this, &other, sizeof(my_llama_hparams));
-    }
 };

 struct my_llama_layer {

@@ -929,28 +925,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa
     }
 }

-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
     FILE * fp = std::fopen(filename, "rb");
     if (fp == NULL) {

@@ -983,10 +957,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     out.resize(size+1);

     if (std::fread(buf.data(), size, 1, fp) != 1) {
-        throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        die("unexpectedly reached end of file");
     }
     if (ferror(fp)) {
-        throw std::runtime_error(format("read error: %s", strerror(errno)));
+        die_fmt("fread failed: %s", strerror(errno));
     }

     buf[size] = '\0';

@@ -1047,11 +1021,11 @@ void shuffle_ints(int * begin, int * end) {
     if (kid >= 0) { \
         enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
         if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
+            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
         } \
         (dst) = func(ctx, kid); \
     } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
+        die_fmt("key not found in model: %s", skey.c_str()); \
     } \
 }

@@ -1136,7 +1110,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
         read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
         read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
     } else {
-        throw std::runtime_error("unknown optimizer type\n");
+        die("unknown optimizer type");
     }
 }

@@ -1315,20 +1289,20 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
         const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
         if (token_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+            die("cannot find tokenizer vocab in model file");
         }
         const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);

         const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
         if (score_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer scores in model file\n");
+            die("cannot find tokenizer scores in model file");
         }
         const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);

         const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
         if (toktype_idx == -1) {
-            throw std::runtime_error("cannot find token type list in GGUF file\n");
+            die("cannot find token type list in GGUF file");
         }
         const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);

@@ -1356,7 +1330,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
             // read and copy bpe merges
             const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
             if (merges_keyidx == -1) {
-                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+                die("cannot find tokenizer merges in model file");
             }

             const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);

@@ -1988,7 +1962,7 @@ void opt_callback(void * vdata, float * sched) {
     float min_sched = params->adam_min_alpha / params->adam_alpha;
     *sched = min_sched + *sched * (1.0f - min_sched);

-    int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
+    int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f);
     printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0);

     if (data->shuffle_countdown < n_batch) {


@@ -138,7 +138,7 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
     size_t size = ggml_allocr_get_alloc_size(alloc, tensor);

@@ -165,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
             return;
         }
     }

     struct free_block * block = &alloc->free_blocks[best_fit_block];


@@ -307,7 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;

@@ -381,7 +381,7 @@ bool ggml_metal_add_buffer(
             }
         }

-        const size_t size_page = getpagesize();
+        const size_t size_page = sysconf(_SC_PAGESIZE);

         size_t size_aligned = size;
         if ((size_aligned % size_page) != 0) {

@@ -1114,7 +1114,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
                 [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
             } break;
         case GGML_OP_DUP:
         case GGML_OP_CPY:


@@ -220,14 +220,10 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;

     // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {

@@ -235,12 +231,6 @@ kernel void kernel_norm(
         sum[tpitg] += y[i00] * y[i00];
     }

-    //// VARIANCE
-    //// parallel sum
-    //sum[tpitg] = 0.0f;
-    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-    //    sum[tpitg] += y[i00] * y[i00];
-    //}
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {

@@ -249,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;

     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {

@@ -262,7 +247,6 @@ kernel void kernel_norm(
     }
 }
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,

@@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
             }
         }
     }
-
 }

 kernel void kernel_alibi_f32(

@@ -699,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;

     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
+
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);

-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -729,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
+
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);

-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;

                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml.c

@@ -195,7 +195,7 @@ typedef void * thread_ret_t;
 inline static void * ggml_aligned_malloc(size_t size) {
     void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif

@@ -4815,7 +4815,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t obj_alloc_size = 0;

-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {

@@ -5516,7 +5516,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

@@ -5559,7 +5559,7 @@ static struct ggml_tensor * ggml_div_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

@@ -20006,7 +20006,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         struct ggml_tensor * data = NULL;

-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

             ok = ok && data != NULL;

@@ -20047,7 +20047,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             }

             // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (params.no_alloc == false) {
+            if (!params.no_alloc) {
                 //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
                 cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
             }


@@ -1089,6 +1089,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
         if (!max_abs_scale) {
             memset(&y[i], 0, sizeof(block_q6_K));
             y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
             continue;
         }
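Note: the added x += QK_K keeps the input pointer in step with the output when an all-zero super-block is skipped; without it the next block would quantize stale data. A self-contained C++ sketch of the loop shape (toy block size and stand-in logic, not the real k_quants.c internals):

    #include <cstdio>
    #include <vector>

    constexpr int QK = 4; // toy block size standing in for QK_K

    // For each block, either emit a zero record or "quantize" it; the input
    // cursor x must advance by one block in both paths.
    static std::vector<float> quantize_blocks(const float * x, int nb) {
        std::vector<float> out;
        for (int i = 0; i < nb; ++i) {
            bool all_zero = true;
            for (int j = 0; j < QK; ++j) {
                if (x[j] != 0.0f) { all_zero = false; break; }
            }
            if (all_zero) {
                out.push_back(0.0f);
                x += QK;    // the fix above: advance even when skipping the block
                continue;
            }
            out.push_back(x[0]); // stand-in for real quantization of the block
            x += QK;
        }
        return out;
    }

    int main() {
        const float data[8] = { 0, 0, 0, 0, 5, 6, 7, 8 };
        const std::vector<float> q = quantize_blocks(data, 2);
        std::printf("%g %g\n", q[0], q[1]); // prints "0 5": second block is not stale
        return 0;
    }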


@@ -606,16 +606,16 @@ struct llama_mmap {
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }

@@ -3052,33 +3052,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);

@@ -4800,9 +4777,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;

+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif

     int idx = 0;

@@ -5947,7 +5926,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;

-        GGML_ASSERT(rng_ss.fail() == false);
+        GGML_ASSERT(!rng_ss.fail());
     }

     // set logits


@@ -76,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
     return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
 }

-void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
+void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
     int64_t min_time_us = INT64_MAX;
     int64_t total_time_us = 0;
     int64_t min_time_cycles = INT64_MAX;