still not working

ready to rebase

working
Author: mike dupont, 2023-11-24 19:09:19 -05:00
Parent: 04814e718e
Commit: 3faef69427
18 changed files with 2818 additions and 520 deletions

.gitignore (vendored): 3 lines changed

@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
/#llama.cpp#
#*
\\#*

CMakeLists.txt

@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
# Compile flags
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@ -230,7 +230,12 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
# from https://github.com/NVIDIA/cutlass
make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
# add_compile_definitions(GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
@ -312,7 +317,7 @@ if (LLAMA_MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC)
@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "")
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif()
#
set(cuda_flags --verbose -G ${cuda_flags})
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
if (WIN32)
@ -485,8 +493,10 @@ if (NOT MSVC)
add_link_options(-static-libgcc -static-libstdc++)
endif()
endif()
add_link_options("-Wl,-Map=${TARGET}.map")
if (LLAMA_GPROF)
add_compile_options(-pg)
add_compile_options(-pg)
endif()
endif()
@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM)
endif()
add_library(ggml OBJECT
ggml.c
ggml.cpp
ggml.h
ggml-alloc.c
print.hpp
ggml-internal.hpp
llama-internal.hpp
ggml-alloc.cpp
ggml-alloc.h
ggml-backend.c
ggml-backend.cpp
ggml-backend.h
ggml-quants.c
ggml-quants.cpp
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@ -683,7 +696,7 @@ add_library(llama
)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_compile_features(llama PUBLIC cxx_std_20) # don't bump
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}

Makefile

@ -116,7 +116,7 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
@ -537,17 +537,17 @@ $(info )
# Build library
#
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
ggml.o: ggml.cpp ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
$(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-c.o: tests/test-c.cpp llama.h
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
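
Both build files bump the former C sources to C++ (CMAKE_CXX_STANDARD 20, -std=c++20 -fpermissive). One relevant difference, sketched below with a made-up struct: ordered designated initializers, used throughout these sources, are standard in C99 and C++20 but not part of C++11/14/17.

struct sketch_traits {
    const char * type_name;
    int          blck_size;
    bool         is_quantized;
};

int main() {
    // standard in C99 and C++20; not part of C++11/14/17
    sketch_traits t = { .type_name = "f32", .blck_size = 1, .is_quantized = false };
    return t.blck_size == 1 ? 0 : 1;
}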

binding.py (new file, 334 lines)

@ -0,0 +1,334 @@
import os
import json
import re
import clang.cindex
# configurable part
CLANG_VERSION='13.0.1'
# homebrew installs for llvm (brew info llvm gives details):
# x64: /usr/local/opt/llvm/lib
# arm64: /opt/homebrew/opt/llvm/lib
llvmLibPath = "/usr/lib/llvm-15/lib/"
cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"
fileList = [
"ggml.cpp",
"llama.cpp"
]
typeList = [
]
# end of configurable part
clang.cindex.Config.set_library_path(llvmLibPath)
def list_headers_in_dir(path):
# enumerates a folder but keeps the full pathing for the files returned
# and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)
# list all the files in the folder
files = os.listdir(path)
# only include .hxx files
files = list(filter(lambda x: x.endswith('.hxx'), files))
# add the folder path back on
files = list(map(lambda x: path + x, files))
return files
# parse through the list of files specified and expand wildcards
fullFileList = []
for filePath in fileList:
if "*" in filePath:
# wildcard path
basePath = filePath[:-1]
if "*" in basePath:
# if there is still a wildcard, we have an issue...
raise NotImplementedError(
"wildcard only supported at end of file path")
files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
fullFileList = fullFileList + files
else:
# normal path
ff = os.path.join(cxxClientRoot, filePath)
fullFileList.append(ff)
print("DBUG",ff)
# exclude _json.hxx files
fullFileList = list(
filter(lambda x: not x.endswith('_json.hxx'), fullFileList))
# exclude _fmt.hxx files
fullFileList = list(
filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))
# generate a list of regexps from the type list (for handling wildcards)
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))
def is_included_type(name, with_durability=False):
# TODO(brett19): This should be generalized somehow...
if "is_compound_operation" in name:
return False
if "replica_context" in name:
return False
if with_durability is True and '_with_legacy_durability' not in name:
return False
for x in typeListRe:
if re.fullmatch(x, name):
return True
return False
opTypes = []
opEnums = []
def parse_type(type):
typeStr = type.get_canonical().spelling
return parse_type_str(typeStr)
std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]
def parse_type_str(typeStr):
if typeStr == "std::mutex":
return {"name": "std::mutex"}
if typeStr == "std::string":
return {"name": "std::string"}
if typeStr == "std::chrono::duration<long long>":
return {"name": "std::chrono::seconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
return {"name": "std::chrono::milliseconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
return {"name": "std::chrono::microseconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
return {"name": "std::chrono::nanoseconds"}
if typeStr == "std::error_code":
return {"name": "std::error_code"}
if typeStr == "std::monostate":
return {"name": "std::monostate"}
if typeStr == "std::byte":
return {"name": "std::byte"}
if typeStr == "unsigned long":
return {"name": "std::size_t"}
if typeStr == "char":
return {"name": "std::int8_t"}
if typeStr == "unsigned char":
return {"name": "std::uint8_t"}
if typeStr == "short":
return {"name": "std::int16_t"}
if typeStr == "unsigned short":
return {"name": "std::uint16_t"}
if typeStr == "int":
return {"name": "std::int32_t"}
if typeStr == "unsigned int":
return {"name": "std::uint32_t"}
if typeStr == "long long":
return {"name": "std::int64_t"}
if typeStr == "unsigned long long":
return {"name": "std::uint64_t"}
if typeStr == "bool":
return {"name": "std::bool"}
if typeStr == "float":
return {"name": "std::float"}
if typeStr == "double":
return {"name": "std::double"}
if typeStr == "std::nullptr_t":
return {"name": "std::nullptr_t"}
if typeStr in std_comparators:
return {"name": typeStr}
tplParts = typeStr.split("<", 1)
if len(tplParts) > 1:
tplClassName = tplParts[0]
tplParams = tplParts[1][:-1]
if tplClassName == "std::function":
return {
"name": "std::function"
}
if tplClassName == "std::optional":
return {
"name": "std::optional",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::vector":
return {
"name": "std::vector",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::set":
return {
"name": "std::set",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::variant":
variantParts = tplParams.split(", ")
variantTypes = []
for variantPart in variantParts:
variantTypes.append(parse_type_str(variantPart))
return {
"name": "std::variant",
"of": variantTypes
}
if tplClassName == "std::array":
variantParts = tplParams.split(", ")
if len(variantParts) != 2:
print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
return {"name": "unknown", "str": typeStr}
return {
"name": "std::array",
"of": parse_type_str(variantParts[0]),
"size": int(variantParts[1])
}
if tplClassName == "std::map":
variantParts = tplParams.split(", ")
if len(variantParts) < 2 or len(variantParts) > 3:
print("FAILED TO PARSE MAP TYPES: " + typeStr)
return {"name": "unknown", "str": typeStr}
if len(variantParts) == 2:
return {
"name": "std::map",
"of": parse_type_str(variantParts[0]),
"to": parse_type_str(variantParts[1])
}
else:
return {
"name": "std::map",
"of": parse_type_str(variantParts[0]),
"to": parse_type_str(variantParts[1]),
"comparator": parse_type_str(variantParts[2])
}
if tplClassName == "std::shared_ptr":
return {
"name": "std::shared_ptr",
"of": parse_type_str(tplParams)
}
#return {"name": "unknown", "str": typeStr}
if 'unnamed struct' in typeStr:
print("WARNING: Found unnamed struct: " + typeStr)
return {"name": typeStr}
internal_structs = []
UNNAMED_STRUCT_DELIM = '::(unnamed struct'
def traverse(node, namespace, main_file):
# only scan the elements of the file we parsed
#print("FILE", node.location.file )
if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
print("REFL_TYPE(" + fullStructName + ")")
structFields = []
for child in node.get_children():
if child.kind == clang.cindex.CursorKind.FIELD_DECL:
struct_type = parse_type(child.type)
type_str = child.type.get_canonical().spelling
print(" REFL_FIELD(" + child.displayname + ")")
if 'unnamed' in type_str:
name_tokens = type_str.split('::')
name_override = '::'.join(name_tokens[:-1] + [child.displayname])
struct_type['name'] = name_override
internal_structs.append(name_override)
structFields.append({
"name": child.displayname,
"type": struct_type,
})
# replica read changes introduced duplicate get requests
if any(map(lambda op: op['name'] == fullStructName, opTypes)):
return
opTypes.append({
"name": fullStructName,
"fields": structFields,
})
print("REFL_END")
if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
if is_included_type(fullStructName, with_durability=True):
type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
if type_ref:
base_request_name = type_ref.displayname.replace('struct', '').strip()
base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
if base_request:
new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
new_fields.extend([
{"name":"persist_to", "type":{"name":"couchbase::persist_to"}},
{"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}}
])
opTypes.append({
"name": fullStructName,
"fields": new_fields
})
if node.kind == clang.cindex.CursorKind.ENUM_DECL:
fullEnumName = "::".join([*namespace, node.displayname])
if is_included_type(fullEnumName):
enumValues = []
for child in node.get_children():
if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
enumValues.append({
"name": child.displayname,
"value": child.enum_value,
})
opEnums.append({
"name": fullEnumName,
"type": parse_type(node.enum_type),
"values": enumValues,
})
if node.kind == clang.cindex.CursorKind.NAMESPACE:
namespace = [*namespace, node.displayname]
if node.kind == clang.cindex.CursorKind.CLASS_DECL:
namespace = [*namespace, node.displayname]
if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
namespace = [*namespace, node.displayname]
for child in node.get_children():
traverse(child, namespace, main_file)
for headerPath in fullFileList:
print("processing " + headerPath)
index = clang.cindex.Index.create()
args = [
'-std=c++17',
]
try:
translation_unit = index.parse(headerPath, args=args)
except Exception as e:
print(e)
import pdb
pdb.set_trace()
raise e
# output clang compiler diagnostics information (for debugging)
for diagnostic in translation_unit.diagnostics:
diagnosticMsg = diagnostic.format()
print(diagnostic)
traverse(translation_unit.cursor, [], headerPath)
jsonData = json.dumps({
'op_structs': opTypes,
'op_enums': opEnums
})
f = open("bindings.json", "w")
f.write(jsonData)
f.close()
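
binding.py parses ggml.cpp and llama.cpp with libclang, prints a REFL_TYPE / REFL_FIELD / REFL_END listing for every struct it visits, and writes the same information to bindings.json. The real macros are defined elsewhere; purely to illustrate the shape of that listing, a toy definition with made-up names could turn it into a dump function:

#include <cstdio>

// Toy stand-ins, not the real reflection macros.
#define REFL_TYPE(name)  static void refl_dump_##name() { std::printf("type  %s\n", #name);
#define REFL_FIELD(f)        std::printf("field %s\n", #f);
#define REFL_END         }

struct demo_params { int n_threads; float temperature; };   // hypothetical struct

REFL_TYPE(demo_params)
REFL_FIELD(n_threads)
REFL_FIELD(temperature)
REFL_END

int main() { refl_dump_demo_params(); return 0; }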

ggml-alloc.cpp

@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
free(galloc->parse_seq);
galloc->parse_seq = malloc(sizeof(int) * n);
galloc->parse_seq = (int*)malloc(sizeof(int) * n);
for (int i = 0; i < n; i++) {
galloc->parse_seq[i] = list[i];
@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
if (galloc->hash_values != NULL) {
free(galloc->hash_values);
}
galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size);
galloc->hash_set.size = hash_size;
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
}
// reset hash table
@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
// alloc hash_values if needed
if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values_size = hash_size;
}
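
The casts added above follow from compiling this file as C++: C lets the void * returned by malloc convert implicitly to any object pointer type, C++ does not. A minimal sketch with a simplified hash_node:

#include <cstdlib>

struct hash_node { int n_children; int n_views; };   // simplified stand-in

int main() {
    // hash_node * bad = std::malloc(4 * sizeof(hash_node));            // valid C, rejected by C++
    hash_node * ok = (hash_node *)std::malloc(4 * sizeof(hash_node));   // explicit cast, as in the diff
    int rc = (ok != nullptr) ? 0 : 1;
    std::free(ok);
    return rc;
}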

ggml-backend.cpp

@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer));
GGML_ASSERT(iface.get_base != NULL);
@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
// TODO: allow backends to support copy to/from same backend
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst);
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst);
} else {
// shouldn't be hit when copying from/to CPU
#ifndef NDEBUG
@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu {
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cgraph = *cgraph;
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size);
}
return cpu_plan;
@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = cpu_ctx->work_data;
cplan.work_data = (uint8_t*)cpu_ctx->work_data;
ggml_graph_compute(cgraph, &cplan);
}
@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = {
};
ggml_backend_t ggml_backend_cpu_init(void) {
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context));
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_i,
@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched));
memset(sched, 0, sizeof(struct ggml_backend_sched));
fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
// initialize hash tables
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
sched->hash_set.size = hash_size;
sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size);
sched_split_graph(sched, measure_graph);
sched_alloc_splits(sched);
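
Most of these casts are the same implicit-void*-conversion fix, but the node_copies one deserves a note: each table entry is an array of four tensor pointers, so malloc's result is cast to a pointer-to-array type. A minimal sketch with an opaque stand-in for ggml_tensor:

#include <cstdlib>

struct tensor;   // opaque stand-in

int main() {
    const int hash_size = 8;
    // "pointer to array of 4 tensor pointers", mirroring (ggml_tensor *(*)[4]) above
    tensor *(*node_copies)[4] =
        (tensor *(*)[4])std::malloc(sizeof(node_copies[0]) * hash_size);
    node_copies[3][1] = nullptr;   // copy slot 1 of node 3
    std::free(node_copies);
    return 0;
}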

ggml-impl.h

@ -22,7 +22,7 @@ extern "C" {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
//#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
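
The fallback #define is disabled here, presumably because the headers are now consumed from C++, where static_assert is a keyword; left in place, the noop-trick macro would silently discard every assertion. A minimal sketch:

#include <cstdint>

// No macro needed: static_assert is built into C++ (and into C11 via <assert.h>).
static_assert(sizeof(std::int16_t) == 2, "int16_t must be 2 bytes");

int main() { return 0; }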

ggml-internal.hpp (new file, 258 lines)

@ -0,0 +1,258 @@
struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
ggml_context():
mem_size(0),
mem_buffer(0),
mem_buffer_owned(0),
no_alloc(0),
no_alloc_save(0),
n_objects(0),
objects_begin(0),
objects_end(0),
scratch(),
scratch_save()
{
}
};
struct ggml_context_container {
bool used;
struct ggml_context context;
ggml_context_container(): used(0),context(){
}
};
typedef double ggml_float;
typedef void * thread_ret_t;
#define MAX_FREE_BLOCKS 256
struct free_block {
void * addr;
size_t size;
};
struct ggml_tallocr {
struct ggml_backend_buffer * buffer;
bool buffer_owned;
void * base;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
bool measure;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
struct hash_node {
int n_children;
int n_views;
};
typedef struct ggml_tallocr * ggml_tallocr_t;
typedef struct ggml_gallocr * ggml_gallocr_t;
struct ggml_gallocr {
ggml_tallocr_t talloc;
struct ggml_hash_set hash_set;
struct hash_node * hash_values;
size_t hash_values_size;
ggml_tallocr_t * hash_allocs;
int * parse_seq;
int parse_seq_len;
};
struct ggml_allocr {
ggml_tallocr_t talloc;
ggml_gallocr_t galloc;
};
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
ggml_state():contexts(), numa()
{
}
};
struct gguf_str {
uint64_t n; // GGUFv2
char * data;
};
struct ggml_map_custom1_op_params {
ggml_custom1_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun;
int n_tasks;
void * userdata;
};
struct hash_map {
struct ggml_hash_set set;
struct ggml_tensor ** vals;
};
#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include<atomic>
using namespace std;
#endif
struct ggml_compute_state_shared {
const struct ggml_cgraph * cgraph;
const struct ggml_cplan * cplan;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
const int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
};
typedef pthread_t ggml_thread_t;
struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
struct ggml_compute_state_shared * shared;
};
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct gguf_array_T {
enum gguf_type type;
uint64_t n; // GGUFv2
void * data;
} arr;
};
struct ggml_lbfgs_iteration_data {
float alpha;
float ys;
float * s;
float * y;
};
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
};
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void * data;
size_t size;
};
struct gguf_context {
struct gguf_header header;
struct gguf_kv * kv;
struct gguf_tensor_info * infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
//uint8_t * padding;
void * data;
};
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
#include "ggml-backend-impl.h"
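
ggml-internal.hpp collects structs that upstream keeps file-local inside ggml.c and ggml-alloc.c, and gives the stateful ones default constructors so C++ code gets zero-initialized members without C-style "= {0}" aggregate initialization. A minimal sketch of the pattern, with a simplified stand-in:

#include <cstddef>

struct sketch_context {                 // simplified stand-in, not the real ggml_context
    std::size_t mem_size;
    void *      mem_buffer;
    bool        no_alloc;
    sketch_context() : mem_size(0), mem_buffer(nullptr), no_alloc(false) {}
};

int main() {
    sketch_context ctx;                 // members start zeroed via the constructor
    return ctx.mem_size == 0 ? 0 : 1;
}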

File diff suppressed because it is too large.

ggml-quants.h

@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
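
restrict is a C99 keyword with no counterpart in standard C++, so these prototypes switch to the GCC/Clang spelling __restrict__ (MSVC uses __restrict). What the qualifier buys, in a minimal sketch: the compiler may assume the pointers do not alias and optimize accordingly.

static float sketch_dot(int n, const float * __restrict__ x, const float * __restrict__ y) {
    float s = 0.0f;
    for (int i = 0; i < n; ++i) {
        s += x[i] * y[i];   // no-alias assumption helps vectorization
    }
    return s;
}

int main() {
    float a[4] = {1, 2, 3, 4};
    float b[4] = {4, 3, 2, 1};
    return sketch_dot(4, a, b) == 20.0f ? 0 : 1;
}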

ggml.cpp

@ -38,6 +38,14 @@
#pragma warning(disable: 4996)
#endif
// initializers for static data called in the ggml_init function
static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
void type_traits_init();
void GGUF_TYPE_SIZE_init();
void GGUF_TYPE_NAME_init();
#if defined(_WIN32)
#include <windows.h>
@ -86,7 +94,9 @@ static int sched_yield (void) {
}
#else
#include <pthread.h>
#include <stdatomic.h>
//#include <stdatomic.h>
#include <atomic>
using namespace std;
typedef void * thread_ret_t;
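
stdatomic.h is the C11 atomics header and is not usable from C++ before C++23, so the port swaps in <atomic> and adds a using-directive so the unqualified atomic_int / atomic_bool names keep compiling. A minimal sketch:

#include <atomic>
using namespace std;   // mirrors the change above

int main() {
    atomic_int n_active(2);            // std::atomic<int>
    atomic_fetch_sub(&n_active, 1);
    return atomic_load(&n_active) == 1 ? 0 : 1;
}
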
@ -96,6 +106,8 @@ typedef void * thread_ret_t;
#endif
#include <atomic>
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
void type_traits_init(){
type_traits[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
.type_size = sizeof(int8_t),
.is_quantized = false,
},
[GGML_TYPE_I16] = {
};
type_traits[GGML_TYPE_I16] = {
.type_name = "i16",
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
},
[GGML_TYPE_I32] = {
};
type_traits[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_F32] = {
};
type_traits[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
},
[GGML_TYPE_F16] = {
};
type_traits[GGML_TYPE_F16] = {
.type_name = "f16",
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16,
},
[GGML_TYPE_Q4_0] = {
};
type_traits[GGML_TYPE_Q4_0] = {
.type_name = "q4_0",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q4_1] = {
};
type_traits[GGML_TYPE_Q4_1] = {
.type_name = "q4_1",
.blck_size = QK4_1,
.type_size = sizeof(block_q4_1),
@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
.vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[4] = { // GGML_TYPE_Q4_2
};
type_traits[4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
@ -482,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
},
[5] = { // GGML_TYPE_Q4_3
};
type_traits[5] = { // GGML_TYPE_Q4_3
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
},
[GGML_TYPE_Q5_0] = {
};
type_traits[GGML_TYPE_Q5_0] = {
.type_name = "q5_0",
.blck_size = QK5_0,
.type_size = sizeof(block_q5_0),
@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
.vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_1] = {
};
type_traits[GGML_TYPE_Q5_1] = {
.type_name = "q5_1",
.blck_size = QK5_1,
.type_size = sizeof(block_q5_1),
@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
.vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q8_0] = {
};
type_traits[GGML_TYPE_Q8_0] = {
.type_name = "q8_0",
.blck_size = QK8_0,
.type_size = sizeof(block_q8_0),
@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q8_1] = {
};
type_traits[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.blck_size = QK8_1,
.type_size = sizeof(block_q8_1),
@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q2_K] = {
};
type_traits[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
.type_size = sizeof(block_q2_K),
@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q3_K] = {
};
type_traits[GGML_TYPE_Q3_K] = {
.type_name = "q3_K",
.blck_size = QK_K,
.type_size = sizeof(block_q3_K),
@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
.vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q4_K] = {
};
type_traits[GGML_TYPE_Q4_K] = {
.type_name = "q4_K",
.blck_size = QK_K,
.type_size = sizeof(block_q4_K),
@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
.vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q5_K] = {
};
type_traits[GGML_TYPE_Q5_K] = {
.type_name = "q5_K",
.blck_size = QK_K,
.type_size = sizeof(block_q5_K),
@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
.vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q6_K] = {
};
type_traits[GGML_TYPE_Q6_K] = {
.type_name = "q6_K",
.blck_size = QK_K,
.type_size = sizeof(block_q6_K),
@ -590,15 +604,15 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
.vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q8_K] = {
};
type_traits[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K,
}
};
};
}
// For internal test use
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
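
The rewrite above exists because C99 array-index designators such as [GGML_TYPE_I8] = {...} are not valid in any C++ standard, so the table becomes a mutable array filled in by type_traits_init(), which the ggml_init hunk further down calls before anything reads it. A minimal sketch of the pattern (compile as C++20 for the member designators):

enum sketch_type { SKETCH_TYPE_I8 = 0, SKETCH_TYPE_F32 = 1, SKETCH_TYPE_COUNT = 2 };
struct sketch_traits { const char * type_name; int blck_size; };

// Valid C, rejected by C++:
//   static const sketch_traits table[SKETCH_TYPE_COUNT] = {
//       [SKETCH_TYPE_I8]  = { .type_name = "i8",  .blck_size = 1 },
//       [SKETCH_TYPE_F32] = { .type_name = "f32", .blck_size = 1 },
//   };

// C++ workaround used above: fill the table at runtime.
static sketch_traits table[SKETCH_TYPE_COUNT];

static void table_init() {
    table[SKETCH_TYPE_I8]  = { .type_name = "i8",  .blck_size = 1 };
    table[SKETCH_TYPE_F32] = { .type_name = "f32", .blck_size = 1 };
}

int main() {
    table_init();                      // mirrors the call added to ggml_init
    return table[SKETCH_TYPE_F32].blck_size == 1 ? 0 : 1;
}
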
@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
#ifdef GGML_SIMD
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
*s = sumf;
}
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) {
ggml_float sumf = 0.0;
#if defined(GGML_SIMD)
@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest
// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
}
}
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
}
// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
const float * restrict x[GGML_VEC_MAD_UNROLL];
const float * restrict v[GGML_VEC_MAD_UNROLL];
const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
x[i] = (const float *) ((const char *) xv + i*xs);
@ -2175,18 +2189,26 @@ static inline int ggml_up(int n, int m) {
////////////////////////////////////////////////////////////////////////////////
struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
static bool is_first_call = true;
// initialize the data in the arrays
type_traits_init();
GGUF_TYPE_SIZE_init();
GGUF_TYPE_NAME_init();
if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();
struct ggml_context * ctx = NULL;
static bool is_first_call = true;
// make this function thread safe
ggml_critical_section_start();
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
ggml_fp16_t ii;
@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
// find non-used context in g_state
struct ggml_context * ctx = NULL;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
// align to GGML_MEM_ALIGN
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
ctx->scratch.offs += data_size;
} else {
@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
char * const data = (char*)tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
char * const data = (char*)tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ACC;
@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_SET;
@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d(
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_2D;
@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32(
GGML_ASSERT(nb00 == sizeof(float));
if (ggml_is_contiguous(dst)) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
} else {
for (int k3 = 0; k3 < ne3; k3++) {
for (int k2 = 0; k2 < ne2; k2++) {
@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat(
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
char * wdata = (char*)params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#endif
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
return;
}
@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
return;
}
@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d(
struct ggml_tensor * dst) {
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
enum ggml_op_pool op = (ggml_op_pool)opts[0];
const int k0 = opts[1];
const int s0 = opts[2];
const int p0 = opts[3];
@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d(
}
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
enum ggml_op_pool op = (ggml_op_pool)opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
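
Another C-versus-C++ difference behind these edits: C converts int to an enum implicitly, C++ requires an explicit cast, hence the (ggml_op_pool) casts when unpacking op_params. A minimal sketch:

enum sketch_op_pool { SKETCH_OP_POOL_MAX = 0, SKETCH_OP_POOL_AVG = 1 };

int main() {
    const int opts[4] = {1, 2, 2, 0};                 // packed parameters, as above
    sketch_op_pool op = (sketch_op_pool)opts[0];      // cast required in C++
    // sketch_op_pool bad = opts[0];                  // valid C, error in C++
    const int k0 = opts[1];
    return (op == SKETCH_OP_POOL_AVG && k0 == 2) ? 0 : 1;
}
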
@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
size = ggml_hash_size(size);
struct ggml_hash_set result;
result.size = size;
result.keys = malloc(sizeof(struct ggml_tensor *) * size);
result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size);
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
return result;
}
@ -14113,9 +14136,9 @@ struct hash_map {
};
static struct hash_map * ggml_new_hash_map(size_t size) {
struct hash_map * result = malloc(sizeof(struct hash_map));
struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map));
result->set = ggml_hash_set_new(size);
result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size);
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
return result;
}
@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads);
// create thread pool
if (n_threads > 1) {
@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
continue;
}
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");
@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam(
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
float * g = opt->adam.g->data; // gradients
float * m = opt->adam.m->data; // first moment
float * v = opt->adam.v->data; // second moment
float * g = (float*)opt->adam.g->data; // gradients
float * m = (float*)opt->adam.m->data; // first moment
float * v = (float*)opt->adam.v->data; // second moment
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
float * pf = params.past > 0 ? (float *)opt->adam.pf->data : NULL; // past function values
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking(
} else {
// Armijo condition is satisfied
if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
return count;
return (ggml_opt_result)count;
}
ggml_vec_dot_f32(nx, &dg, g, d);
@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking(
} else {
if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
// regular Wolfe conditions
return count;
return (ggml_opt_result)count;
}
if(dg > -params->lbfgs.wolfe*dginit) {
width = dec;
} else {
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
return count;
return (ggml_opt_result)count;
}
}
}
@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs(
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters
float * g = opt->lbfgs.g->data; // current gradient
float * gp = opt->lbfgs.gp->data; // previous gradient
float * d = opt->lbfgs.d->data; // search direction
float * x = (float*)opt->lbfgs.x->data; // current parameters
float * xp = (float*)opt->lbfgs.xp->data; // previous parameters
float * g = (float*)opt->lbfgs.g->data; // current gradient
float * gp = (float*)opt->lbfgs.gp->data; // previous gradient
float * d = (float*)opt->lbfgs.d->data; // search direction
float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
float * pf = params.past > 0 ? (float*)opt->lbfgs.pf->data : NULL; // past function values
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_opt_get_params(np, ps, x);
// the L-BFGS memory
float * lm_alpha = opt->lbfgs.lmal->data;
float * lm_ys = opt->lbfgs.lmys->data;
float * lm_s = opt->lbfgs.lms->data;
float * lm_y = opt->lbfgs.lmy->data;
float * lm_alpha = (float*)opt->lbfgs.lmal->data;
float * lm_ys = (float*)opt->lbfgs.lmys->data;
float * lm_s = (float*)opt->lbfgs.lms->data;
float * lm_y = (float*)opt->lbfgs.lmy->data;
bool cancel = false;
@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_cpy_f32(nx, g, gp);
return ls;
return (ggml_opt_result)ls;
}
opt->loss_after = fx;
@ -17564,7 +17587,7 @@ GGML_API void ggml_opt_init(
opt->nx = nx;
opt->just_initialized = true;
if (opt->ctx == NULL) {
struct ggml_init_params ctx_opt_params;
struct ggml_init_params ctx_opt_params;
if (opt->params.type == GGML_OPT_ADAM) {
ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
if (opt->params.past > 0) {
@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK4_0;
for (int b = 0; b < n; b += k) {
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0;
quantize_row_q4_0_reference(src + b, y, k);
@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK4_1;
for (int b = 0; b < n; b += k) {
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1;
quantize_row_q4_1_reference(src + b, y, k);
@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK5_0;
for (int b = 0; b < n; b += k) {
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0;
quantize_row_q5_0_reference(src + b, y, k);
@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK5_1;
for (int b = 0; b < n; b += k) {
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1;
quantize_row_q5_1_reference(src + b, y, k);
@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK8_0;
for (int b = 0; b < n; b += k) {
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0;
quantize_row_q8_0_reference(src + b, y, k);
@ -17928,37 +17951,39 @@ struct gguf_str {
char * data;
};
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
[GGUF_TYPE_INT8] = sizeof(int8_t),
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
[GGUF_TYPE_INT16] = sizeof(int16_t),
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
[GGUF_TYPE_INT32] = sizeof(int32_t),
[GGUF_TYPE_FLOAT32] = sizeof(float),
[GGUF_TYPE_BOOL] = sizeof(bool),
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
[GGUF_TYPE_INT64] = sizeof(int64_t),
[GGUF_TYPE_FLOAT64] = sizeof(double),
[GGUF_TYPE_ARRAY] = 0, // undefined
void GGUF_TYPE_SIZE_init() {
GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT8] = sizeof(int8_t);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT16] = sizeof(int16_t);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT32] = sizeof(uint32_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT32] = sizeof(int32_t);
GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float);
GGUF_TYPE_SIZE[GGUF_TYPE_BOOL] = sizeof(bool);
GGUF_TYPE_SIZE[GGUF_TYPE_STRING] = sizeof(struct gguf_str);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT64] = sizeof(uint64_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT64] = sizeof(int64_t);
GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double);
GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY] = 0; // undefined
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
[GGUF_TYPE_UINT64] = "u64",
[GGUF_TYPE_INT64] = "i64",
[GGUF_TYPE_FLOAT64] = "f64",
void GGUF_TYPE_NAME_init(){
GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8";
GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8";
GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16";
GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16";
GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32";
GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32";
GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32";
GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool";
GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str";
GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr";
GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64";
GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64";
GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64";
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
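The two hunks above exist because C99 designated array initializers ([GGUF_TYPE_UINT8] = ...) are not valid C++, so the commit switches the tables to runtime init functions. A hedged alternative sketch, not the commit's approach: keep the size table const by building it in a constexpr factory. It assumes the GGUF_TYPE_* enumerators and gguf_str from the surrounding code; GGUF_TYPE_SIZE_CXX and make_gguf_type_size are hypothetical names.

// Alternative sketch only: a constexpr factory avoids both designated initializers
// and a runtime *_init() call, keeping the table immutable.
#include <array>
#include <cstddef>
#include <cstdint>

static constexpr std::array<size_t, GGUF_TYPE_COUNT> make_gguf_type_size() {
    std::array<size_t, GGUF_TYPE_COUNT> s{};        // zero-filled; GGUF_TYPE_ARRAY stays 0 (undefined)
    s[GGUF_TYPE_UINT8]   = sizeof(uint8_t);
    s[GGUF_TYPE_INT8]    = sizeof(int8_t);
    s[GGUF_TYPE_UINT16]  = sizeof(uint16_t);
    s[GGUF_TYPE_INT16]   = sizeof(int16_t);
    s[GGUF_TYPE_UINT32]  = sizeof(uint32_t);
    s[GGUF_TYPE_INT32]   = sizeof(int32_t);
    s[GGUF_TYPE_FLOAT32] = sizeof(float);
    s[GGUF_TYPE_BOOL]    = sizeof(bool);
    s[GGUF_TYPE_STRING]  = sizeof(gguf_str);
    s[GGUF_TYPE_UINT64]  = sizeof(uint64_t);
    s[GGUF_TYPE_INT64]   = sizeof(int64_t);
    s[GGUF_TYPE_FLOAT64] = sizeof(double);
    return s;
}
static constexpr auto GGUF_TYPE_SIZE_CXX = make_gguf_type_size();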
@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
bool ok = true;
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1);
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
return ok;
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
bool ok = true;
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
// read the header
{
@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the kv pairs
{
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the tensor infos
{
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// create the tensors
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0],
ctx->infos[i].ne[1],
ctx->infos[i].ne[2],
ctx->infos[i].ne[3],
(int64_t)ctx->infos[i].ne[0],
(int64_t)ctx->infos[i].ne[1],
(int64_t)ctx->infos[i].ne[2],
(int64_t)ctx->infos[i].ne[3],
};
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key);
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
@ -18760,7 +18785,7 @@ void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name);
ctx->infos[idx].name.data = strdup(tensor->name);
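Most of the remaining ggml.cpp hunks in this section only add explicit casts: C implicitly converts the void * returned by malloc/calloc/realloc (and stored in tensor data fields), C++ does not. A minimal sketch of the pattern with hypothetical names (my_kv and grow_kv are illustrative, not symbols from the commit):

// Illustrative only: why the casts above are needed when the same code is built as C++.
#include <cstdlib>

struct my_kv { int dummy; };

static my_kv * grow_kv(my_kv * kv, std::size_t n_new) {
    // C:   kv = realloc(kv, n_new * sizeof(struct my_kv));   // implicit void* -> my_kv*
    // C++: the conversion must be written out explicitly
    return (my_kv *) realloc(kv, n_new * sizeof(my_kv));
}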

6
ggml.h

@ -285,8 +285,10 @@
GGML_UNUSED(prefix##3);
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
@ -2136,7 +2138,7 @@ extern "C" {
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#define GGML_RESTRICT __restrict__
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@ -2157,5 +2159,7 @@ extern "C" {
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif
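The ggml.h hunk above does two things: the CPP_ONLY guard compiles out the extern "C" wrapper when every translation unit is built as C++ (the -DCPP_ONLY Makefile build), and the GGML_RESTRICT definition gains the GCC/Clang __restrict__ spelling. A minimal sketch of the resulting guard pattern, with foo() as a placeholder declaration:

// Sketch only: how a declaration sits inside the new guards. With -DCPP_ONLY the
// extern "C" block disappears and foo() keeps C++ linkage; without it, C callers
// still see an unmangled symbol. GGML_RESTRICT expands per the definitions above.
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif

void foo(const float * GGML_RESTRICT src, float * GGML_RESTRICT dst, int n);

#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif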

896
llama-internal.hpp Normal file

@ -0,0 +1,896 @@
#include <set>
#include <queue>
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GPT2,
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_UNKNOWN,
};
enum llm_kv {
LLM_KV_GENERAL_ARCHITECTURE,
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_URL,
LLM_KV_GENERAL_DESCRIPTION,
LLM_KV_GENERAL_LICENSE,
LLM_KV_GENERAL_SOURCE_URL,
LLM_KV_GENERAL_SOURCE_HF_REPO,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_TOKENIZER_MODEL,
LLM_KV_TOKENIZER_LIST,
LLM_KV_TOKENIZER_TOKEN_TYPE,
LLM_KV_TOKENIZER_SCORES,
LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID,
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
LLM_KV_TOKENIZER_ADD_BOS,
LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
};
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
};
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
GGUF_FILE_VERSION_V3 = 3,
};
struct LLM_KV {
LLM_KV(llm_arch arch) : arch(arch) {}
llm_arch arch;
std::string operator()(llm_kv kv) const; // moved to llama.cpp file
};
enum llm_tensor {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_POS_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
uint32_t n_yarn_orig_ctx;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
bool mul_mat_q;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_norm_b;
struct ggml_tensor * attn_norm_2;
struct ggml_tensor * attn_norm_2_b;
struct ggml_tensor * attn_q_norm;
struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
};
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
};
struct llama_buffer {
void * data = NULL;
size_t size = 0;
// fallback to malloc / free
// useful in cases where CUDA can try to allocate PINNED memory
bool fallback = false;
void resize(size_t n) ;
~llama_buffer();
};
// ring-buffer of cached KV data
struct llama_kv_cache {
bool has_shift = false;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
// computed before each graph build
uint32_t n = 0;
std::vector<llama_kv_cell> cells;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
llama_buffer buf;
~llama_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
}
#endif
}
};
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
GGML_ASSERT(token_left.find(" ") == std::string::npos);
GGML_ASSERT(token_left.find("\n") == std::string::npos);
GGML_ASSERT(token_right.find(" ") == std::string::npos);
GGML_ASSERT(token_right.find("\n") == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
return -1;
}
return it->second;
}
};
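find_bpe_rank above returns the merge priority of an adjacent token pair (-1 if the pair is not in bpe_ranks). A hedged sketch of how a BPE merge loop would typically consult it; pick_best_pair is an illustrative helper, not code from the commit, and pieces are assumed to contain no spaces or newlines (the asserts above require that):

// Illustrative: choose the adjacent pair with the lowest merge rank, the core
// selection step of a BPE merge loop. Lower rank merges earlier; -1 means "no merge".
#include <climits>
#include <string>
#include <vector>

static int pick_best_pair(const llama_vocab & vocab, const std::vector<std::string> & pieces) {
    int best = -1, best_rank = INT_MAX;
    for (size_t i = 0; i + 1 < pieces.size(); ++i) {
        const int rank = vocab.find_bpe_rank(pieces[i], pieces[i + 1]);
        if (rank >= 0 && rank < best_rank) {
            best_rank = rank;
            best = (int) i;     // index of the left piece of the best pair
        }
    }
    return best;                // -1 if nothing can be merged
}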
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
~llama_mmap();
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
#else
static constexpr bool SUPPORTED = false;
#endif
};
struct llama_hparams {
bool vocab_only;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;
float f_norm_eps;
float f_norm_rms_eps;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
int8_t rope_scaling_type_train : 3;
bool rope_finetuned : 1;
float f_clamp_kqv;
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const;
uint32_t n_gqa() const {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
}
};
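The accessors at the end of llama_hparams encode the grouped-query-attention bookkeeping. A small worked example with illustrative numbers (chosen to resemble a 70B-class configuration, not read from any model file):

// Illustrative arithmetic only:
//   n_gqa()       = n_head / n_head_kv = 64 / 8    = 8    query heads per KV head
//   n_embd_head() = n_embd / n_head    = 8192 / 64 = 128  dims per head
//   n_embd_gqa()  = n_embd / n_gqa()   = 8192 / 8  = 1024 per-token K/V width
llama_hparams hp = {};
hp.n_embd    = 8192;
hp.n_head    = 64;
hp.n_head_kv = 8;
GGML_ASSERT(hp.n_gqa() == 8 && hp.n_embd_head() == 128 && hp.n_embd_gqa() == 1024);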
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() ;
llama_mlock(const llama_mlock &) = delete;
~llama_mlock();
void init(void * ptr);
void grow_to(size_t target_size);
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const ;
#undef MLOCK_SUGGESTION
static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
bool raw_lock(void * ptr, size_t len) const ;
static void raw_unlock(void * ptr, size_t len);
#else
static constexpr bool SUPPORTED = false;
static size_t lock_granularity();
bool raw_lock(const void * addr, size_t len) const;
static void raw_unlock(const void * addr, size_t len);
#endif
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embd;
struct ggml_tensor * pos_embd;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
std::vector<llama_layer> layers;
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// context
struct ggml_context * ctx = NULL;
// the model memory buffer
llama_buffer buf;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
// objects representing data potentially being locked in memory
llama_mlock mlock_buf;
llama_mlock mlock_mmap;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
~llama_model() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
ggml_cuda_free_scratch();
}
#endif
#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
#endif
}
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
~llama_context();
llama_cparams cparams;
const llama_model & model;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
std::mt19937 rng;
bool has_evaluated_once = false;
int64_t t_start_us;
int64_t t_load_us;
int64_t t_sample_us = 0;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int32_t n_sample = 0; // number of tokens sampled
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// reusable buffer for `struct ggml_graph_plan.work_data`
std::vector<uint8_t> work_buffer;
// memory buffers used to evaluate the model
llama_buffer buf_compute;
llama_buffer buf_alloc;
ggml_allocr * alloc = NULL;
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
#endif
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
};
struct LLM_TN {
LLM_TN(llm_arch arch) ;
llm_arch arch;
std::string operator()(llm_tensor tensor) const;
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
std::string operator()(llm_tensor tensor, int bid) const ;
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
};
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) ;
size_t tell() const;
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const ;
void write_u32(std::uint32_t val) const;
~llama_file();
};
struct llama_state {
llama_state();
// We save the log callback globally
ggml_log_callback log_callback;
void * log_callback_user_data = nullptr;
};
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
int n_created = 0;
int64_t n_elements = 0;
size_t n_bytes = 0;
bool use_mmap = false;
llama_file file;
llama_ftype ftype;
llama_fver fver;
std::unique_ptr<llama_mmap> mapping;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;
llama_model_loader(const std::string & fname, bool use_mmap) ;
~llama_model_loader();
std::string get_arch_name() const;
enum llm_arch get_arch() const ;
const char * get_tensor_name(int i) const;
struct ggml_tensor * get_tensor_meta(int i) const;
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ;
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) ;
void done_getting_tensors() const;
size_t file_offset(const char * name) const;
void load_data_for(struct ggml_tensor * cur) const ;
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ;
};
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t * ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) ;
void write(const void * src, size_t size) override ;
size_t get_size_written() override ;
};
struct llama_data_file_context : llama_data_context {
llama_file * file;
size_t size_written = 0;
llama_data_file_context(llama_file * f);
size_t get_size_written() override ;
void write(const void * src, size_t size);
};
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const ;
void shift_tokens(const size_t n) ;
llama_beam_view view() const;
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx);
llama_token_data get_token_data(const llama_token token_id) const ;
std::vector<llama_token_data> top_k(size_t k) ;
float probability_from_logit(float logit) const ;
};
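llama_logit_info caches the maximum logit and a normalizer, which is the usual numerically stable softmax setup. A hedged sketch of the math its members suggest (the real definitions live in llama.cpp; probability_from_logit_sketch is an illustrative free function):

// Sketch of the stable-softmax relation implied by max_l / normalizer:
//   normalizer = 1 / sum_i exp(logit_i - max_l)
//   p(token)   = normalizer * exp(logit_token - max_l)
#include <algorithm>
#include <cmath>

static float probability_from_logit_sketch(const float * logits, int n_vocab, float logit) {
    const float max_l = *std::max_element(logits, logits + n_vocab);
    float sum = 0.0f;
    for (int i = 0; i < n_vocab; ++i) {
        sum += std::exp(logits[i] - max_l);     // shift by max_l to avoid overflow
    }
    return std::exp(logit - max_l) / sum;
}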
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
size_t common_prefix_length;
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);
void collapse_beams(const size_t beam_idx) ;
void fill_next_beams_by_top_probabilities(llama_beam & beam) ;
size_t find_common_prefix_length() ;
llama_beams_state get_beams_state(const bool last_call) ;
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) ;
size_t top_beam_index();
void update_beams_from_beam_views();
};
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
enum llm_rope_type {
LLM_ROPE,
LLM_ROPE_NEOX,
LLM_ROPE_GLM,
};
enum llm_ffn_op_type {
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
};
enum llm_ffn_gate_type {
LLM_FFN_SEQ,
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
enum llm_norm_type {
LLM_NORM,
LLM_NORM_RMS,
};
struct llm_build_context {
const llama_model & model;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
const llama_kv_cache & kv_self;
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head;
const int64_t n_embd_gqa;
const float freq_base;
const float freq_scale;
const float ext_factor;
const float attn_factor;
const float beta_fast;
const float beta_slow;
const float norm_eps;
const float norm_rms_eps;
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_orig_ctx;
const bool do_rope_shift;
const llm_build_cb & cb;
llama_buffer & buf_compute;
struct ggml_context * ctx0 = nullptr;
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_batch & batch,
const llm_build_cb & cb,
bool worst_case);
void init() ;
void free() ;
struct ggml_cgraph * build_llama() ;
struct ggml_cgraph * build_baichuan() ;
struct ggml_cgraph * build_falcon() ;
struct ggml_cgraph * build_starcoder() ;
struct ggml_cgraph * build_persimmon() ;
struct ggml_cgraph * build_refact() ;
struct ggml_cgraph * build_bloom() ;
struct ggml_cgraph * build_mpt() ;
struct ggml_cgraph * build_stablelm();
};
enum llm_offload_func_e {
OFFLOAD_FUNC_NOP,
OFFLOAD_FUNC,
OFFLOAD_FUNC_KQ,
OFFLOAD_FUNC_V,
OFFLOAD_FUNC_NR,
OFFLOAD_FUNC_EMB,
OFFLOAD_FUNC_OUT,
};
struct llm_offload_trie {
struct node {
~node() ;
node * children[256] = { nullptr };
llm_offload_func_e func = OFFLOAD_FUNC_NOP;
};
node * root = nullptr;
llm_offload_trie();
llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) ;
~llm_offload_trie();
void add(const char * name, llm_offload_func_e func);
llm_offload_func_e find(const char * name) const;
};
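llm_offload_trie maps tensor names to offload decisions, one byte per edge and 256 children per node. A hedged sketch of what a lookup plausibly looks like (the actual add/find definitions are in llama.cpp; trie_find_sketch is an illustrative name):

// Illustrative byte-trie walk: each character of the name selects one child;
// the function stored at the final node is the offload decision, or NOP on a miss.
static llm_offload_func_e trie_find_sketch(const llm_offload_trie::node * root, const char * name) {
    const llm_offload_trie::node * cur = root;
    for (const unsigned char * p = (const unsigned char *) name; cur && *p; ++p) {
        cur = cur->children[*p];
    }
    return cur ? cur->func : OFFLOAD_FUNC_NOP;
}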
struct llm_symbol {
using index = int;
index prev;
index next;
const char * text;
size_t n;
};
struct llm_bigram_spm {
struct comparator {
bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
};
using queue_storage = std::vector<llm_bigram_spm>;
using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
float score;
size_t size;
};
struct llm_tokenizer_spm {
llm_tokenizer_spm(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) ;
void try_add_bigram(int left, int right) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
llm_bigram_spm::queue work_queue;
std::map<std::string, std::pair<int, int>> rev_merge;
};
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
struct llm_bigram_bpe {
struct comparator {
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ;
};
using queue_storage = std::vector<llm_bigram_bpe>;
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
std::string text;
int rank;
size_t size;
};
struct llm_tokenizer_bpe {
llm_tokenizer_bpe(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void add_new_bigram(int left, int right) ;
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final;
llm_bigram_bpe::queue work_queue;
};
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant{
fragment_buffer_variant(llama_vocab::id _token);
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);
const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
struct quantize_state_internal {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
int n_k_quantized = 0;
int n_fallback = 0;
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};

llama.h

@ -50,7 +50,9 @@
#endif
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif
//
@ -827,8 +829,10 @@ extern "C" {
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
@ -844,4 +848,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H

756
print.hpp Normal file

@ -0,0 +1,756 @@
#include <iostream>
#include "llama.h"
#include "ggml-internal.hpp"
#include "llama-internal.hpp"
REFL_TYPE(ggml_init_params )
REFL_END
REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END
REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END
REFL_TYPE(ggml_opt_context::ggml_grad )
REFL_END
REFL_TYPE(gpt_params )
REFL_FIELD( seed )
REFL_FIELD( n_threads)
REFL_FIELD( n_threads_batch)
REFL_FIELD( n_predict )
REFL_FIELD( n_ctx )
REFL_FIELD( n_batch)
REFL_FIELD( n_keep )
REFL_FIELD( n_draft)
REFL_FIELD( n_chunks )
REFL_FIELD( n_parallel)
REFL_FIELD( n_sequences)
REFL_FIELD( p_accept )
REFL_FIELD( p_split )
REFL_FIELD( n_gpu_layers)
REFL_FIELD( n_gpu_layers_draft)
REFL_FIELD( main_gpu )
REFL_FIELD( tensor_split)
REFL_FIELD( n_beams )
REFL_FIELD(rope_freq_base)
REFL_FIELD( rope_freq_scale )
REFL_FIELD( yarn_ext_factor )
REFL_FIELD( yarn_attn_factor )
REFL_FIELD( yarn_beta_fast )
REFL_FIELD( yarn_beta_slow )
REFL_FIELD( yarn_orig_ctx)
REFL_FIELD( rope_scaling_type)
REFL_FIELD( sparams)
REFL_FIELD(model )
REFL_FIELD(model_draft )
REFL_FIELD(model_alias)
REFL_FIELD(prompt )
REFL_FIELD(prompt_file )
REFL_FIELD(path_prompt_cache )
REFL_FIELD(input_prefix )
REFL_FIELD(input_suffix )
REFL_FIELD( antiprompt)
REFL_FIELD(logdir )
REFL_FIELD( lora_adapter)
REFL_FIELD(lora_base )
REFL_FIELD( ppl_stride )
REFL_FIELD( ppl_output_type )
REFL_FIELD( hellaswag )
REFL_FIELD( hellaswag_tasks )
REFL_FIELD( mul_mat_q )
REFL_FIELD( memory_f16)
REFL_FIELD( random_prompt )
REFL_FIELD( use_color )
REFL_FIELD( interactive )
REFL_FIELD( chatml )
REFL_FIELD( prompt_cache_all )
REFL_FIELD( prompt_cache_ro )
REFL_FIELD( embedding )
REFL_FIELD( escape )
REFL_FIELD( interactive_first )
REFL_FIELD( multiline_input )
REFL_FIELD( simple_io )
REFL_FIELD( cont_batching )
REFL_FIELD( input_prefix_bos )
REFL_FIELD( ignore_eos )
REFL_FIELD( instruct )
REFL_FIELD( logits_all )
REFL_FIELD( use_mmap)
REFL_FIELD( use_mlock )
REFL_FIELD( numa )
REFL_FIELD( verbose_prompt )
REFL_FIELD( infill )
REFL_FIELD(mmproj )
REFL_FIELD( image)
REFL_END
REFL_TYPE(llama_sampling_params)
REFL_END
REFL_TYPE(llm_arch)
REFL_END
REFL_TYPE(llama_sampling_context )
REFL_FIELD( params)
REFL_FIELD( mirostat_mu)
REFL_FIELD( grammar)
REFL_FIELD( parsed_grammar)
REFL_FIELD( prev)
REFL_FIELD( cur)
REFL_END
REFL_TYPE(llama_token_data )
REFL_END
REFL_TYPE(llama_token_data_array )
REFL_END
REFL_TYPE(llama_batch )
REFL_END
REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END
REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END
REFL_TYPE(gguf_init_params)
REFL_END
REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END
REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD( no_alloc)
REFL_FIELD( no_alloc_save)
REFL_FIELD( n_objects)
REFL_FIELD( objects_begin)
REFL_FIELD( objects_end)
REFL_FIELD( scratch)
REFL_FIELD( scratch_save)
REFL_END
REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END
REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END
REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END
REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END
REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END
REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END
REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END
REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END
REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END
REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END
REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END
REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END
REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END
REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END
REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END
REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END
REFL_TYPE(llama_grammar_element)
REFL_END
REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END
REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END
REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END
REFL_TYPE(ggml_backend)
REFL_END
REFL_TYPE(ggml_backend_buffer)
REFL_END
REFL_TYPE(ggml_allocr)
REFL_END
REFL_TYPE(ggml_tallocr)
REFL_END
REFL_TYPE(ggml_gallocr)
REFL_END
REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END
REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END
REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END
REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END
REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END
REFL_TYPE(e_model)
REFL_END
REFL_TYPE(llama_ftype)
REFL_END
REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype )
REFL_FIELD( name )
REFL_FIELD( hparams )
REFL_FIELD( vocab)
REFL_FIELD( tok_embd)
REFL_FIELD( pos_embd)
REFL_FIELD( tok_norm)
REFL_FIELD( tok_norm_b)
REFL_FIELD( output_norm)
REFL_FIELD( output_norm_b)
REFL_FIELD( output)
REFL_FIELD( layers)
REFL_FIELD( n_gpu_layers)
REFL_FIELD( gguf_kv) //unordered map
REFL_FIELD( ctx)
REFL_FIELD( buf)
REFL_FIELD( mapping) //std::unique_ptr
REFL_FIELD( mlock_buf)
REFL_FIELD( mlock_mmap)
REFL_FIELD( tensors_by_name)
REFL_FIELD( t_load_us)
REFL_FIELD( t_start_us)
REFL_END
REFL_TYPE(llama_vocab)
REFL_END
REFL_TYPE(grammar_parser::parse_state)
REFL_END
REFL_TYPE(llama_context)
REFL_FIELD( cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) //random numbers
REFL_FIELD(has_evaluated_once )
REFL_FIELD( t_start_us)
REFL_FIELD( t_load_us)
REFL_FIELD( t_sample_us )
REFL_FIELD( t_p_eval_us )
REFL_FIELD( t_eval_us)
REFL_FIELD( n_sample )
REFL_FIELD( n_p_eval )
REFL_FIELD( n_eval )
REFL_FIELD( logits)
REFL_FIELD( logits_all )
REFL_FIELD( embedding)
REFL_FIELD( work_buffer)
REFL_FIELD( buf_compute)
REFL_FIELD( buf_alloc)
REFL_FIELD( alloc )
#ifdef GGML_USE_METAL
REFL_FIELD( ctx_metal )
#endif
#ifdef GGML_USE_MPI
REFL_FIELD( ctx_mpi )
#endif
REFL_END
REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END
REFL_TYPE(llm_build_context)
// REFL_FIELD(model) cannot create pointer to reference member llm_build_context::model
// REFL_FIELD(hparams) cannot create pointer to reference member llm_build_context::hparams
REFL_END
REFL_TYPE(llm_offload_trie)
REFL_END
REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END
REFL_TYPE(llm_bigram_spm)
REFL_END
REFL_TYPE(llm_tokenizer_spm)
REFL_END
REFL_TYPE(llm_bigram_bpe)
REFL_END
REFL_TYPE(llm_tokenizer_bpe)
REFL_END
REFL_TYPE(fragment_buffer_variant)
REFL_END
REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END
REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END
REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END
REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END
REFL_TYPE(llama_logit_info)
REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END
REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD( n_attention_wv )
REFL_FIELD( n_feed_forward_w2 )
REFL_FIELD( i_attention_wv )
REFL_FIELD( i_feed_forward_w2 )
REFL_FIELD( n_k_quantized )
REFL_FIELD( n_fallback )
REFL_END
REFL_TYPE(llama_data_context)
REFL_END
REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END
REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END
template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
return t.value_type;
}
namespace runtime2
{
using namespace refl;
using namespace refl::descriptor;
template <typename CharT, typename T>
void debug(std::basic_ostream<CharT>& os, const T& value, bool compact = false);
namespace detail
{
template <typename CharT, typename T, typename = decltype(std::declval<std::basic_ostream<CharT>&>() << std::declval<T>())>
std::true_type is_ostream_printable_test(int);
template <typename CharT, typename T>
std::false_type is_ostream_printable_test(...);
template <typename CharT, typename T>
constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test<CharT, T>(0))::value };
namespace
{
[[maybe_unused]] int next_depth(int depth)
{
return depth == -1 || depth > 8
? -1
: depth + 1;
}
}
template <typename CharT>
void indent(std::basic_ostream<CharT>& os, int depth)
{
for (int i = 0; i < depth; i++) {
os << " ";
}
}
template <typename CharT, typename T>
void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth);
template <typename CharT, typename T>
void debug_detailed(std::basic_ostream<CharT>& os, const T& value, int depth)
{
using type_descriptor = type_descriptor<T>;
bool compact = depth == -1;
// print type with members enclosed in braces
os << type_descriptor::name << " { ";
if (!compact) os << '\n';
constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); });
for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) {
int new_depth = next_depth(depth);
indent(os, new_depth);
os << get_display_name(member) << " = ";
if constexpr (util::contains_instance<attr::debug>(member.attributes)) {
// use the debug attribute to print
auto debug_attr = util::get_instance<attr::debug>(member.attributes);
debug_attr.write(os, value);
}
else {
debug_impl(os, member(value), new_depth);
}
if (!compact || index + 1 != readable_members.size) {
os << ", ";
}
if (!compact) {
indent(os, depth);
os << '\n';
}
});
if (compact) os << ' ';
indent(os, depth);
os << '}';
}
template <typename CharT, typename T>
void debug_reflectable(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
{
using type_descriptor = type_descriptor<T>;
if constexpr (trait::contains_instance_v<attr::debug, typename type_descriptor::attribute_types>) {
// use the debug attribute to print
auto debug_attr = util::get_instance<attr::debug>(type_descriptor::attributes);
debug_attr.write(os, value);
}
else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
// type supports printing natively, just use that
os << value;
}
else {
debug_detailed(os, value, depth);
}
}
template <typename CharT, typename T>
void debug_container(std::basic_ostream<CharT>& os, const T& value, int depth)
{
bool compact = depth == -1;
os << "[";
auto end = value.end();
for (auto it = value.begin(); it != end; ++it)
{
if (!compact) os << '\n';
int new_depth = next_depth(depth);
indent(os, new_depth);
debug_impl(os, *it, new_depth);
if (std::next(it, 1) != end) {
os << ", ";
}
else if (!compact) {
os << '\n';
}
}
indent(os, depth);
os << "]";
}
template <typename CharT, typename T>
void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
{
using no_pointer_t = std::remove_pointer_t<T>;
if constexpr (std::is_same_v<bool, T>) {
os << (value ? "true" : "false");
}
else if constexpr (std::is_pointer_v<T> && !std::is_void_v<no_pointer_t> && trait::is_reflectable_v<no_pointer_t>) {
if (value == nullptr) {
os << "nullptr";
}
else {
os << '&';
debug_impl(os, *value, -1);
}
}
else if constexpr (trait::is_reflectable_v<T>) {
debug_reflectable(os, value, depth);
}
else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
os << value;
}
else if constexpr (trait::is_container_v<T>) {
debug_container(os, value, depth);
}
else {
os << "(not printable)";
}
}
}
/**
* Writes the debug representation of value to the given std::ostream.
* Calls the function specified by the debug<F> attribute whenever possible,
* before falling back to recursively iterating the members and printing them.
* Takes an optional argument specifying whether to print a compact representation.
* The compact representation contains no newlines.
*/
template <typename CharT, typename T>
void debug(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] bool compact)
{
static_assert(trait::is_reflectable_v<T> || trait::is_container_v<T> || detail::is_ostream_printable_v<CharT, T>,
"Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!");
detail::debug_impl(os, value, compact ? -1 : 0);
}
/**
* Writes the compact debug representation of the provided values to the given std::ostream.
*/
template <typename CharT, typename... Ts>
void debug_all(std::basic_ostream<CharT>& os, const Ts&... values)
{
refl::runtime::debug(os, std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
}
/**
* Writes the debug representation of the provided value to an std::string and returns it.
* Takes an optional argument specifying whether to print a compact representation.
* The compact representation contains no newlines.
*/
template <typename CharT = char, typename T>
std::basic_string<CharT> debug_str(const T& value, bool compact = false)
{
std::basic_stringstream<CharT> ss;
debug(ss, value, compact);
return ss.str();
}
/**
* Writes the compact debug representation of the provided values to an std::string and returns it.
*/
template <typename CharT = char, typename... Ts>
std::basic_string<CharT> debug_all_str(const Ts&... values)
{
return refl::runtime::debug_str(std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
}
}
// // A generic function to print out the fields of any object
template<typename T>
void print_fields(const T& t) {
runtime2::debug(std::cout, t);
constexpr auto type = refl::reflect<T>();
constexpr auto membertype = refl::member_list<T>();
constexpr auto members = get_members(type);
std::cout << "DEBUG Type: " << type.name.c_str() << "\n";
std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n";
std::cout << "DEBUG Type3: " << typeid(members).name() << "\n";
refl::util::for_each(members, [&](auto member) {
//using member_t = decltype(member::value_type);
//typename type3 = member::value_type;
//typename trait::remove_qualifiers_t<member_t>::value_type>;
//constexpr auto type2 = refl::reflect(type3);
//std::cout << "Auto:" << foo <<"\n";
std::cout << "Auto:" << member.name <<"\n";
//std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n";
//std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n";
});
std::cout << "\n";
}
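print.hpp is consumed by registering a type with REFL_TYPE/REFL_FIELD and calling print_fields on an instance. A hedged usage sketch with a hypothetical struct (assumes print.hpp, and the refl-cpp header that provides the registration macros, are already included):

// Illustrative only: demo_params is not part of the commit.
struct demo_params {
    int   n_ctx = 512;
    float temp  = 0.8f;
};
REFL_TYPE(demo_params)
REFL_FIELD(n_ctx)
REFL_FIELD(temp)
REFL_END

int main() {
    demo_params p;
    print_fields(p);   // dumps the type name and each registered field to std::cout
    return 0;
}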

tests/CMakeLists.txt

@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
llama_build_and_test_executable(test-rope.cpp)
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
add_executable(${TEST_TARGET} test-c.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE llama)