Naming the unnamed ggml structures

Here we add names for the previously anonymous nested structures of ggml.

We add print statements to main, inserting the print probes at key points.

We add the include for refl-cpp.

The build is working.

The server now has the probes as well. (A sketch of the probe mechanism follows the change summary below.)
mike dupont 2023-11-24 19:09:19 -05:00
parent 04814e718e
commit 77f4b996ed
20 changed files with 2969 additions and 530 deletions
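To make the new probes concrete, here is a minimal sketch of how a print_fields helper like the one in print.hpp can be built on refl-cpp once a struct has been registered with REFL_TYPE/REFL_FIELD. The server_info struct and its fields are hypothetical examples, not taken from this commit, and the actual print.hpp may differ:

// Sketch only: a refl-cpp based field printer. Assumes refl.hpp is on the
// include path; server_info is a made-up example type.
#include <cstdint>
#include <iostream>
#include <string>
#include "refl.hpp"

struct server_info {
    std::string  hostname = "127.0.0.1";
    std::int32_t port     = 8080;
};

REFL_TYPE(server_info)
    REFL_FIELD(hostname)
    REFL_FIELD(port)
REFL_END

template <typename T>
void print_fields(const T & obj) {
    // walk the compile-time member list produced by the REFL_* macros
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        std::cout << member.name.c_str() << " = " << member(obj) << "\n";
    });
}

int main() {
    server_info info;
    print_fields(info);   // prints: hostname = 127.0.0.1, port = 8080
}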

3
.gitignore

@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
/#llama.cpp#
#*
\\#*

CMakeLists.txt

@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
# Compile flags
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@ -230,7 +230,12 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
# from https://github.com/NVIDIA/cutlass
make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
# add_compile_definitions(GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
@ -312,7 +317,7 @@ if (LLAMA_MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC)
@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "")
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif()
#
set(cuda_flags --verbose -G ${cuda_flags})
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
if (WIN32)
@ -485,6 +493,8 @@ if (NOT MSVC)
add_link_options(-static-libgcc -static-libstdc++)
endif()
endif()
add_link_options("-Wl,-Map=${TARGET}.map")
if (LLAMA_GPROF)
add_compile_options(-pg)
endif()
@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM)
endif()
add_library(ggml OBJECT
ggml.c
ggml.cpp
ggml.h
ggml-alloc.c
print.hpp
ggml-internal.hpp
llama-internal.hpp
ggml-alloc.cpp
ggml-alloc.h
ggml-backend.c
ggml-backend.cpp
ggml-backend.h
ggml-quants.c
ggml-quants.cpp
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@ -683,7 +696,7 @@ add_library(llama
)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_compile_features(llama PUBLIC cxx_std_20) # don't bump
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}

Makefile

@ -116,7 +116,7 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
@ -537,17 +537,17 @@ $(info )
# Build library
#
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
ggml.o: ggml.cpp ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
$(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-c.o: tests/test-c.cpp llama.h
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@

337
binding.py Normal file

@ -0,0 +1,337 @@
import os
import json
import re
import clang.cindex
# configurable part
CLANG_VERSION='13.0.1'
# homebrew installs for llvm (brew info llvm gives details):
# x64: /usr/local/opt/llvm/lib
# arm64: /opt/homebrew/opt/llvm/lib
llvmLibPath = "/usr/lib/llvm-15/lib/"
cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"
fileList = [
# "ggml.cpp",
# "llama.cpp",
"examples/server/server.cpp",
]
typeList = [
]
# end of configurable part
clang.cindex.Config.set_library_path(llvmLibPath)
def list_headers_in_dir(path):
# enumerates a folder but keeps the full pathing for the files returned
# and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)
# list all the files in the folder
files = os.listdir(path)
# only include .hxx files
files = list(filter(lambda x: x.endswith('.hxx'), files))
# add the folder path back on
files = list(map(lambda x: path + x, files))
return files
# parse through the list of files specified and expand wildcards
fullFileList = []
for filePath in fileList:
if "*" in filePath:
# wildcard path
basePath = filePath[:-1]
if "*" in basePath:
# if there is still a wildcard, we have an issue...
raise NotImplementedError(
"wildcard only supported at end of file path")
files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
fullFileList = fullFileList + files
else:
# normal path
ff = os.path.join(cxxClientRoot, filePath)
fullFileList.append(ff)
print("DBUG",ff)
# exclude _json.hxx files
fullFileList = list(
filter(lambda x: not x.endswith('_json.hxx'), fullFileList))
# exclude _fmt.hxx files
fullFileList = list(
filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))
# generate a list of regexps from the type list (for handling wildcards)
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))
def is_included_type(name, with_durability=False):
# TODO(brett19): This should be generalized somehow...
if "is_compound_operation" in name:
return False
if "replica_context" in name:
return False
if with_durability is True and '_with_legacy_durability' not in name:
return False
for x in typeListRe:
if re.fullmatch(x, name):
return True
return False
opTypes = []
opEnums = []
def parse_type(type):
typeStr = type.get_canonical().spelling
return parse_type_str(typeStr)
std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]
def parse_type_str(typeStr):
if typeStr == "std::mutex":
return {"name": "std::mutex"}
if typeStr == "std::string":
return {"name": "std::string"}
if typeStr == "std::chrono::duration<long long>":
return {"name": "std::chrono::seconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
return {"name": "std::chrono::milliseconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
return {"name": "std::chrono::microseconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
return {"name": "std::chrono::nanoseconds"}
if typeStr == "std::error_code":
return {"name": "std::error_code"}
if typeStr == "std::monostate":
return {"name": "std::monostate"}
if typeStr == "std::byte":
return {"name": "std::byte"}
if typeStr == "unsigned long":
return {"name": "std::size_t"}
if typeStr == "char":
return {"name": "std::int8_t"}
if typeStr == "unsigned char":
return {"name": "std::uint8_t"}
if typeStr == "short":
return {"name": "std::int16_t"}
if typeStr == "unsigned short":
return {"name": "std::uint16_t"}
if typeStr == "int":
return {"name": "std::int32_t"}
if typeStr == "unsigned int":
return {"name": "std::uint32_t"}
if typeStr == "long long":
return {"name": "std::int64_t"}
if typeStr == "unsigned long long":
return {"name": "std::uint64_t"}
if typeStr == "bool":
return {"name": "std::bool"}
if typeStr == "float":
return {"name": "std::float"}
if typeStr == "double":
return {"name": "std::double"}
if typeStr == "std::nullptr_t":
return {"name": "std::nullptr_t"}
if typeStr in std_comparators:
return {"name": typeStr}
tplParts = typeStr.split("<", 1)
if len(tplParts) > 1:
tplClassName = tplParts[0]
tplParams = tplParts[1][:-1]
if tplClassName == "std::function":
return {
"name": "std::function"
}
if tplClassName == "std::optional":
return {
"name": "std::optional",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::vector":
return {
"name": "std::vector",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::set":
return {
"name": "std::set",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::variant":
variantParts = tplParams.split(", ")
variantTypes = []
for variantPart in variantParts:
variantTypes.append(parse_type_str(variantPart))
return {
"name": "std::variant",
"of": variantTypes
}
if tplClassName == "std::array":
variantParts = tplParams.split(", ")
if len(variantParts) != 2:
print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
return {"name": "unknown", "str": typeStr}
return {
"name": "std::array",
"of": parse_type_str(variantParts[0]),
"size": int(variantParts[1])
}
if tplClassName == "std::map":
variantParts = tplParams.split(", ")
if len(variantParts) < 2 or len(variantParts) > 3:
print("FAILED TO PARSE MAP TYPES: " + typeStr)
return {"name": "unknown", "str": typeStr}
if len(variantParts) == 2:
return {
"name": "std::map",
"of": parse_type_str(variantParts[0]),
"to": parse_type_str(variantParts[1])
}
else:
return {
"name": "std::map",
"of": parse_type_str(variantParts[0]),
"to": parse_type_str(variantParts[1]),
"comparator": parse_type_str(variantParts[2])
}
if tplClassName == "std::shared_ptr":
return {
"name": "std::shared_ptr",
"of": parse_type_str(tplParams)
}
#return {"name": "unknown", "str": typeStr}
if 'unnamed struct' in typeStr:
print("WARNING: Found unnamed struct: " + typeStr)
return {"name": typeStr}
internal_structs = []
UNNAMED_STRUCT_DELIM = '::(unnamed struct'
def traverse(node, namespace, main_file):
# only scan the elements of the file we parsed
if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
print("#FILE", node.location.file )
print("REFL_TYPE(" + fullStructName + ")")
structFields = []
for child in node.get_children():
if child.kind == clang.cindex.CursorKind.FIELD_DECL:
struct_type = parse_type(child.type)
type_str = child.type.get_canonical().spelling
print(" REFL_FIELD(" + child.displayname + ")")
if 'unnamed' in type_str:
name_tokens = type_str.split('::')
name_override = '::'.join(name_tokens[:-1] + [child.displayname])
struct_type['name'] = name_override
internal_structs.append(name_override)
structFields.append({
"name": child.displayname,
"type": struct_type,
})
# replica read changes introduced duplicate get requests
#if any(map(lambda op: op['name'] == fullStructName, opTypes)):
# return
#opTypes.append({
# "name": fullStructName,
# "fields": structFields,
#})
print("REFL_END")
if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
if is_included_type(fullStructName, with_durability=True):
type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
if type_ref:
base_request_name = type_ref.displayname.replace('struct', '').strip()
base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
if base_request:
new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
new_fields.extend([
{"name":"persist_to", "type":{"name":"couchbase::persist_to"}},
{"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}}
])
opTypes.append({
"name": fullStructName,
"fields": new_fields
})
if node.kind == clang.cindex.CursorKind.ENUM_DECL:
fullEnumName = "::".join([*namespace, node.displayname])
if is_included_type(fullEnumName):
enumValues = []
for child in node.get_children():
if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
enumValues.append({
"name": child.displayname,
"value": child.enum_value,
})
opEnums.append({
"name": fullEnumName,
"type": parse_type(node.enum_type),
"values": enumValues,
})
if node.kind == clang.cindex.CursorKind.NAMESPACE:
namespace = [*namespace, node.displayname]
if node.kind == clang.cindex.CursorKind.CLASS_DECL:
namespace = [*namespace, node.displayname]
if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
namespace = [*namespace, node.displayname]
for child in node.get_children():
traverse(child, namespace, main_file)
for headerPath in fullFileList:
print("processing " + headerPath)
index = clang.cindex.Index.create()
args = [
'-std=c++17',
]
try:
translation_unit = index.parse(headerPath, args=args)
except Exception as e:
print(e)
import pdb
pdb.set_trace()
raise e
# output clang compiler diagnostics information (for debugging)
for diagnostic in translation_unit.diagnostics:
diagnosticMsg = diagnostic.format()
print(diagnostic)
traverse(translation_unit.cursor, [], headerPath)
jsonData = json.dumps({
'op_structs': opTypes,
'op_enums': opEnums
})
f = open("bindings.json", "w")
f.write(jsonData)
f.close()
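Run against server.cpp, the traversal above prints one registration block per struct or class it visits (preceded by a #FILE marker) and dumps the collected enum metadata to bindings.json; note that the opTypes.append call is commented out, so op_structs stays empty unless re-enabled. For the hypothetical server_info struct from the sketch near the top of this page, the emitted block would look like:

#FILE examples/server/server.cpp
REFL_TYPE(server_info)
 REFL_FIELD(hostname)
 REFL_FIELD(port)
REFL_END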

examples/main/main.cpp

@ -31,6 +31,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#include "print.hpp"
static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_params * g_params;
@ -99,6 +101,7 @@ static void sigint_handler(int signo) {
}
}
#endif
using namespace refl;
int main(int argc, char ** argv) {
gpt_params params;
@ -117,6 +120,7 @@ int main(int argc, char ** argv) {
// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
print_fields(params);
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
@ -234,6 +238,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
print_fields(*model);
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
if (params.chatml) {
@ -277,7 +283,8 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
print_fields(*ctx);
//print_fields(session_tokens);
// debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0;
if (!session_tokens.empty()) {
@ -365,6 +372,10 @@ int main(int argc, char ** argv) {
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
print_fields(*ctx_guidance);
}
if (params.n_keep > 0) {
@ -473,6 +484,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
print_fields(*ctx_sampling);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
@ -508,6 +520,7 @@ int main(int argc, char ** argv) {
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
print_fields(*ctx);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@ -624,7 +637,7 @@ int main(int argc, char ** argv) {
}
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
//print_fields(id);
llama_sampling_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

examples/server/server.cpp

@ -24,6 +24,7 @@
#include <thread>
#include <mutex>
#include <chrono>
#include "print.hpp"
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
@ -33,6 +34,9 @@
using json = nlohmann::json;
REFL_TYPE(std::less< ::nlohmann::detail::value_t>)
REFL_END
struct server_params
{
std::string hostname = "127.0.0.1";
@ -41,6 +45,13 @@ struct server_params
int32_t read_timeout = 600;
int32_t write_timeout = 600;
};
REFL_TYPE(server_params)
REFL_FIELD(hostname)
REFL_FIELD(public_path)
REFL_FIELD(port)
REFL_FIELD(read_timeout)
REFL_FIELD(write_timeout)
REFL_END
static bool server_verbose = false;
@ -157,6 +168,15 @@ struct task_server {
bool embedding_mode = false;
};
REFL_TYPE(task_server)
REFL_FIELD(id)
REFL_FIELD(target_id)
REFL_FIELD(type)
REFL_FIELD(data)
REFL_FIELD(infill_mode)
REFL_FIELD(embedding_mode)
REFL_END
struct task_result {
int id;
bool stop;
@ -193,6 +213,18 @@ struct slot_params
json input_suffix;
};
REFL_TYPE(slot_params)
REFL_FIELD(stream)
REFL_FIELD(cache_prompt)
REFL_FIELD(seed)
REFL_FIELD(n_keep)
REFL_FIELD(n_predict)
REFL_FIELD(antiprompt)
REFL_FIELD(input_prefix)
REFL_FIELD(input_suffix)
REFL_END
struct slot_image
{
int32_t id;
@ -220,6 +252,17 @@ struct completion_token_output
std::string text_to_send;
};
REFL_TYPE(completion_token_output)
REFL_FIELD(probs)
REFL_FIELD(tok)
REFL_FIELD(text_to_send)
REFL_END
REFL_TYPE(completion_token_output::token_prob)
REFL_FIELD(tok)
REFL_FIELD(prob)
REFL_END
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
size_t i;
@ -496,6 +539,51 @@ struct llama_client_slot
}
};
//REFL_TYPE(llama_client_slot::llama_sampling_params)
//REFL_END
REFL_TYPE(llama_client_slot)
REFL_FIELD(id)
REFL_FIELD(task_id)
REFL_FIELD(params)
REFL_FIELD(state)
REFL_FIELD(command)
REFL_FIELD(t_last_used)
REFL_FIELD(n_ctx)
REFL_FIELD(n_past)
REFL_FIELD(n_decoded)
REFL_FIELD(n_remaining)
REFL_FIELD(i_batch)
REFL_FIELD(num_prompt_tokens)
REFL_FIELD(num_prompt_tokens_processed)
REFL_FIELD(multibyte_pending)
REFL_FIELD(prompt)
REFL_FIELD(generated_text)
REFL_FIELD(sampled)
REFL_FIELD(cache_tokens)
REFL_FIELD(generated_token_probs)
REFL_FIELD(infill)
REFL_FIELD(embedding)
REFL_FIELD(has_next_token)
REFL_FIELD(truncated)
REFL_FIELD(stopped_eos)
REFL_FIELD(stopped_word)
REFL_FIELD(stopped_limit)
REFL_FIELD(oaicompat)
REFL_FIELD(oaicompat_model)
REFL_FIELD(stopping_word)
REFL_FIELD(sparams)
REFL_FIELD(ctx_sampling)
REFL_FIELD(images)
REFL_FIELD(sent_count)
REFL_FIELD(sent_token_probs_index)
REFL_FIELD(t_start_process_prompt)
REFL_FIELD(t_start_genereration)
REFL_FIELD(t_prompt_processing)
REFL_FIELD(t_token_generation)
REFL_END
struct llama_server_context
{
llama_model *model = nullptr;
@ -878,7 +966,7 @@ struct llama_server_context
all_slots_are_idle = false;
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
print_fields(*slot);
return true;
}
@ -1787,6 +1875,31 @@ struct llama_server_context
}
};
REFL_TYPE(llama_server_context)
REFL_FIELD(model)
REFL_FIELD(ctx)
REFL_FIELD(clp_ctx)
REFL_FIELD(params)
REFL_FIELD(batch)
REFL_FIELD(multimodal)
REFL_FIELD(clean_kv_cache)
REFL_FIELD(all_slots_are_idle)
REFL_FIELD(add_bos_token)
REFL_FIELD(id_gen)
REFL_FIELD(n_ctx)
REFL_FIELD(system_need_update)
REFL_FIELD(system_prompt)
REFL_FIELD(system_tokens)
REFL_FIELD(name_user)
REFL_FIELD(name_assistant)
REFL_FIELD(slots)
REFL_FIELD(queue_tasks)
REFL_FIELD(queue_results)
REFL_FIELD(mutex_tasks)
REFL_FIELD(mutex_results)
REFL_END
static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams)
{
@ -2497,6 +2610,11 @@ struct token_translator
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
};
REFL_TYPE(token_translator)
REFL_FIELD(ctx)
REFL_END
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
{
auto & gtps = slot->generated_token_probs;

ggml-alloc.cpp

@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
free(galloc->parse_seq);
galloc->parse_seq = malloc(sizeof(int) * n);
galloc->parse_seq = (int*)malloc(sizeof(int) * n);
for (int i = 0; i < n; i++) {
galloc->parse_seq[i] = list[i];
@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
if (galloc->hash_values != NULL) {
free(galloc->hash_values);
}
galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size);
galloc->hash_set.size = hash_size;
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
}
// reset hash table
@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
// alloc hash_values if needed
if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values_size = hash_size;
}

ggml-backend.cpp

@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer));
GGML_ASSERT(iface.get_base != NULL);
@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
// TODO: allow backends to support copy to/from same backend
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst);
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst);
} else {
// shouldn't be hit when copying from/to CPU
#ifndef NDEBUG
@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu {
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cgraph = *cgraph;
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size);
}
return cpu_plan;
@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = cpu_ctx->work_data;
cplan.work_data = (uint8_t*)cpu_ctx->work_data;
ggml_graph_compute(cgraph, &cplan);
}
@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = {
};
ggml_backend_t ggml_backend_cpu_init(void) {
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context));
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_i,
@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched));
memset(sched, 0, sizeof(struct ggml_backend_sched));
fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
// initialize hash tables
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
sched->hash_set.size = hash_size;
sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size);
sched_split_graph(sched, measure_graph);
sched_alloc_splits(sched);

ggml-impl.h

@ -22,7 +22,7 @@ extern "C" {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
//#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

258
ggml-internal.hpp Normal file

@ -0,0 +1,258 @@
struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
ggml_context():
mem_size(0),
mem_buffer(0),
mem_buffer_owned(0),
no_alloc(0),
no_alloc_save(0),
n_objects(0),
objects_begin(0),
objects_end(0),
scratch(),
scratch_save()
{
}
};
struct ggml_context_container {
bool used;
struct ggml_context context;
ggml_context_container(): used(0),context(){
}
};
typedef double ggml_float;
typedef void * thread_ret_t;
#define MAX_FREE_BLOCKS 256
struct free_block {
void * addr;
size_t size;
};
struct ggml_tallocr {
struct ggml_backend_buffer * buffer;
bool buffer_owned;
void * base;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
bool measure;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
struct hash_node {
int n_children;
int n_views;
};
typedef struct ggml_tallocr * ggml_tallocr_t;
typedef struct ggml_gallocr * ggml_gallocr_t;
struct ggml_gallocr {
ggml_tallocr_t talloc;
struct ggml_hash_set hash_set;
struct hash_node * hash_values;
size_t hash_values_size;
ggml_tallocr_t * hash_allocs;
int * parse_seq;
int parse_seq_len;
};
struct ggml_allocr {
ggml_tallocr_t talloc;
ggml_gallocr_t galloc;
};
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
ggml_state():contexts(), numa()
{
}
};
struct gguf_str {
uint64_t n; // GGUFv2
char * data;
};
struct ggml_map_custom1_op_params {
ggml_custom1_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun;
int n_tasks;
void * userdata;
};
struct hash_map {
struct ggml_hash_set set;
struct ggml_tensor ** vals;
};
#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include<atomic>
using namespace std;
#endif
struct ggml_compute_state_shared {
const struct ggml_cgraph * cgraph;
const struct ggml_cplan * cplan;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
const int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
};
typedef pthread_t ggml_thread_t;
struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
struct ggml_compute_state_shared * shared;
};
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct gguf_array_T {
enum gguf_type type;
uint64_t n; // GGUFv2
void * data;
} arr;
};
struct ggml_lbfgs_iteration_data {
float alpha;
float ys;
float * s;
float * y;
};
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
};
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void * data;
size_t size;
};
struct gguf_context {
struct gguf_header header;
struct gguf_kv * kv;
struct gguf_tensor_info * infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
//uint8_t * padding;
void * data;
};
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
#include "ggml-backend-impl.h"
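One detail worth flagging in the header above: these translation units are now compiled as C++, where C11's <stdatomic.h> is not usable, so the non-Windows branch includes <atomic> and relies on using namespace std to keep the existing atomic_int spelling valid. A minimal sketch of the equivalence (the function and variable names are illustrative, not from the commit):

#include <atomic>
#include <cstdio>

// std::atomic_int is the C++ counterpart of C11's atomic_int; the
// `using namespace std;` in the header makes the bare name resolve to it.
using atomic_int = std::atomic<int>;

static int leave_barrier(atomic_int & n_active) {
    // same observable effect as C11's atomic_fetch_sub(&n_active, 1)
    return n_active.fetch_sub(1);
}

int main() {
    atomic_int n_active{4};
    std::printf("threads remaining: %d\n", leave_barrier(n_active) - 1);
}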

File diff suppressed because it is too large.

ggml-quants.h

@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);

ggml.cpp

@ -38,6 +38,14 @@
#pragma warning(disable: 4996)
#endif
// initializers for static data; they are called from ggml_init()
static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
void type_traits_init();
void GGUF_TYPE_SIZE_init();
void GGUF_TYPE_NAME_init();
#if defined(_WIN32)
#include <windows.h>
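The type_traits_init(), GGUF_TYPE_SIZE_init(), and GGUF_TYPE_NAME_init() declarations above exist because C99 designated array initializers such as [GGML_TYPE_I8] = {...} are not valid C++, so the formerly const, statically initialized tables become mutable arrays that ggml_init() fills at runtime. A minimal, self-contained sketch of the pattern (the enum and struct names are illustrative):

#include <cstdio>

enum sample_type { SAMPLE_I8, SAMPLE_I16, SAMPLE_COUNT };

struct sample_traits {
    const char * type_name;
    int          type_size;
};

// was (C only): static const sample_traits traits[SAMPLE_COUNT] =
//                   { [SAMPLE_I8] = { "i8", 1 }, ... };
static sample_traits traits[SAMPLE_COUNT];

static void traits_init() {
    traits[SAMPLE_I8]  = { "i8",  1 };
    traits[SAMPLE_I16] = { "i16", 2 };
}

int main() {
    traits_init();   // in the commit, type_traits_init() runs at the top of ggml_init()
    std::printf("%s: %d byte(s)\n", traits[SAMPLE_I8].type_name, traits[SAMPLE_I8].type_size);
}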
@ -86,7 +94,9 @@ static int sched_yield (void) {
}
#else
#include <pthread.h>
#include <stdatomic.h>
//#include <stdatomic.h>
#include <atomic>
using namespace std;
typedef void * thread_ret_t;
@ -96,6 +106,8 @@ typedef void * thread_ret_t;
#endif
#include <atomic>
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
void type_traits_init(){
type_traits[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
.type_size = sizeof(int8_t),
.is_quantized = false,
},
[GGML_TYPE_I16] = {
};
type_traits[GGML_TYPE_I16] = {
.type_name = "i16",
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
},
[GGML_TYPE_I32] = {
};
type_traits[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_F32] = {
};
type_traits[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
},
[GGML_TYPE_F16] = {
};
type_traits[GGML_TYPE_F16] = {
.type_name = "f16",
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16,
},
[GGML_TYPE_Q4_0] = {
};
type_traits[GGML_TYPE_Q4_0] = {
.type_name = "q4_0",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q4_1] = {
};
type_traits[GGML_TYPE_Q4_1] = {
.type_name = "q4_1",
.blck_size = QK4_1,
.type_size = sizeof(block_q4_1),
@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
.vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[4] = { // GGML_TYPE_Q4_2
};
type_traits[4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
@ -482,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
},
[5] = { // GGML_TYPE_Q4_3
};
type_traits[5] = { // GGML_TYPE_Q4_3
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
},
[GGML_TYPE_Q5_0] = {
};
type_traits[GGML_TYPE_Q5_0] = {
.type_name = "q5_0",
.blck_size = QK5_0,
.type_size = sizeof(block_q5_0),
@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
.vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_1] = {
};
type_traits[GGML_TYPE_Q5_1] = {
.type_name = "q5_1",
.blck_size = QK5_1,
.type_size = sizeof(block_q5_1),
@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
.vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q8_0] = {
};
type_traits[GGML_TYPE_Q8_0] = {
.type_name = "q8_0",
.blck_size = QK8_0,
.type_size = sizeof(block_q8_0),
@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q8_1] = {
};
type_traits[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.blck_size = QK8_1,
.type_size = sizeof(block_q8_1),
@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q2_K] = {
};
type_traits[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
.type_size = sizeof(block_q2_K),
@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q3_K] = {
};
type_traits[GGML_TYPE_Q3_K] = {
.type_name = "q3_K",
.blck_size = QK_K,
.type_size = sizeof(block_q3_K),
@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
.vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q4_K] = {
};
type_traits[GGML_TYPE_Q4_K] = {
.type_name = "q4_K",
.blck_size = QK_K,
.type_size = sizeof(block_q4_K),
@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
.vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q5_K] = {
};
type_traits[GGML_TYPE_Q5_K] = {
.type_name = "q5_K",
.blck_size = QK_K,
.type_size = sizeof(block_q5_K),
@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
.vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q6_K] = {
};
type_traits[GGML_TYPE_Q6_K] = {
.type_name = "q6_K",
.blck_size = QK_K,
.type_size = sizeof(block_q6_K),
@ -590,15 +604,15 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
.vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q8_K] = {
};
type_traits[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K,
}
};
};
}
// For internal test use
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
#ifdef GGML_SIMD
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
*s = sumf;
}
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) {
ggml_float sumf = 0.0;
#if defined(GGML_SIMD)
@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest
// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
}
}
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
}
// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
const float * restrict x[GGML_VEC_MAD_UNROLL];
const float * restrict v[GGML_VEC_MAD_UNROLL];
const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
x[i] = (const float *) ((const char *) xv + i*xs);
@ -2175,11 +2189,19 @@ static inline int ggml_up(int n, int m) {
////////////////////////////////////////////////////////////////////////////////
struct ggml_context * ggml_init(struct ggml_init_params params) {
// initialize the data in the arrays
type_traits_init();
GGUF_TYPE_SIZE_init();
GGUF_TYPE_NAME_init();
struct ggml_context * ctx = NULL;
static bool is_first_call = true;
// make this function thread safe
ggml_critical_section_start();
static bool is_first_call = true;
if (is_first_call) {
// initialize time system (required on Windows)
@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
// find non-used context in g_state
struct ggml_context * ctx = NULL;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
// align to GGML_MEM_ALIGN
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
ctx->scratch.offs += data_size;
} else {
@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
char * const data = (char*)tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
char * const data = (char*)tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ACC;
@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_SET;
@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d(
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_2D;
@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32(
GGML_ASSERT(nb00 == sizeof(float));
if (ggml_is_contiguous(dst)) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
} else {
for (int k3 = 0; k3 < ne3; k3++) {
for (int k2 = 0; k2 < ne2; k2++) {
@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat(
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
char * wdata = (char*)params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#endif
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
return;
}
@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
return;
}
@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d(
struct ggml_tensor * dst) {
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
enum ggml_op_pool op = (ggml_op_pool)opts[0];
const int k0 = opts[1];
const int s0 = opts[2];
const int p0 = opts[3];
@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d(
}
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
enum ggml_op_pool op = (ggml_op_pool)opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
size = ggml_hash_size(size);
struct ggml_hash_set result;
result.size = size;
result.keys = malloc(sizeof(struct ggml_tensor *) * size);
result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size);
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
return result;
}
@ -14113,9 +14136,9 @@ struct hash_map {
};
static struct hash_map * ggml_new_hash_map(size_t size) {
struct hash_map * result = malloc(sizeof(struct hash_map));
struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map));
result->set = ggml_hash_set_new(size);
result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size);
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
return result;
}
@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads);
// create thread pool
if (n_threads > 1) {
@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
continue;
}
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");
@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam(
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
float * g = opt->adam.g->data; // gradients
float * m = opt->adam.m->data; // first moment
float * v = opt->adam.v->data; // second moment
float * g = (float*)opt->adam.g->data; // gradients
float * m = (float*)opt->adam.m->data; // first moment
float * v = (float*)opt->adam.v->data; // second moment
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
float * pf = params.past > 0 ? (float *)opt->adam.pf->data : NULL; // past function values
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking(
} else {
// Armijo condition is satisfied
if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
return count;
return (ggml_opt_result)count;
}
ggml_vec_dot_f32(nx, &dg, g, d);
@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking(
} else {
if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
// regular Wolfe conditions
return count;
return (ggml_opt_result)count;
}
if(dg > -params->lbfgs.wolfe*dginit) {
width = dec;
} else {
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
return count;
return (ggml_opt_result)count;
}
}
}
@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs(
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters
float * g = opt->lbfgs.g->data; // current gradient
float * gp = opt->lbfgs.gp->data; // previous gradient
float * d = opt->lbfgs.d->data; // search direction
float * x = (float*)opt->lbfgs.x->data; // current parameters
float * xp = (float*)opt->lbfgs.xp->data; // previous parameters
float * g = (float*)opt->lbfgs.g->data; // current gradient
float * gp = (float*)opt->lbfgs.gp->data; // previous gradient
float * d = (float*)opt->lbfgs.d->data; // search direction
float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
float * pf = params.past > 0 ? (float*)opt->lbfgs.pf->data : NULL; // past function values
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_opt_get_params(np, ps, x);
// the L-BFGS memory
float * lm_alpha = opt->lbfgs.lmal->data;
float * lm_ys = opt->lbfgs.lmys->data;
float * lm_s = opt->lbfgs.lms->data;
float * lm_y = opt->lbfgs.lmy->data;
float * lm_alpha = (float*)opt->lbfgs.lmal->data;
float * lm_ys = (float*)opt->lbfgs.lmys->data;
float * lm_s = (float*)opt->lbfgs.lms->data;
float * lm_y = (float*)opt->lbfgs.lmy->data;
bool cancel = false;
@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_cpy_f32(nx, g, gp);
return ls;
return (ggml_opt_result)ls;
}
opt->loss_after = fx;
@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK4_0;
for (int b = 0; b < n; b += k) {
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0;
quantize_row_q4_0_reference(src + b, y, k);
@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK4_1;
for (int b = 0; b < n; b += k) {
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1;
quantize_row_q4_1_reference(src + b, y, k);
@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK5_0;
for (int b = 0; b < n; b += k) {
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0;
quantize_row_q5_0_reference(src + b, y, k);
@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK5_1;
for (int b = 0; b < n; b += k) {
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1;
quantize_row_q5_1_reference(src + b, y, k);
@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK8_0;
for (int b = 0; b < n; b += k) {
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0;
quantize_row_q8_0_reference(src + b, y, k);
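restrict is a C99 keyword with no standard C++ spelling, so the quantization routines switch to the GGML_RESTRICT macro, which the ggml.h hunk below maps to the widely supported __restrict__ extension when compiling as C++. A minimal sketch of what the qualifier buys (function and body are illustrative):

    // promising the compiler that dst and src never alias lets it
    // vectorize the loop without runtime overlap checks:
    static void scale_f32(float * GGML_RESTRICT dst, const float * GGML_RESTRICT src, int n) {
        for (int i = 0; i < n; ++i) {
            dst[i] = 2.0f * src[i];
        }
    }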
@ -17928,37 +17951,39 @@ struct gguf_str {
char * data;
};
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
[GGUF_TYPE_INT8] = sizeof(int8_t),
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
[GGUF_TYPE_INT16] = sizeof(int16_t),
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
[GGUF_TYPE_INT32] = sizeof(int32_t),
[GGUF_TYPE_FLOAT32] = sizeof(float),
[GGUF_TYPE_BOOL] = sizeof(bool),
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
[GGUF_TYPE_INT64] = sizeof(int64_t),
[GGUF_TYPE_FLOAT64] = sizeof(double),
[GGUF_TYPE_ARRAY] = 0, // undefined
void GGUF_TYPE_SIZE_init() {
GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT8] = sizeof(int8_t);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT16] = sizeof(int16_t);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT32] = sizeof(uint32_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT32] = sizeof(int32_t);
GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float);
GGUF_TYPE_SIZE[GGUF_TYPE_BOOL] = sizeof(bool);
GGUF_TYPE_SIZE[GGUF_TYPE_STRING] = sizeof(struct gguf_str);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT64] = sizeof(uint64_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT64] = sizeof(int64_t);
GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double);
GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY] = 0; // undefined
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
[GGUF_TYPE_UINT64] = "u64",
[GGUF_TYPE_INT64] = "i64",
[GGUF_TYPE_FLOAT64] = "f64",
void GGUF_TYPE_NAME_init(){
GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8";
GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8";
GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16";
GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16";
GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32";
GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32";
GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32";
GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool";
GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str";
GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr";
GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64";
GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64";
GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64";
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
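C99 array designators such as [GGUF_TYPE_UINT8] = sizeof(uint8_t) are not valid C++, so both lookup tables are now filled by init functions (which also means the arrays can no longer be declared const). The diff does not show where these functions get called; a minimal sketch of one way to guarantee they run before any GGUF I/O, assuming nothing else invokes them earlier:

    // file-scope initializer runs once at program start (an assumption,
    // not necessarily the mechanism this commit uses):
    static const bool gguf_tables_ready = [] {
        GGUF_TYPE_SIZE_init();
        GGUF_TYPE_NAME_init();
        return true;
    }();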
@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
bool ok = true;
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1);
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
return ok;
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
bool ok = true;
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
// read the header
{
@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the kv pairs
{
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the tensor infos
{
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// create the tensors
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0],
ctx->infos[i].ne[1],
ctx->infos[i].ne[2],
ctx->infos[i].ne[3],
(int64_t)ctx->infos[i].ne[0],
(int64_t)ctx->infos[i].ne[1],
(int64_t)ctx->infos[i].ne[2],
(int64_t)ctx->infos[i].ne[3],
};
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key);
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
@ -18760,7 +18785,7 @@ void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name);
ctx->infos[idx].name.data = strdup(tensor->name);

16
ggml.h
View file

@ -285,8 +285,10 @@
GGML_UNUSED(prefix##3);
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
@ -1859,7 +1861,7 @@ extern "C" {
int n_gradient_accumulation;
// ADAM parameters
struct {
struct ggml_adam{
int n_iter;
float sched; // schedule multiplier (fixed, decay or warmup)
@ -1875,7 +1877,7 @@ extern "C" {
} adam;
// LBFGS parameters
struct {
struct ggml_lbfgs{
int m; // number of corrections to approximate the inv. Hessian
int n_iter;
int max_linesearch;
@ -1902,7 +1904,7 @@ extern "C" {
float loss_before;
float loss_after;
struct {
struct ggml_grad{
struct ggml_tensor * g; // current gradient
struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment
@ -1912,7 +1914,7 @@ extern "C" {
int n_no_improvement;
} adam;
struct {
struct ggml_params{
struct ggml_tensor * x; // current parameters
struct ggml_tensor * xp; // previous parameters
struct ggml_tensor * g; // current gradient
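These hunks are the commit's headline change: the previously anonymous nested structs get names, because refl-cpp's REFL_TYPE(...) needs a spellable type name and an anonymous struct has none. A minimal illustration with demo types (not the real ones):

    struct demo_opt_params {
        struct { int n_iter; } adam_anon;             // this type cannot be named from outside
        struct demo_adam { int n_iter; } adam_named;  // demo_opt_params::demo_adam is spellable
    };
    // print.hpp can now write REFL_TYPE(ggml_opt_params::ggml_adam) and friends.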
@ -2136,13 +2138,13 @@ extern "C" {
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#define GGML_RESTRICT __restrict__
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct {
typedef struct ggml_something{
const char * type_name;
int blck_size;
size_t type_size;
@ -2157,5 +2159,7 @@ extern "C" {
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif
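The CPP_ONLY guard lets one header serve both linkage models: without the define, C++ consumers of a C-compiled ggml still get the usual extern "C" wrapper; with it defined, as a build compiling these sources as C++ would do, the wrapper is skipped so declarations and definitions agree on C++ linkage. A sketch of the expanded pattern (ggml_example is a hypothetical declaration):

    #ifdef __cplusplus
    #ifndef CPP_ONLY
    extern "C" {              // only for C++ callers of a C-compiled ggml
    #endif
    #endif
    void ggml_example(void);  // hypothetical API declaration
    #ifdef __cplusplus
    #ifndef CPP_ONLY
    }
    #endif
    #endif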

896
llama-internal.hpp Normal file
View file

@ -0,0 +1,896 @@
#include <set>
#include <queue>
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GPT2,
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_UNKNOWN,
};
enum llm_kv {
LLM_KV_GENERAL_ARCHITECTURE,
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_URL,
LLM_KV_GENERAL_DESCRIPTION,
LLM_KV_GENERAL_LICENSE,
LLM_KV_GENERAL_SOURCE_URL,
LLM_KV_GENERAL_SOURCE_HF_REPO,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_TOKENIZER_MODEL,
LLM_KV_TOKENIZER_LIST,
LLM_KV_TOKENIZER_TOKEN_TYPE,
LLM_KV_TOKENIZER_SCORES,
LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID,
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
LLM_KV_TOKENIZER_ADD_BOS,
LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
};
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
};
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
GGUF_FILE_VERSION_V3 = 3,
};
struct LLM_KV {
LLM_KV(llm_arch arch) : arch(arch) {}
llm_arch arch;
std::string operator()(llm_kv kv) const; // moved to llama.cpp file
};
enum llm_tensor {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_POS_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
uint32_t n_yarn_orig_ctx;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
bool mul_mat_q;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_norm_b;
struct ggml_tensor * attn_norm_2;
struct ggml_tensor * attn_norm_2_b;
struct ggml_tensor * attn_q_norm;
struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
};
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
};
struct llama_buffer {
void * data = NULL;
size_t size = 0;
// fallback to malloc / free
// useful in cases where CUDA can try to allocate PINNED memory
bool fallback = false;
void resize(size_t n) ;
~llama_buffer();
};
// ring-buffer of cached KV data
struct llama_kv_cache {
bool has_shift = false;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
// computed before each graph build
uint32_t n = 0;
std::vector<llama_kv_cell> cells;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
llama_buffer buf;
~llama_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
}
#endif
}
};
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
GGML_ASSERT(token_left.find(" ") == std::string::npos);
GGML_ASSERT(token_left.find("\n") == std::string::npos);
GGML_ASSERT(token_right.find(" ") == std::string::npos);
GGML_ASSERT(token_right.find("\n") == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
return -1;
}
return it->second;
}
};
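// Example for find_bpe_rank (illustrative data, not from any model file):
// if bpe_ranks holds {("Ġ", "the") -> 4}, then find_bpe_rank("Ġ", "the")
// returns 4, and any pair never merged during BPE training returns -1.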
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
~llama_mmap();
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
#else
static constexpr bool SUPPORTED = false;
#endif
};
struct llama_hparams {
bool vocab_only;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;
float f_norm_eps;
float f_norm_rms_eps;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
int8_t rope_scaling_type_train : 3;
bool rope_finetuned : 1;
float f_clamp_kqv;
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const;
uint32_t n_gqa() const {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
}
};
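// Worked example (illustrative values): a LLaMA-7B style config has
// n_embd = 4096, n_head = 32 and n_head_kv = 32, giving n_gqa() == 1,
// n_embd_head() == 128 and n_embd_gqa() == 4096. A grouped-query model with
// n_head_kv = 8 instead gives n_gqa() == 4 and n_embd_gqa() == 1024,
// shrinking the per-token KV cache by the same factor.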
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() ;
llama_mlock(const llama_mlock &) = delete;
~llama_mlock();
void init(void * ptr);
void grow_to(size_t target_size);
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const ;
#undef MLOCK_SUGGESTION
static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
bool raw_lock(void * ptr, size_t len) const ;
static void raw_unlock(void * ptr, size_t len);
#else
static constexpr bool SUPPORTED = false;
static size_t lock_granularity();
bool raw_lock(const void * addr, size_t len) const;
static void raw_unlock(const void * addr, size_t len);
#endif
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embd;
struct ggml_tensor * pos_embd;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
std::vector<llama_layer> layers;
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// context
struct ggml_context * ctx = NULL;
// the model memory buffer
llama_buffer buf;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
// objects representing data potentially being locked in memory
llama_mlock mlock_buf;
llama_mlock mlock_mmap;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
~llama_model() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
ggml_cuda_free_scratch();
}
#endif
#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
#endif
}
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
~llama_context();
llama_cparams cparams;
const llama_model & model;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
std::mt19937 rng;
bool has_evaluated_once = false;
int64_t t_start_us;
int64_t t_load_us;
int64_t t_sample_us = 0;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int32_t n_sample = 0; // number of tokens sampled
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// reusable buffer for `struct ggml_graph_plan.work_data`
std::vector<uint8_t> work_buffer;
// memory buffers used to evaluate the model
llama_buffer buf_compute;
llama_buffer buf_alloc;
ggml_allocr * alloc = NULL;
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
#endif
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
};
struct LLM_TN {
LLM_TN(llm_arch arch) ;
llm_arch arch;
std::string operator()(llm_tensor tensor) const;
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
std::string operator()(llm_tensor tensor, int bid) const ;
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
};
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) ;
size_t tell() const;
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const ;
void write_u32(std::uint32_t val) const;
~llama_file();
};
struct llama_state {
llama_state();
// We save the log callback globally
ggml_log_callback log_callback;
void * log_callback_user_data = nullptr;
};
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
int n_created = 0;
int64_t n_elements = 0;
size_t n_bytes = 0;
bool use_mmap = false;
llama_file file;
llama_ftype ftype;
llama_fver fver;
std::unique_ptr<llama_mmap> mapping;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;
llama_model_loader(const std::string & fname, bool use_mmap) ;
~llama_model_loader();
std::string get_arch_name() const;
enum llm_arch get_arch() const ;
const char * get_tensor_name(int i) const;
struct ggml_tensor * get_tensor_meta(int i) const;
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ;
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) ;
void done_getting_tensors() const;
size_t file_offset(const char * name) const;
void load_data_for(struct ggml_tensor * cur) const ;
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ;
};
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t * ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) ;
void write(const void * src, size_t size) override ;
size_t get_size_written() override ;
};
struct llama_data_file_context : llama_data_context {
llama_file * file;
size_t size_written = 0;
llama_data_file_context(llama_file * f);
size_t get_size_written() override ;
void write(const void * src, size_t size);
};
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const ;
void shift_tokens(const size_t n) ;
llama_beam_view view() const;
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx);
llama_token_data get_token_data(const llama_token token_id) const ;
std::vector<llama_token_data> top_k(size_t k) ;
float probability_from_logit(float logit) const ;
};
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
size_t common_prefix_length;
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);
void collapse_beams(const size_t beam_idx) ;
void fill_next_beams_by_top_probabilities(llama_beam & beam) ;
size_t find_common_prefix_length() ;
llama_beams_state get_beams_state(const bool last_call) ;
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) ;
size_t top_beam_index();
void update_beams_from_beam_views();
};
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
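// Example (an illustrative sketch, not part of the original header): a build
// callback that just names tensors as the graph is constructed; nl is the
// layer index, or -1 for tensors that belong to no layer.
inline llm_build_cb llm_make_naming_cb() {
    return [](struct ggml_tensor * cur, const char * name, int nl) {
        (void) nl;
        ggml_set_name(cur, name);
    };
}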
enum llm_rope_type {
LLM_ROPE,
LLM_ROPE_NEOX,
LLM_ROPE_GLM,
};
enum llm_ffn_op_type {
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
};
enum llm_ffn_gate_type {
LLM_FFN_SEQ,
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
enum llm_norm_type {
LLM_NORM,
LLM_NORM_RMS,
};
struct llm_build_context {
const llama_model & model;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
const llama_kv_cache & kv_self;
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head;
const int64_t n_embd_gqa;
const float freq_base;
const float freq_scale;
const float ext_factor;
const float attn_factor;
const float beta_fast;
const float beta_slow;
const float norm_eps;
const float norm_rms_eps;
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_orig_ctx;
const bool do_rope_shift;
const llm_build_cb & cb;
llama_buffer & buf_compute;
struct ggml_context * ctx0 = nullptr;
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_batch & batch,
const llm_build_cb & cb,
bool worst_case);
void init() ;
void free() ;
struct ggml_cgraph * build_llama() ;
struct ggml_cgraph * build_baichuan() ;
struct ggml_cgraph * build_falcon() ;
struct ggml_cgraph * build_starcoder() ;
struct ggml_cgraph * build_persimmon() ;
struct ggml_cgraph * build_refact() ;
struct ggml_cgraph * build_bloom() ;
struct ggml_cgraph * build_mpt() ;
struct ggml_cgraph * build_stablelm();
};
enum llm_offload_func_e {
OFFLOAD_FUNC_NOP,
OFFLOAD_FUNC,
OFFLOAD_FUNC_KQ,
OFFLOAD_FUNC_V,
OFFLOAD_FUNC_NR,
OFFLOAD_FUNC_EMB,
OFFLOAD_FUNC_OUT,
};
struct llm_offload_trie {
struct node {
~node() ;
node * children[256] = { nullptr };
llm_offload_func_e func = OFFLOAD_FUNC_NOP;
};
node * root = nullptr;
llm_offload_trie();
llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) ;
~llm_offload_trie();
void add(const char * name, llm_offload_func_e func);
llm_offload_func_e find(const char * name) const;
};
struct llm_symbol {
using index = int;
index prev;
index next;
const char * text;
size_t n;
};
struct llm_bigram_spm {
struct comparator {
bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
};
using queue_storage = std::vector<llm_bigram_spm>;
using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
float score;
size_t size;
};
struct llm_tokenizer_spm {
llm_tokenizer_spm(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) ;
void try_add_bigram(int left, int right) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
llm_bigram_spm::queue work_queue;
std::map<std::string, std::pair<int, int>> rev_merge;
};
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
struct llm_bigram_bpe {
struct comparator {
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ;
};
using queue_storage = std::vector<llm_bigram_bpe>;
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
std::string text;
int rank;
size_t size;
};
struct llm_tokenizer_bpe {
llm_tokenizer_bpe(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void add_new_bigram(int left, int right) ;
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final;
llm_bigram_bpe::queue work_queue;
};
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant{
fragment_buffer_variant(llama_vocab::id _token);
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);
const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
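// Example (illustrative): after accepting the lone byte 0xE2, the first byte
// of a 3-byte UTF-8 sequence, value holds its payload bits (0x2) and
// n_remain == 2; a malformed sequence is recorded as n_remain == -1.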
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
struct quantize_state_internal {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
int n_k_quantized = 0;
int n_fallback = 0;
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};

llama.h
View file

@ -50,7 +50,9 @@
#endif
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif
//
@ -827,8 +829,10 @@ extern "C" {
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
@ -844,4 +848,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H

763
print.hpp Normal file
View file

@ -0,0 +1,763 @@
#include <refl-cpp/refl.hpp>
#include <iostream>
#include "llama.h"
#include "ggml-internal.hpp"
#include "llama-internal.hpp"
REFL_TYPE(ggml_init_params )
REFL_END
// these registrations rely on the patch that names ggml's nested data types
#define ggml_opt_params_names
#ifdef ggml_opt_params_names
REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END
REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END
REFL_TYPE(ggml_opt_context::ggml_grad )
REFL_END
#endif
REFL_TYPE(gpt_params )
REFL_FIELD( seed )
REFL_FIELD( n_threads)
REFL_FIELD( n_threads_batch)
REFL_FIELD( n_predict )
REFL_FIELD( n_ctx )
REFL_FIELD( n_batch)
REFL_FIELD( n_keep )
REFL_FIELD( n_draft)
REFL_FIELD( n_chunks )
REFL_FIELD( n_parallel)
REFL_FIELD( n_sequences)
REFL_FIELD( p_accept )
REFL_FIELD( p_split )
REFL_FIELD( n_gpu_layers)
REFL_FIELD( n_gpu_layers_draft)
REFL_FIELD( main_gpu )
REFL_FIELD( tensor_split)
REFL_FIELD( n_beams )
REFL_FIELD(rope_freq_base)
REFL_FIELD( rope_freq_scale )
REFL_FIELD( yarn_ext_factor )
REFL_FIELD( yarn_attn_factor )
REFL_FIELD( yarn_beta_fast )
REFL_FIELD( yarn_beta_slow )
REFL_FIELD( yarn_orig_ctx)
REFL_FIELD( rope_scaling_type)
REFL_FIELD( sparams)
REFL_FIELD(model )
REFL_FIELD(model_draft )
REFL_FIELD(model_alias)
REFL_FIELD(prompt )
REFL_FIELD(prompt_file )
REFL_FIELD(path_prompt_cache )
REFL_FIELD(input_prefix )
REFL_FIELD(input_suffix )
REFL_FIELD( antiprompt)
REFL_FIELD(logdir )
REFL_FIELD( lora_adapter)
REFL_FIELD(lora_base )
REFL_FIELD( ppl_stride )
REFL_FIELD( ppl_output_type )
REFL_FIELD( hellaswag )
REFL_FIELD( hellaswag_tasks )
REFL_FIELD( mul_mat_q )
REFL_FIELD( memory_f16)
REFL_FIELD( random_prompt )
REFL_FIELD( use_color )
REFL_FIELD( interactive )
REFL_FIELD( chatml )
REFL_FIELD( prompt_cache_all )
REFL_FIELD( prompt_cache_ro )
REFL_FIELD( embedding )
REFL_FIELD( escape )
REFL_FIELD( interactive_first )
REFL_FIELD( multiline_input )
REFL_FIELD( simple_io )
REFL_FIELD( cont_batching )
REFL_FIELD( input_prefix_bos )
REFL_FIELD( ignore_eos )
REFL_FIELD( instruct )
REFL_FIELD( logits_all )
REFL_FIELD( use_mmap)
REFL_FIELD( use_mlock )
REFL_FIELD( numa )
REFL_FIELD( verbose_prompt )
REFL_FIELD( infill )
REFL_FIELD(mmproj )
REFL_FIELD( image)
REFL_END
REFL_TYPE(llama_sampling_params)
REFL_END
REFL_TYPE(llm_arch)
REFL_END
REFL_TYPE(llama_sampling_context )
REFL_FIELD( params)
REFL_FIELD( mirostat_mu)
REFL_FIELD( grammar)
REFL_FIELD( parsed_grammar)
//REFL_FIELD( prev) // TODO fixme has null data
//REFL_FIELD( cur)
REFL_END
REFL_TYPE(llama_token_data )
REFL_END
REFL_TYPE(llama_token_data_array )
REFL_END
REFL_TYPE(llama_batch )
REFL_END
REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END
REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END
REFL_TYPE(gguf_init_params)
REFL_END
REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END
REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD( no_alloc)
REFL_FIELD( no_alloc_save)
REFL_FIELD( n_objects)
REFL_FIELD( objects_begin)
REFL_FIELD( objects_end)
REFL_FIELD( scratch)
REFL_FIELD( scratch_save)
REFL_END
REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END
REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END
REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END
REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END
REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END
REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END
REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END
REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END
REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END
REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END
REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END
REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END
REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END
REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END
REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END
REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END
REFL_TYPE(llama_grammar_element)
REFL_END
REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END
REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END
REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END
REFL_TYPE(ggml_backend)
REFL_END
REFL_TYPE(ggml_backend_buffer)
REFL_END
REFL_TYPE(ggml_allocr)
REFL_END
REFL_TYPE(ggml_tallocr)
REFL_END
REFL_TYPE(ggml_gallocr)
REFL_END
REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END
REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END
REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END
REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END
REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END
REFL_TYPE(e_model)
REFL_END
REFL_TYPE(llama_ftype)
REFL_END
REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype )
REFL_FIELD( name )
REFL_FIELD( hparams )
REFL_FIELD( vocab)
REFL_FIELD( tok_embd)
REFL_FIELD( pos_embd)
REFL_FIELD( tok_norm)
REFL_FIELD( tok_norm_b)
REFL_FIELD( output_norm)
REFL_FIELD( output_norm_b)
REFL_FIELD( output)
REFL_FIELD( layers)
REFL_FIELD( n_gpu_layers)
REFL_FIELD( gguf_kv) //unordered map
REFL_FIELD( ctx)
REFL_FIELD( buf)
REFL_FIELD( mapping) //std::unique_ptr
REFL_FIELD( mlock_buf)
REFL_FIELD( mlock_mmap)
REFL_FIELD( tensors_by_name)
REFL_FIELD( t_load_us)
REFL_FIELD( t_start_us)
REFL_END
REFL_TYPE(llama_vocab)
REFL_END
REFL_TYPE(grammar_parser::parse_state)
REFL_END
REFL_TYPE(llama_context)
REFL_FIELD( cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) //random numbers
REFL_FIELD(has_evaluated_once )
REFL_FIELD( t_start_us)
REFL_FIELD( t_load_us)
REFL_FIELD( t_sample_us )
REFL_FIELD( t_p_eval_us )
REFL_FIELD( t_eval_us)
REFL_FIELD( n_sample )
REFL_FIELD( n_p_eval )
REFL_FIELD( n_eval )
//REFL_FIELD( logits) crash
REFL_FIELD( logits_all )
REFL_FIELD( embedding)
//REFL_FIELD( work_buffer)
REFL_FIELD( buf_compute)
REFL_FIELD( buf_alloc)
REFL_FIELD( alloc )
#ifdef GGML_USE_METAL
REFL_FIELD( ctx_metal )
#endif
#ifdef GGML_USE_MPI
REFL_FIELD( ctx_mpi )
#endif
REFL_END
REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END
REFL_TYPE(llm_build_context)
// REFL_FIELD(model) cannot create pointer to reference member llm_build_context::model
// REFL_FIELD(hparams) cannot create pointer to reference member llm_build_context::hparams
REFL_END
REFL_TYPE(llm_offload_trie)
REFL_END
REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END
REFL_TYPE(llm_bigram_spm)
REFL_END
REFL_TYPE(llm_tokenizer_spm)
REFL_END
REFL_TYPE(llm_bigram_bpe)
REFL_END
REFL_TYPE(llm_tokenizer_bpe)
REFL_END
REFL_TYPE(fragment_buffer_variant)
REFL_END
REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END
REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END
REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END
REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END
REFL_TYPE(llama_logit_info)
// REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END
REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD( n_attention_wv )
REFL_FIELD( n_feed_forward_w2 )
REFL_FIELD( i_attention_wv )
REFL_FIELD( i_feed_forward_w2 )
REFL_FIELD( n_k_quantized )
REFL_FIELD( n_fallback )
REFL_END
REFL_TYPE(llama_data_context)
REFL_END
REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END
REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END
template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
return t.value_type;
}
namespace runtime2
{
using namespace refl;
using namespace refl::descriptor;
template <typename CharT, typename T>
void debug(std::basic_ostream<CharT>& os, const T& value, bool compact = false);
namespace detail
{
template <typename CharT, typename T, typename = decltype(std::declval<std::basic_ostream<CharT>&>() << std::declval<T>())>
std::true_type is_ostream_printable_test(int);
template <typename CharT, typename T>
std::false_type is_ostream_printable_test(...);
template <typename CharT, typename T>
constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test<CharT, T>(0))::value };
namespace
{
[[maybe_unused]] int next_depth(int depth)
{
return depth == -1 || depth > 8
? -1
: depth + 1;
}
}
template <typename CharT>
void indent(std::basic_ostream<CharT>& os, int depth)
{
for (int i = 0; i < depth; i++) {
os << " ";
}
}
template <typename CharT, typename T>
void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth);
template <typename CharT, typename T>
void debug_detailed(std::basic_ostream<CharT>& os, const T& value, int depth)
{
using type_descriptor = type_descriptor<T>;
bool compact = depth == -1;
// print type with members enclosed in braces
os << type_descriptor::name << " { ";
if (!compact) os << '\n';
constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); });
for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) {
int new_depth = next_depth(depth);
indent(os, new_depth);
os << get_display_name(member) << " = ";
if constexpr (util::contains_instance<attr::debug>(member.attributes)) {
// use the debug attribute to print
auto debug_attr = util::get_instance<attr::debug>(member.attributes);
debug_attr.write(os, value);
}
else {
debug_impl(os, member(value), new_depth);
}
if (!compact || index + 1 != readable_members.size) {
os << ", ";
}
if (!compact) {
indent(os, depth);
os << '\n';
}
});
if (compact) os << ' ';
indent(os, depth);
os << '}';
}
template <typename CharT, typename T>
void debug_reflectable(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
{
using type_descriptor = type_descriptor<T>;
if constexpr (trait::contains_instance_v<attr::debug, typename type_descriptor::attribute_types>) {
// use the debug attribute to print
auto debug_attr = util::get_instance<attr::debug>(type_descriptor::attributes);
debug_attr.write(os, value);
}
else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
// type supports printing natively, just use that
os << value;
}
else {
debug_detailed(os, value, depth);
}
}
template <typename CharT, typename T>
void debug_container(std::basic_ostream<CharT>& os, const T& value, int depth)
{
bool compact = depth == -1;
os << "[";
auto end = value.end();
for (auto it = value.begin(); it != end; ++it)
{
if (!compact) os << '\n';
int new_depth = next_depth(depth);
indent(os, new_depth);
debug_impl(os, *it, new_depth);
if (std::next(it, 1) != end) {
os << ", ";
}
else if (!compact) {
os << '\n';
}
}
indent(os, depth);
os << "]";
}
template <typename CharT, typename T>
void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
{
using no_pointer_t = std::remove_pointer_t<T>;
if constexpr (std::is_same_v<bool, T>) {
os << (value ? "true" : "false");
}
else if constexpr (std::is_pointer_v<T> && !std::is_void_v<no_pointer_t> && trait::is_reflectable_v<no_pointer_t>) {
if (value == nullptr) {
os << "nullptr";
}
else {
os << '&';
debug_impl(os, *value, -1);
}
}
else if constexpr (trait::is_reflectable_v<T>) {
debug_reflectable(os, value, depth);
}
else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
os << value;
}
else if constexpr (trait::is_container_v<T>) {
debug_container(os, value, depth);
}
else {
os << "(not printable)";
}
}
}
/**
* Writes the debug representation of value to the given std::ostream.
* Calls the function specified by the debug<F> attribute whenever possible,
* before falling back to recursively iterating the members and printing them.
* Takes an optional argument specifying whether to print a compact representation.
* The compact representation contains no newlines.
*/
template <typename CharT, typename T>
void debug(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] bool compact)
{
static_assert(trait::is_reflectable_v<T> || trait::is_container_v<T> || detail::is_ostream_printable_v<CharT, T>,
"Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!");
detail::debug_impl(os, value, compact ? -1 : 0);
}
/**
* Writes the compact debug representation of the provided values to the given std::ostream.
*/
template <typename CharT, typename... Ts>
void debug_all(std::basic_ostream<CharT>& os, const Ts&... values)
{
refl::runtime::debug(os, std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
}
/**
* Writes the debug representation of the provided value to an std::string and returns it.
* Takes an optional argument specifying whether to print a compact representation.
* The compact representation contains no newlines.
*/
template <typename CharT = char, typename T>
std::basic_string<CharT> debug_str(const T& value, bool compact = false)
{
std::basic_stringstream<CharT> ss;
debug(ss, value, compact);
return ss.str();
}
/**
* Writes the compact debug representation of the provided values to an std::string and returns it.
*/
template <typename CharT = char, typename... Ts>
std::basic_string<CharT> debug_all_str(const Ts&... values)
{
return refl::runtime::debug_str(std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
}
}
// A generic function to print out the fields of any object
template<typename T>
void print_fields(const T& t) {
runtime2::debug(std::cout, t);
constexpr auto type = refl::reflect<T>();
constexpr auto membertype = refl::member_list<T>();
constexpr auto members = get_members(type);
std::cout << "DEBUG Type: " << type.name.c_str() << "\n";
std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n";
std::cout << "DEBUG Type3: " << typeid(members).name() << "\n";
refl::util::for_each(members, [&](auto member) {
std::cout << "Auto:" << member.name << "\n";
});
std::cout << "\n";
}
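// Example usage (an illustrative sketch): a probe like the ones this commit
// adds to main and the server could look like this; llama_context_params is
// registered with REFL_TYPE above.
inline void print_default_context_params() {
    llama_context_params cparams = llama_context_default_params();
    print_fields(cparams); // prints the type name plus each registered field
}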

tests/CMakeLists.txt
View file

@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
llama_build_and_test_executable(test-rope.cpp)
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
add_executable(${TEST_TARGET} test-c.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE llama)