commit 3faef69427 (parent 04814e718e)

    still not working
    ready to rebase working

18 changed files with 2818 additions and 520 deletions
.gitignore (vendored): 3 changes
@@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
+/#llama.cpp#
+#*
+\\#*
CMakeLists.txt:

@@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 # Compile flags
 #

-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
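Note on the standard bump, not part of the diff itself: the C sources being ported rely on C99 designated initializers, and C++ only accepts the `.field = value` form (in declaration order) starting with C++20, which is likely why CMAKE_CXX_STANDARD moves from 11 to 20 here. A minimal sketch with illustrative names, compiled with -std=c++20:

    // Accepted by C99 and C++20; rejected by C++11/14/17.
    struct traits_demo {
        const char * type_name;
        int          blck_size;
    };

    static traits_demo demo = { .type_name = "i8", .blck_size = 1 };

    int main() { return demo.blck_size; }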
@@ -230,7 +230,12 @@ if (LLAMA_BLAS)

         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
-        add_compile_definitions(GGML_USE_OPENBLAS)
+        # from https://github.com/NVIDIA/cutlass
+        make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
+        set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
+
+        # add_compile_definitions(GGML_USE_OPENBLAS)
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
@@ -312,7 +317,7 @@ if (LLAMA_MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
         if (NOT MSVC)
@@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "")
     set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()

+#
+set(cuda_flags --verbose -G ${cuda_flags})
+
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")

 if (WIN32)
@@ -485,6 +493,8 @@ if (NOT MSVC)
        add_link_options(-static-libgcc -static-libstdc++)
    endif()
 endif()
+add_link_options("-Wl,-Map=${TARGET}.map")
+
 if (LLAMA_GPROF)
     add_compile_options(-pg)
 endif()
@@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM)
 endif()

 add_library(ggml OBJECT
-            ggml.c
+            ggml.cpp
             ggml.h
-            ggml-alloc.c
+            print.hpp
+            ggml-internal.hpp
+            llama-internal.hpp
+            ggml-alloc.cpp
             ggml-alloc.h
-            ggml-backend.c
+            ggml-backend.cpp
             ggml-backend.h
-            ggml-quants.c
+            ggml-quants.cpp
             ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@@ -683,7 +696,7 @@ add_library(llama
             )

 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features(llama PUBLIC cxx_std_20) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
Makefile: 24 changes
@@ -116,7 +116,7 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS   = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 endif # LLAMA_METAL

 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
@@ -537,17 +537,17 @@ $(info )
 # Build library
 #

-ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml.o: ggml.cpp ggml.h ggml-cuda.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-c.o: tests/test-c.c llama.h
-	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+tests/test-c.o: tests/test-c.cpp llama.h
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
binding.py: new file, 334 lines
@@ -0,0 +1,334 @@
import os
import json
import re
import clang.cindex

# configurable part

CLANG_VERSION = '13.0.1'
# homebrew installs for llvm (brew info llvm gives details):
# x64: /usr/local/opt/llvm/lib
# arm64: /opt/homebrew/opt/llvm/lib
llvmLibPath = "/usr/lib/llvm-15/lib/"

cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"

fileList = [
    "ggml.cpp",
    "llama.cpp"
]

typeList = [
]

# end of configurable part

clang.cindex.Config.set_library_path(llvmLibPath)


def list_headers_in_dir(path):
    # enumerates a folder but keeps the full pathing for the files returned
    # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)

    # list all the files in the folder
    files = os.listdir(path)
    # only include .hxx files
    files = list(filter(lambda x: x.endswith('.hxx'), files))
    # add the folder path back on
    files = list(map(lambda x: path + x, files))
    return files


# parse through the list of files specified and expand wildcards
fullFileList = []
for filePath in fileList:
    if "*" in filePath:
        # wildcard path
        basePath = filePath[:-1]
        if "*" in basePath:
            # if there is still a wildcard, we have an issue...
            raise NotImplementedError(
                "wildcard only supported at end of file path")
        files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
        fullFileList = fullFileList + files
    else:
        # normal path
        ff = os.path.join(cxxClientRoot, filePath)
        fullFileList.append(ff)
        print("DBUG", ff)

# exclude _json.hxx files
fullFileList = list(
    filter(lambda x: not x.endswith('_json.hxx'), fullFileList))
# exclude _fmt.hxx files
fullFileList = list(
    filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))


# generate a list of regexps from the type list (for handling wildcards)
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))


def is_included_type(name, with_durability=False):

    # TODO(brett19): This should be generalized somehow...
    if "is_compound_operation" in name:
        return False

    if "replica_context" in name:
        return False

    if with_durability is True and '_with_legacy_durability' not in name:
        return False

    for x in typeListRe:
        if re.fullmatch(x, name):
            return True
    return False


opTypes = []
opEnums = []


def parse_type(type):
    typeStr = type.get_canonical().spelling
    return parse_type_str(typeStr)


std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]


def parse_type_str(typeStr):
    if typeStr == "std::mutex":
        return {"name": "std::mutex"}
    if typeStr == "std::string":
        return {"name": "std::string"}
    if typeStr == "std::chrono::duration<long long>":
        return {"name": "std::chrono::seconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
        return {"name": "std::chrono::milliseconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
        return {"name": "std::chrono::microseconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
        return {"name": "std::chrono::nanoseconds"}
    if typeStr == "std::error_code":
        return {"name": "std::error_code"}
    if typeStr == "std::monostate":
        return {"name": "std::monostate"}
    if typeStr == "std::byte":
        return {"name": "std::byte"}
    if typeStr == "unsigned long":
        return {"name": "std::size_t"}
    if typeStr == "char":
        return {"name": "std::int8_t"}
    if typeStr == "unsigned char":
        return {"name": "std::uint8_t"}
    if typeStr == "short":
        return {"name": "std::int16_t"}
    if typeStr == "unsigned short":
        return {"name": "std::uint16_t"}
    if typeStr == "int":
        return {"name": "std::int32_t"}
    if typeStr == "unsigned int":
        return {"name": "std::uint32_t"}
    if typeStr == "long long":
        return {"name": "std::int64_t"}
    if typeStr == "unsigned long long":
        return {"name": "std::uint64_t"}
    if typeStr == "bool":
        return {"name": "std::bool"}
    if typeStr == "float":
        return {"name": "std::float"}
    if typeStr == "double":
        return {"name": "std::double"}
    if typeStr == "std::nullptr_t":
        return {"name": "std::nullptr_t"}
    if typeStr in std_comparators:
        return {"name": typeStr}

    tplParts = typeStr.split("<", 1)
    if len(tplParts) > 1:
        tplClassName = tplParts[0]
        tplParams = tplParts[1][:-1]
        if tplClassName == "std::function":
            return {
                "name": "std::function"
            }
        if tplClassName == "std::optional":
            return {
                "name": "std::optional",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::vector":
            return {
                "name": "std::vector",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::set":
            return {
                "name": "std::set",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::variant":
            variantParts = tplParams.split(", ")
            variantTypes = []
            for variantPart in variantParts:
                variantTypes.append(parse_type_str(variantPart))
            return {
                "name": "std::variant",
                "of": variantTypes
            }
        if tplClassName == "std::array":
            variantParts = tplParams.split(", ")
            if len(variantParts) != 2:
                print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
                return {"name": "unknown", "str": typeStr}
            return {
                "name": "std::array",
                "of": parse_type_str(variantParts[0]),
                "size": int(variantParts[1])
            }
        if tplClassName == "std::map":
            variantParts = tplParams.split(", ")
            if len(variantParts) < 2 or len(variantParts) > 3:
                print("FAILED TO PARSE MAP TYPES: " + typeStr)
                return {"name": "unknown", "str": typeStr}

            if len(variantParts) == 2:
                return {
                    "name": "std::map",
                    "of": parse_type_str(variantParts[0]),
                    "to": parse_type_str(variantParts[1])
                }
            else:
                return {
                    "name": "std::map",
                    "of": parse_type_str(variantParts[0]),
                    "to": parse_type_str(variantParts[1]),
                    "comparator": parse_type_str(variantParts[2])
                }

        if tplClassName == "std::shared_ptr":
            return {
                "name": "std::shared_ptr",
                "of": parse_type_str(tplParams)
            }

    #return {"name": "unknown", "str": typeStr}

    if 'unnamed struct' in typeStr:
        print("WARNING: Found unnamed struct: " + typeStr)

    return {"name": typeStr}


internal_structs = []
UNNAMED_STRUCT_DELIM = '::(unnamed struct'


def traverse(node, namespace, main_file):
    # only scan the elements of the file we parsed
    #print("FILE", node.location.file )

    if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
        fullStructName = "::".join([*namespace, node.displayname])
        print("REFL_TYPE(" + fullStructName + ")")

        structFields = []
        for child in node.get_children():
            if child.kind == clang.cindex.CursorKind.FIELD_DECL:
                struct_type = parse_type(child.type)
                type_str = child.type.get_canonical().spelling
                print(" REFL_FIELD(" + child.displayname + ")")
                if 'unnamed' in type_str:
                    name_tokens = type_str.split('::')
                    name_override = '::'.join(name_tokens[:-1] + [child.displayname])
                    struct_type['name'] = name_override
                    internal_structs.append(name_override)

                structFields.append({
                    "name": child.displayname,
                    "type": struct_type,
                })
        # replica read changes introduced duplicate get requests
        if any(map(lambda op: op['name'] == fullStructName, opTypes)):
            return

        opTypes.append({
            "name": fullStructName,
            "fields": structFields,
        })
        print("REFL_END")

    if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
        fullStructName = "::".join([*namespace, node.displayname])
        if is_included_type(fullStructName, with_durability=True):
            type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
            if type_ref:
                base_request_name = type_ref.displayname.replace('struct', '').strip()
                base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
                if base_request:
                    new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
                    new_fields.extend([
                        {"name": "persist_to", "type": {"name": "couchbase::persist_to"}},
                        {"name": "replicate_to", "type": {"name": "couchbase::replicate_to"}}
                    ])

                    opTypes.append({
                        "name": fullStructName,
                        "fields": new_fields
                    })
    if node.kind == clang.cindex.CursorKind.ENUM_DECL:
        fullEnumName = "::".join([*namespace, node.displayname])
        if is_included_type(fullEnumName):
            enumValues = []

            for child in node.get_children():
                if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
                    enumValues.append({
                        "name": child.displayname,
                        "value": child.enum_value,
                    })
            opEnums.append({
                "name": fullEnumName,
                "type": parse_type(node.enum_type),
                "values": enumValues,
            })

    if node.kind == clang.cindex.CursorKind.NAMESPACE:
        namespace = [*namespace, node.displayname]
    if node.kind == clang.cindex.CursorKind.CLASS_DECL:
        namespace = [*namespace, node.displayname]
    if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
        namespace = [*namespace, node.displayname]

    for child in node.get_children():
        traverse(child, namespace, main_file)


for headerPath in fullFileList:
    print("processing " + headerPath)
    index = clang.cindex.Index.create()
    args = [
        '-std=c++17',
    ]

    try:
        translation_unit = index.parse(headerPath, args=args)
    except Exception as e:
        print(e)
        import pdb
        pdb.set_trace()
        raise e

    # output clang compiler diagnostics information (for debugging)
    for diagnostic in translation_unit.diagnostics:
        diagnosticMsg = diagnostic.format()
        print(diagnostic)

    traverse(translation_unit.cursor, [], headerPath)

jsonData = json.dumps({
    'op_structs': opTypes,
    'op_enums': opEnums
})

f = open("bindings.json", "w")
f.write(jsonData)
f.close()
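For orientation (an editor's sketch, not part of the commit): binding.py walks the clang AST of ggml.cpp and llama.cpp, prints refl-cpp style registration macros for each struct it visits, and writes the collected struct and enum metadata to bindings.json. The printed text can be consumed by macros along these lines; the macro bodies below are illustrative stand-ins, not refl-cpp's real definitions:

    #include <cstdio>

    // Stand-in macros that turn generated REFL_* lines into a function
    // printing the struct layout (illustrative only):
    #define REFL_TYPE(T)  void describe_##T() { std::puts(#T);
    #define REFL_FIELD(F)     std::puts("  field: " #F);
    #define REFL_END      }

    struct my_params { int n_vocab; int n_embd; };

    REFL_TYPE(my_params)
    REFL_FIELD(n_vocab)
    REFL_FIELD(n_embd)
    REFL_END

    int main() { describe_my_params(); }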
ggml-alloc.cpp:

@@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {

 void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
     free(galloc->parse_seq);
-    galloc->parse_seq = malloc(sizeof(int) * n);
+    galloc->parse_seq = (int*)malloc(sizeof(int) * n);

     for (int i = 0; i < n; i++) {
         galloc->parse_seq[i] = list[i];
@@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
         if (galloc->hash_values != NULL) {
             free(galloc->hash_values);
         }
-        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+        galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size);
         galloc->hash_set.size = hash_size;
-        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
     }

     // reset hash table
@@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
     // alloc hash_values if needed
     if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
         free(galloc->hash_values);
-        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
         galloc->hash_values_size = hash_size;
     }
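Background on the casts in these hunks: C implicitly converts `void *` (the return type of `malloc`) to any object pointer type, but C++ requires an explicit conversion, so every allocation site stops compiling once the file is built as C++. A minimal sketch with a demo type:

    #include <cstdlib>

    struct hash_node_demo { int n_children; int n_views; };

    int main() {
        // In C, malloc needs no cast; compiled as C++ the explicit cast
        // below is mandatory, matching the hunks above.
        hash_node_demo * values =
            (hash_node_demo *) std::malloc(16 * sizeof(hash_node_demo));
        std::free(values);
        return 0;
    }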
ggml-backend.cpp:

@@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         struct ggml_backend_buffer_i iface,
         ggml_backend_buffer_context_t context,
         size_t size) {
-    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+    ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer));

     GGML_ASSERT(iface.get_base != NULL);
@@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     // TODO: allow backends to support copy to/from same backend

     if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
-        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
+        ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst);
     } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
-        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
+        ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst);
     } else {
         // shouldn't be hit when copying from/to CPU
 #ifndef NDEBUG
@@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu {
 static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

-    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+    struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu));

     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
     cpu_plan->cgraph = *cgraph;

     if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size);
     }

     return cpu_plan;
@@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
         cpu_ctx->work_size = cplan.work_size;
     }

-    cplan.work_data = cpu_ctx->work_data;
+    cplan.work_data = (uint8_t*)cpu_ctx->work_data;

     ggml_graph_compute(cgraph, &cplan);
 }
@@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = {
 };

 ggml_backend_t ggml_backend_cpu_init(void) {
-    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context));

     ctx->n_threads = GGML_DEFAULT_N_THREADS;
     ctx->work_data = NULL;
     ctx->work_size = 0;

-    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));

     *cpu_backend = (struct ggml_backend) {
         /* .interface = */ cpu_backend_i,
@@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
     GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);

-    struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
+    struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched));
     memset(sched, 0, sizeof(struct ggml_backend_sched));

     fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
@@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
     // initialize hash tables
     size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
     sched->hash_set.size = hash_size;
-    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
-    sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
-    sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+    sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
+    sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size);
+    sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size);

     sched_split_graph(sched, measure_graph);
     sched_alloc_splits(sched);
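The `(ggml_tensor *(*)[4])` cast above is a pointer to an array of four tensor pointers, i.e. one fixed-size group of copies per graph node (the 4 presumably matching GGML_MAX_BACKENDS). A small sketch of how such a pointer is allocated and indexed, using a demo type:

    #include <cstdlib>

    struct tensor_demo { int id; };

    int main() {
        // 8 nodes, each with space for 4 tensor-pointer copies.
        tensor_demo * (*copies)[4] =
            (tensor_demo * (*)[4]) std::malloc(8 * sizeof(*copies));
        copies[0][0] = nullptr;  // node 0, copy slot 0
        std::free(copies);
        return 0;
    }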
@@ -22,7 +22,7 @@ extern "C" {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #else
-#define static_assert(cond, msg) struct global_scope_noop_trick
+//#define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 #endif
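Context for commenting out the fallback: `static_assert` has been a keyword since C++11, so defining a macro with that name breaks C++ builds; the `_Static_assert` mapping is only meaningful when the header is compiled as C. A sketch of a guard that keeps both languages working (illustrative, not the file's actual code):

    // Only define the shim when compiling as C without the keyword.
    #if !defined(__cplusplus) && !defined(static_assert)
    #define static_assert(cond, msg) _Static_assert(cond, msg)
    #endif

    static_assert(sizeof(int) >= 2, "int is at least 2 bytes");

    int main() { return 0; }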
ggml-internal.hpp: new file, 258 lines
@@ -0,0 +1,258 @@
struct ggml_context {
    size_t mem_size;
    void * mem_buffer;
    bool   mem_buffer_owned;
    bool   no_alloc;
    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers

    int    n_objects;

    struct ggml_object * objects_begin;
    struct ggml_object * objects_end;

    struct ggml_scratch scratch;
    struct ggml_scratch scratch_save;

    ggml_context():
        mem_size(0),
        mem_buffer(0),
        mem_buffer_owned(0),
        no_alloc(0),
        no_alloc_save(0),
        n_objects(0),
        objects_begin(0),
        objects_end(0),
        scratch(),
        scratch_save()
    {
    }
};

struct ggml_context_container {
    bool used;

    struct ggml_context context;

    ggml_context_container(): used(0), context() {
    }
};

typedef double ggml_float;
typedef void * thread_ret_t;

#define MAX_FREE_BLOCKS 256

struct free_block {
    void * addr;
    size_t size;
};

struct ggml_tallocr {
    struct ggml_backend_buffer * buffer;
    bool buffer_owned;
    void * base;
    size_t alignment;

    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];

    size_t max_size;

    bool measure;

#ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];
#endif
};

struct hash_node {
    int n_children;
    int n_views;
};

typedef struct ggml_tallocr * ggml_tallocr_t;
typedef struct ggml_gallocr * ggml_gallocr_t;

struct ggml_gallocr {
    ggml_tallocr_t talloc;
    struct ggml_hash_set hash_set;
    struct hash_node * hash_values;
    size_t hash_values_size;
    ggml_tallocr_t * hash_allocs;
    int * parse_seq;
    int parse_seq_len;
};

struct ggml_allocr {
    ggml_tallocr_t talloc;
    ggml_gallocr_t galloc;
};

#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;
};

struct ggml_numa_nodes {
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
    uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
};

struct ggml_state {
    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
    struct ggml_numa_nodes numa;

    ggml_state(): contexts(), numa()
    {
    }
};

struct gguf_str {
    uint64_t n;  // GGUFv2
    char * data;
};

struct ggml_map_custom1_op_params {
    ggml_custom1_op_t fun;
    int n_tasks;
    void * userdata;
};

struct ggml_map_custom2_op_params {
    ggml_custom2_op_t fun;
    int n_tasks;
    void * userdata;
};

struct ggml_map_custom3_op_params {
    ggml_custom3_op_t fun;
    int n_tasks;
    void * userdata;
};

struct hash_map {
    struct ggml_hash_set set;
    struct ggml_tensor ** vals;
};

#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include <atomic>
using namespace std;
#endif

struct ggml_compute_state_shared {
    const struct ggml_cgraph * cgraph;
    const struct ggml_cplan * cplan;

    int64_t perf_node_start_cycles;
    int64_t perf_node_start_time_us;

    const int n_threads;

    // synchronization primitives
    atomic_int n_active; // num active threads
    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
};

typedef pthread_t ggml_thread_t;

struct ggml_compute_state {
    ggml_thread_t thrd;
    int ith;
    struct ggml_compute_state_shared * shared;
};

union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
    uint16_t uint16;
    int16_t  int16;
    uint32_t uint32;
    int32_t  int32;
    float    float32;
    uint64_t uint64;
    int64_t  int64;
    double   float64;
    bool     bool_;

    struct gguf_str str;

    struct gguf_array_T {
        enum gguf_type type;

        uint64_t n;  // GGUFv2
        void * data;
    } arr;
};

struct ggml_lbfgs_iteration_data {
    float alpha;
    float ys;
    float * s;
    float * y;
};

struct gguf_kv {
    struct gguf_str key;

    enum  gguf_type  type;
    union gguf_value value;
};

struct gguf_header {
    char magic[4];
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};

struct gguf_tensor_info {
    struct gguf_str name;

    uint32_t n_dims;
    uint64_t ne[GGML_MAX_DIMS];

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

struct gguf_context {
    struct gguf_header header;

    struct gguf_kv * kv;
    struct gguf_tensor_info * infos;

    size_t alignment;
    size_t offset; // offset of `data` from beginning of file
    size_t size;   // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};

struct gguf_buf {
    void * data;
    size_t size;
    size_t offset;
};

#include "ggml-backend-impl.h"
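A note on the constructors in this header: the C code these structs came from typically zeroed them through `calloc`/`memset`, and the added all-zero constructors make that guarantee explicit for C++ callers. Default member initializers are an equivalent, more compact form; a sketch with a demo type:

    #include <cstddef>

    struct scratch_demo {
        size_t offs = 0;        // zeroed on every construction
        size_t size = 0;
        void * data = nullptr;
    };

    int main() {
        scratch_demo s;  // members are zeroed without a hand-written ctor
        return static_cast<int>(s.offs);
    }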
(One file's diff is suppressed because it is too large.)
ggml-quants.h:

@@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_


 // Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
+void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
+void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
+void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
+void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
+void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);

-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
+void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
+void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
+void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
+void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
+void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);

-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);

-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);

 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);

-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);

 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);

-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
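`restrict` is a C99 keyword that C++ never adopted; GCC, Clang, and MSVC all accept `__restrict__`/`__restrict` extensions in both languages, which is what this hunk switches to (the hunks below use a `GGML_RESTRICT` macro instead). A portable sketch of such a macro, assuming GCC/Clang/MSVC:

    #ifdef _MSC_VER
    #define GGML_RESTRICT __restrict
    #else
    #define GGML_RESTRICT __restrict__
    #endif

    // The qualifier promises the optimizer that s, x and y never alias:
    static void vec_add(int n, float * GGML_RESTRICT s,
                        const float * GGML_RESTRICT x,
                        const float * GGML_RESTRICT y) {
        for (int i = 0; i < n; ++i) s[i] = x[i] + y[i];
    }

    int main() {
        float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, out[4];
        vec_add(4, out, a, b);
        return static_cast<int>(out[0]);
    }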
ggml.cpp:

@@ -38,6 +38,14 @@
 #pragma warning(disable: 4996)
 #endif

+// initializers for static data called in the ggml_init function
+static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
+static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
+
+void type_traits_init();
+void GGUF_TYPE_SIZE_init();
+void GGUF_TYPE_NAME_init();
+
 #if defined(_WIN32)

 #include <windows.h>
@@ -86,7 +94,9 @@ static int sched_yield (void) {
 }
 #else
 #include <pthread.h>
-#include <stdatomic.h>
+//#include <stdatomic.h>
+#include <atomic>
+using namespace std;

 typedef void * thread_ret_t;
@@ -96,6 +106,8 @@ typedef void * thread_ret_t;

 #endif

+#include <atomic>
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
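`<stdatomic.h>` is a C11 header that C++ translation units generally cannot use; the replacement pulls in `<atomic>` with `using namespace std;` so the existing `atomic_int` spellings keep compiling. A narrower alternative (a sketch, not what the diff does) is to alias just the names needed:

    #include <atomic>

    using atomic_int  = std::atomic<int>;
    using atomic_bool = std::atomic<bool>;

    int main() {
        atomic_int n_active{0};
        n_active.fetch_add(1, std::memory_order_relaxed);
        return n_active.load();
    }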
@@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {

 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
+static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);

-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
+static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
+void type_traits_init(){
+    type_traits[GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,
         .type_size = sizeof(int8_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I16] = {
+    };
+    type_traits[GGML_TYPE_I16] = {
         .type_name = "i16",
         .blck_size = 1,
         .type_size = sizeof(int16_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I32] = {
+    };
+    type_traits[GGML_TYPE_I32] = {
         .type_name = "i32",
         .blck_size = 1,
         .type_size = sizeof(int32_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_F32] = {
+    };
+    type_traits[GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
         .type_size = sizeof(float),
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
+    };
+    type_traits[GGML_TYPE_F16] = {
         .type_name = "f16",
         .blck_size = 1,
         .type_size = sizeof(ggml_fp16_t),
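Why the table becomes a function: the original file-scope initializer relied on C99 array designators (`[GGML_TYPE_I8] = { ... }`), which no C++ standard supports, so the entries are now assigned at runtime inside `type_traits_init()` (declared earlier and, per the comment in the @@ -38,6 hunk, called from the ggml_init path). A compilable sketch of the same pattern with demo types, using -std=c++20 for the field designators:

    struct traits_demo { const char * type_name; int blck_size; };

    enum demo_type { DEMO_I8 = 0, DEMO_I16 = 1, DEMO_COUNT = 2 };

    static traits_demo table[DEMO_COUNT];

    // Runtime replacement for C99 `[DEMO_I8] = {...}` designators.
    void table_init() {
        table[DEMO_I8]  = { .type_name = "i8",  .blck_size = 1 };
        table[DEMO_I16] = { .type_name = "i16", .blck_size = 1 };
    }

    int main() {
        table_init();
        return table[DEMO_I8].blck_size;
    }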
@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
.vec_dot_type = GGML_TYPE_F16,
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q4_0] = {
|
type_traits[GGML_TYPE_Q4_0] = {
|
||||||
.type_name = "q4_0",
|
.type_name = "q4_0",
|
||||||
.blck_size = QK4_0,
|
.blck_size = QK4_0,
|
||||||
.type_size = sizeof(block_q4_0),
|
.type_size = sizeof(block_q4_0),
|
||||||
|
@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q4_1] = {
|
type_traits[GGML_TYPE_Q4_1] = {
|
||||||
.type_name = "q4_1",
|
.type_name = "q4_1",
|
||||||
.blck_size = QK4_1,
|
.blck_size = QK4_1,
|
||||||
.type_size = sizeof(block_q4_1),
|
.type_size = sizeof(block_q4_1),
|
||||||
|
@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
},
|
};
|
||||||
[4] = { // GGML_TYPE_Q4_2
|
type_traits[4] = { // GGML_TYPE_Q4_2
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
.blck_size = 0,
|
.blck_size = 0,
|
||||||
.type_size = 0,
|
.type_size = 0,
|
||||||
|
@ -482,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
},
|
};
|
||||||
[5] = { // GGML_TYPE_Q4_3
|
type_traits[5] = { // GGML_TYPE_Q4_3
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
.blck_size = 0,
|
.blck_size = 0,
|
||||||
.type_size = 0,
|
.type_size = 0,
|
||||||
|
@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q5_0] = {
|
type_traits[GGML_TYPE_Q5_0] = {
|
||||||
.type_name = "q5_0",
|
.type_name = "q5_0",
|
||||||
.blck_size = QK5_0,
|
.blck_size = QK5_0,
|
||||||
.type_size = sizeof(block_q5_0),
|
.type_size = sizeof(block_q5_0),
|
||||||
|
@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q5_1] = {
|
type_traits[GGML_TYPE_Q5_1] = {
|
||||||
.type_name = "q5_1",
|
.type_name = "q5_1",
|
||||||
.blck_size = QK5_1,
|
.blck_size = QK5_1,
|
||||||
.type_size = sizeof(block_q5_1),
|
.type_size = sizeof(block_q5_1),
|
||||||
|
@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
         .vec_dot = ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
+    };
+    type_traits[GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
         .blck_size = QK8_0,
         .type_size = sizeof(block_q8_0),
@@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
         .vec_dot = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
+    };
+    type_traits[GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
         .blck_size = QK8_1,
         .type_size = sizeof(block_q8_1),
@@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q2_K] = {
+    };
+    type_traits[GGML_TYPE_Q2_K] = {
         .type_name = "q2_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q2_K),
@@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
         .vec_dot = ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q3_K] = {
+    };
+    type_traits[GGML_TYPE_Q3_K] = {
         .type_name = "q3_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q3_K),
@@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
         .vec_dot = ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q4_K] = {
+    };
+    type_traits[GGML_TYPE_Q4_K] = {
         .type_name = "q4_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q4_K),
@@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
         .vec_dot = ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q5_K] = {
+    };
+    type_traits[GGML_TYPE_Q5_K] = {
         .type_name = "q5_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q5_K),
@@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
         .vec_dot = ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q6_K] = {
+    };
+    type_traits[GGML_TYPE_Q6_K] = {
         .type_name = "q6_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q6_K),
@@ -590,15 +604,15 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q8_K] = {
+    };
+    type_traits[GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q8_K),
         .is_quantized = true,
         .from_float = quantize_row_q8_K,
-    }
-};
+    };
+}
 
 // For internal test use
 ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
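Note on the hunks above: this is the crux of the C-to-C++ port. C99 array designators (`[GGML_TYPE_Q8_0] = { ... }`) are not valid C++, so the formerly `static const` table is now a mutable array filled in by a `type_traits_init()` function at startup; C++20 still accepts the in-order member designators inside each entry. A minimal self-contained sketch of the pattern (stand-in types and values; the real table lives in ggml.cpp):

#include <cstddef>

enum ggml_type { GGML_TYPE_Q8_0, GGML_TYPE_COUNT };

struct ggml_type_traits_t {
    const char * type_name;
    int          blck_size;
};

// C99 would allow:
//   static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] =
//       { [GGML_TYPE_Q8_0] = { .type_name = "q8_0", .blck_size = 32 } };
// C++ has no array designators, so the table becomes mutable and is
// populated at runtime instead:
static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];

static void type_traits_init(void) {
    type_traits[GGML_TYPE_Q8_0] = {
        .type_name = "q8_0",
        .blck_size = 32, // QK8_0 in the real code
    };
}

The cost is that the table is no longer in read-only storage and is all-zero until the init function runs.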
@@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) {
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest
 
 // compute GGML_VEC_DOT_UNROLL dot products at once
 // xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
     ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
 
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
 
     for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
     }
 }
 
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 }
 
 // xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
 
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
 
     for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
         x[i] = (const float *) ((const char *) xv + i*xs);
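These hunks exist because `restrict` is a C99 keyword with no C++ equivalent; the GGML_RESTRICT macro carries the same no-aliasing promise across both languages so the hot vector loops stay vectorizable. One common portability shim looks like the following (a sketch, not exactly what this branch does: the ggml.h hunk further down keeps GGML_RESTRICT empty under C++ and switches the C branch to GCC's `__restrict__`):

// s, x and y are promised never to overlap, so the compiler may keep
// partial sums in registers and vectorize without reload barriers.
#ifdef __cplusplus
#define GGML_RESTRICT __restrict__   // GCC/Clang extension spelling
#else
#define GGML_RESTRICT restrict       // the C99 keyword
#endif

static void vec_dot(const int n, float * GGML_RESTRICT s,
                    const float * GGML_RESTRICT x,
                    const float * GGML_RESTRICT y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i]*y[i];
    *s = sum;
}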
@@ -2175,11 +2189,19 @@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
+    // initialize the data in the arrays
+    type_traits_init();
+    GGUF_TYPE_SIZE_init();
+    GGUF_TYPE_NAME_init();
+
+    struct ggml_context * ctx = NULL;
+    static bool is_first_call = true;
     // make this function thread safe
     ggml_critical_section_start();
 
-    static bool is_first_call = true;
-
     if (is_first_call) {
         // initialize time system (required on Windows)
@@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }
 
     // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
 
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
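Ordering note on the hunk above: as committed, the three `*_init()` calls run on every `ggml_init()` invocation and before `ggml_critical_section_start()`, so two threads entering `ggml_init()` concurrently could both be writing the tables (harmless only because the assignments are idempotent). A guarded variant would look like this (a sketch against the diff's own helpers, not the committed code):

struct ggml_context * ggml_init(struct ggml_init_params params) {
    // make this function thread safe
    ggml_critical_section_start();

    static bool tables_ready = false;
    if (!tables_ready) {
        type_traits_init();      // fills type_traits[]
        GGUF_TYPE_SIZE_init();   // fills GGUF_TYPE_SIZE[]
        GGUF_TYPE_NAME_init();   // fills GGUF_TYPE_NAME[]
        tables_ready = true;
    }
    // ... rest of ggml_init unchanged ...
}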
@@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     // align to GGML_MEM_ALIGN
     size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
             return NULL;
         }
 
-        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+        data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
 
         ctx->scratch.offs += data_size;
     } else {
@@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
 
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
 
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
 
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
 
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
 struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
 
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
 
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
 
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
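All of the `(char*)` casts above (and the many `(float*)` casts below) address one rule: C converts `void *` to any object pointer implicitly, C++ does not. A minimal demonstration:

#include <cstdlib>

int main() {
    void * mem = std::malloc(64);

    // char * p = mem;                    // OK in C, compile error in C++
    char * p1 = (char *) mem;             // the cast style this diff adds
    char * p2 = static_cast<char *>(mem); // the more idiomatic C++ spelling
    p1[0] = 0;
    (void)p2;
    std::free(mem);
    return 0;
}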
@@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ACC;
@@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl(
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SET;
@@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d(
     };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
-    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
+    int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
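The `(int32_t)` casts above fix a different C++ rule: a braced initializer list rejects narrowing conversions (here `size_t` down to `int32_t`), whereas C performs them silently. Sketch:

#include <cstddef>
#include <cstdint>

int main() {
    size_t nb1 = 64;

    // int32_t bad[] = { nb1 };          // C++: error, narrowing in a braced list
    int32_t ok[]  = { (int32_t)nb1 };    // explicit cast, as added in ggml_acc_impl/ggml_set_impl
    (void)ok;
    return 0;
}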
@@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (ggml_is_contiguous(dst)) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
     } else {
         for (int k3 = 0; k3 < ne3; k3++) {
             for (int k2 = 0; k2 < ne2; k2++) {
@@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
 
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat(
 
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
-            char * wdata = params->wdata;
+            char * wdata = (char*)params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 #endif
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
         return;
     }
 
@@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
         return;
     }
 
@@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d(
         struct ggml_tensor * dst) {
 
     const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
+    enum ggml_op_pool op = (ggml_op_pool)opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
     const int p0 = opts[3];
@@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d(
     }
 
     const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
+    enum ggml_op_pool op = (ggml_op_pool)opts[0];
     const int k0 = opts[1];
     const int k1 = opts[2];
     const int s0 = opts[3];
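C++ also forbids the implicit int-to-enum conversion that C allows, which is why the pool operator read back from `op_params` now needs a cast; the same pattern recurs below for `ggml_op` in ggml_graph_print and `ggml_opt_result` in the optimizer returns. Minimal demonstration:

#include <cstdint>

enum ggml_op_pool { GGML_OP_POOL_MAX, GGML_OP_POOL_AVG, GGML_OP_POOL_COUNT };

int main() {
    int32_t raw = 1; // as stored in dst->op_params

    // enum ggml_op_pool op = raw;            // OK in C, error in C++
    enum ggml_op_pool op = (ggml_op_pool)raw; // explicit cast required in C++
    return op == GGML_OP_POOL_AVG ? 0 : 1;
}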
@@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
     result.size = size;
-    result.keys = malloc(sizeof(struct ggml_tensor *) * size);
+    result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size);
     memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
     return result;
 }
@@ -14113,9 +14136,9 @@ struct hash_map {
 };
 
 static struct hash_map * ggml_new_hash_map(size_t size) {
-    struct hash_map * result = malloc(sizeof(struct hash_map));
+    struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map));
     result->set = ggml_hash_set_new(size);
-    result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
+    result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size);
     memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
     return result;
 }
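Two C++-isms in one hunk: `malloc` returns `void *`, so the result must be cast, and C++ lets the `struct` keyword be dropped, which is why the diff can write `(hash_map*)` rather than `(struct hash_map*)`. Sketch with a hypothetical stand-in type:

#include <cstdlib>

struct hash_map_demo { int n; };  // stand-in, not the ggml struct

int main() {
    // C:   struct hash_map_demo * m = malloc(sizeof(struct hash_map_demo));
    // C++: the void* must be cast, and `struct` may be omitted:
    hash_map_demo * m = (hash_map_demo *) std::malloc(sizeof(hash_map_demo));
    m->n = 0;
    std::free(m);
    return 0;
}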
@@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+    struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads);
 
     // create thread pool
     if (n_threads > 1) {
@@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
@@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam(
     const int n_accum = MAX(1, params.n_gradient_accumulation);
     const float accum_norm = 1.0f / (float) n_accum;
 
-    float * g  = opt->adam.g->data;  // gradients
-    float * m  = opt->adam.m->data;  // first moment
-    float * v  = opt->adam.v->data;  // second moment
+    float * g  = (float*)opt->adam.g->data;  // gradients
+    float * m  = (float*)opt->adam.m->data;  // first moment
+    float * v  = (float*)opt->adam.v->data;  // second moment
 
-    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
+    float * pf = params.past > 0 ? (float *)opt->adam.pf->data : NULL; // past function values
 
     struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
@@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking(
         } else {
             // Armijo condition is satisfied
             if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
-                return count;
+                return (ggml_opt_result)count;
             }
 
             ggml_vec_dot_f32(nx, &dg, g, d);
@@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking(
             } else {
                 if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
                     // regular Wolfe conditions
-                    return count;
+                    return (ggml_opt_result)count;
                 }
 
                 if(dg > -params->lbfgs.wolfe*dginit) {
                     width = dec;
                 } else {
                     // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
-                    return count;
+                    return (ggml_opt_result)count;
                 }
             }
         }
     }
@@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
-    float * x  = opt->lbfgs.x->data;  // current parameters
-    float * xp = opt->lbfgs.xp->data; // previous parameters
-    float * g  = opt->lbfgs.g->data;  // current gradient
-    float * gp = opt->lbfgs.gp->data; // previous gradient
-    float * d  = opt->lbfgs.d->data;  // search direction
+    float * x  = (float*)opt->lbfgs.x->data;  // current parameters
+    float * xp = (float*)opt->lbfgs.xp->data; // previous parameters
+    float * g  = (float*)opt->lbfgs.g->data;  // current gradient
+    float * gp = (float*)opt->lbfgs.gp->data; // previous gradient
+    float * d  = (float*)opt->lbfgs.d->data;  // search direction
 
-    float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
+    float * pf = params.past > 0 ? (float*)opt->lbfgs.pf->data : NULL; // past function values
 
     const int n_accum = MAX(1, params.n_gradient_accumulation);
     const float accum_norm = 1.0f / (float) n_accum;
@@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     ggml_opt_get_params(np, ps, x);
 
     // the L-BFGS memory
-    float * lm_alpha = opt->lbfgs.lmal->data;
-    float * lm_ys    = opt->lbfgs.lmys->data;
-    float * lm_s     = opt->lbfgs.lms->data;
-    float * lm_y     = opt->lbfgs.lmy->data;
+    float * lm_alpha = (float*)opt->lbfgs.lmal->data;
+    float * lm_ys    = (float*)opt->lbfgs.lmys->data;
+    float * lm_s     = (float*)opt->lbfgs.lms->data;
+    float * lm_y     = (float*)opt->lbfgs.lmy->data;
 
     bool cancel = false;
 
@@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             ggml_vec_cpy_f32(nx, x, xp);
             ggml_vec_cpy_f32(nx, g, gp);
 
-            return ls;
+            return (ggml_opt_result)ls;
        }
 
         opt->loss_after = fx;
@@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK4_0;
 
     for (int b = 0; b < n; b += k) {
-        block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
+        block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0;
 
         quantize_row_q4_0_reference(src + b, y, k);
 
@@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK4_1;
 
     for (int b = 0; b < n; b += k) {
-        block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
+        block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1;
 
         quantize_row_q4_1_reference(src + b, y, k);
 
@@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK5_0;
 
     for (int b = 0; b < n; b += k) {
-        block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
+        block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0;
 
         quantize_row_q5_0_reference(src + b, y, k);
 
@@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK5_1;
 
     for (int b = 0; b < n; b += k) {
-        block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
+        block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1;
 
         quantize_row_q5_1_reference(src + b, y, k);
 
@@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK8_0;
 
     for (int b = 0; b < n; b += k) {
-        block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
+        block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0;
 
         quantize_row_q8_0_reference(src + b, y, k);
 
@@ -17928,37 +17951,39 @@ struct gguf_str {
     char * data;
 };
 
-static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
-    [GGUF_TYPE_INT8]    = sizeof(int8_t),
-    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
-    [GGUF_TYPE_INT16]   = sizeof(int16_t),
-    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
-    [GGUF_TYPE_INT32]   = sizeof(int32_t),
-    [GGUF_TYPE_FLOAT32] = sizeof(float),
-    [GGUF_TYPE_BOOL]    = sizeof(bool),
-    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
-    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
-    [GGUF_TYPE_INT64]   = sizeof(int64_t),
-    [GGUF_TYPE_FLOAT64] = sizeof(double),
-    [GGUF_TYPE_ARRAY]   = 0, // undefined
-};
+void GGUF_TYPE_SIZE_init() {
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT8]   = sizeof(uint8_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT8]    = sizeof(int8_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT16]  = sizeof(uint16_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT16]   = sizeof(int16_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT32]  = sizeof(uint32_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT32]   = sizeof(int32_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float);
+    GGUF_TYPE_SIZE[GGUF_TYPE_BOOL]    = sizeof(bool);
+    GGUF_TYPE_SIZE[GGUF_TYPE_STRING]  = sizeof(struct gguf_str);
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT64]  = sizeof(uint64_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT64]   = sizeof(int64_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double);
+    GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY]   = 0; // undefined
+};
 static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 
-static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = "u8",
-    [GGUF_TYPE_INT8]    = "i8",
-    [GGUF_TYPE_UINT16]  = "u16",
-    [GGUF_TYPE_INT16]   = "i16",
-    [GGUF_TYPE_UINT32]  = "u32",
-    [GGUF_TYPE_INT32]   = "i32",
-    [GGUF_TYPE_FLOAT32] = "f32",
-    [GGUF_TYPE_BOOL]    = "bool",
-    [GGUF_TYPE_STRING]  = "str",
-    [GGUF_TYPE_ARRAY]   = "arr",
-    [GGUF_TYPE_UINT64]  = "u64",
-    [GGUF_TYPE_INT64]   = "i64",
-    [GGUF_TYPE_FLOAT64] = "f64",
-};
+void GGUF_TYPE_NAME_init(){
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT8]   = "u8";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT8]    = "i8";
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT16]  = "u16";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT16]   = "i16";
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT32]  = "u32";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT32]   = "i32";
+    GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32";
+    GGUF_TYPE_NAME[GGUF_TYPE_BOOL]    = "bool";
+    GGUF_TYPE_NAME[GGUF_TYPE_STRING]  = "str";
+    GGUF_TYPE_NAME[GGUF_TYPE_ARRAY]   = "arr";
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT64]  = "u64";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT64]   = "i64";
+    GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64";
+};
 static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 
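Same C99 array-designator issue as type_traits, with one extra consequence: the GGUF tables can no longer be `static const`, so they have to be declared mutable somewhere and are all-zero until `ggml_init()` runs. The declarations these two init functions assume would look like this (a sketch; the hunk does not show them):

static size_t       GGUF_TYPE_SIZE[GGUF_TYPE_COUNT];
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT];

The two `static_assert`s survive the conversion unchanged and still catch a count mismatch at compile time, but they can no longer catch a missing entry the way a fully initialized const table effectively did.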
@@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
 
     bool ok = true;
 
-    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
+    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1);
     ok = ok && gguf_fread_el(file, p->data, p->n, offset);
 
     return ok;
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
     // read the header
     {
@@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the kv pairs
     {
-        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
+        ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
 
         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
@@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // create the tensors
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             const int64_t ne[GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
+                (int64_t)ctx->infos[i].ne[0],
+                (int64_t)ctx->infos[i].ne[1],
+                (int64_t)ctx->infos[i].ne[2],
+                (int64_t)ctx->infos[i].ne[3],
             };
 
             struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
@@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
 
     const int n_kv = gguf_get_n_kv(ctx);
 
-    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
     ctx->kv[n_kv].key.n = strlen(key);
     ctx->kv[n_kv].key.data = strdup(key);
     ctx->header.n_kv++;
@@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -18760,7 +18785,7 @@ void gguf_add_tensor(
         struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
-    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+    ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
     ctx->infos[idx].name.n = strlen(tensor->name);
     ctx->infos[idx].name.data = strdup(tensor->name);
6	ggml.h
@@ -285,8 +285,10 @@
     GGML_UNUSED(prefix##3);
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 extern "C" {
 #endif
+#endif
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
@@ -2136,7 +2138,7 @@ extern "C" {
     // restrict not standard in C++
 #define GGML_RESTRICT
 #else
-#define GGML_RESTRICT restrict
+#define GGML_RESTRICT __restrict__
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
@@ -2157,5 +2159,7 @@ extern "C" {
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 }
 #endif
+#endif
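The CPP_ONLY guard makes the extern "C" wrapper opt-out: the intent appears to be that translation units built as C++ with -DCPP_ONLY keep C++ linkage for the declarations, while consumers that don't define the macro still get C linkage as before (the diff itself doesn't show where CPP_ONLY is defined, so this is inferred). Reduced to its skeleton:

// Skeleton of the guard, with a hypothetical declaration:
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif

void ggml_example(void);   // hypothetical; stands in for the API declarations

#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif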
896	llama-internal.hpp  Normal file
@@ -0,0 +1,896 @@
+#include <set>
+#include <queue>
+
+enum llm_arch {
+    LLM_ARCH_LLAMA,
+    LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
+    LLM_ARCH_GPT2,
+    LLM_ARCH_GPTJ,
+    LLM_ARCH_GPTNEOX,
+    LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
+    LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
+    LLM_ARCH_UNKNOWN,
+};
+
+enum llm_kv {
+    LLM_KV_GENERAL_ARCHITECTURE,
+    LLM_KV_GENERAL_QUANTIZATION_VERSION,
+    LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_NAME,
+    LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_URL,
+    LLM_KV_GENERAL_DESCRIPTION,
+    LLM_KV_GENERAL_LICENSE,
+    LLM_KV_GENERAL_SOURCE_URL,
+    LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+    LLM_KV_CONTEXT_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_BLOCK_COUNT,
+    LLM_KV_FEED_FORWARD_LENGTH,
+    LLM_KV_USE_PARALLEL_RESIDUAL,
+    LLM_KV_TENSOR_DATA_LAYOUT,
+
+    LLM_KV_ATTENTION_HEAD_COUNT,
+    LLM_KV_ATTENTION_HEAD_COUNT_KV,
+    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+    LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_LAYERNORM_EPS,
+    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+
+    LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_FREQ_BASE,
+    LLM_KV_ROPE_SCALE_LINEAR,
+    LLM_KV_ROPE_SCALING_TYPE,
+    LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
+    LLM_KV_ROPE_SCALING_FINETUNED,
+
+    LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_LIST,
+    LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_SCORES,
+    LLM_KV_TOKENIZER_MERGES,
+    LLM_KV_TOKENIZER_BOS_ID,
+    LLM_KV_TOKENIZER_EOS_ID,
+    LLM_KV_TOKENIZER_UNK_ID,
+    LLM_KV_TOKENIZER_SEP_ID,
+    LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_HF_JSON,
+    LLM_KV_TOKENIZER_RWKV,
+};
+
+// available llama models
+enum e_model {
+    MODEL_UNKNOWN,
+    MODEL_1B,
+    MODEL_3B,
+    MODEL_7B,
+    MODEL_8B,
+    MODEL_13B,
+    MODEL_15B,
+    MODEL_30B,
+    MODEL_34B,
+    MODEL_40B,
+    MODEL_65B,
+    MODEL_70B,
+};
+
+enum llama_fver {
+    GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
+    GGUF_FILE_VERSION_V3 = 3,
+};
+
+struct LLM_KV {
+    LLM_KV(llm_arch arch) : arch(arch) {}
+
+    llm_arch arch;
+
+    std::string operator()(llm_kv kv) const; // moved to llama.cpp file
+};
+
+enum llm_tensor {
+    LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ATTN_Q,
+    LLM_TENSOR_ATTN_K,
+    LLM_TENSOR_ATTN_V,
+    LLM_TENSOR_ATTN_QKV,
+    LLM_TENSOR_ATTN_OUT,
+    LLM_TENSOR_ATTN_NORM,
+    LLM_TENSOR_ATTN_NORM_2,
+    LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE,
+    LLM_TENSOR_FFN_DOWN,
+    LLM_TENSOR_FFN_UP,
+    LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
+};
+
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads;       // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    uint32_t n_yarn_orig_ctx;
+    // These hyperparameters are not exposed in GGUF, because all
+    // existing YaRN models use the same values for them.
+    float yarn_ext_factor;
+    float yarn_attn_factor;
+    float yarn_beta_fast;
+    float yarn_beta_slow;
+
+    bool mul_mat_q;
+};
+
+struct llama_layer {
+    // normalization
+    struct ggml_tensor * attn_norm;
+    struct ggml_tensor * attn_norm_b;
+    struct ggml_tensor * attn_norm_2;
+    struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+    struct ggml_tensor * wqkv;
+
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
+
+    // ff
+    struct ggml_tensor * ffn_gate; // w1
+    struct ggml_tensor * ffn_down; // w2
+    struct ggml_tensor * ffn_up;   // w3
+
+    // ff bias
+    struct ggml_tensor * ffn_down_b; // b2
+    struct ggml_tensor * ffn_up_b;   // b3
+};
+
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+};
+
+struct llama_buffer {
+    void * data = NULL;
+    size_t size = 0;
+
+    // fallback to malloc / free
+    // useful in cases where CUDA can try to allocate PINNED memory
+    bool fallback = false;
+
+    void resize(size_t n) ;
+
+    ~llama_buffer();
+};
+
+// ring-buffer of cached KV data
+struct llama_kv_cache {
+    bool has_shift = false;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
+
+    struct ggml_context * ctx = NULL;
+
+    llama_buffer buf;
+
+    ~llama_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
+    }
+};
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::unordered_map<token, id> special_tokens_cache;
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id = 1;
+    id special_eos_id = 2;
+    id special_unk_id = 0;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
+    id linefeed_id       = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
+
+    int find_bpe_rank(std::string token_left, std::string token_right) const {
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
+
+        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
+        if (it == bpe_ranks.end()) {
+            return -1;
+        }
+
+        return it->second;
+    }
+};
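Reading note on find_bpe_rank: bpe_ranks maps a pair of adjacent symbols to its merge priority (lower rank = merged earlier by the BPE tokenizer), and -1 means the pair never merges. A toy illustration of how a merge step would consult it (standalone sketch with made-up pairs, not the loader's code):

#include <map>
#include <string>
#include <utility>

int main() {
    // rank table as the tokenizer would populate it from the GGUF "merges" list
    std::map<std::pair<std::string, std::string>, int> bpe_ranks = {
        {{"h", "e"},   0}, // lowest rank = merged first
        {{"l", "l"},   1},
        {{"he", "ll"}, 2},
    };

    auto it = bpe_ranks.find(std::make_pair(std::string("h"), std::string("e")));
    int rank = (it == bpe_ranks.end()) ? -1 : it->second; // -1: pair never merges
    return rank == 0 ? 0 : 1;
}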
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
+    ~llama_mmap();
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+#else
+    static constexpr bool SUPPORTED = false;
+#endif
+};
+
+struct llama_hparams {
+    bool     vocab_only;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_yarn_orig_ctx;
+    int8_t   rope_scaling_type_train : 3;
+    bool     rope_finetuned : 1;
+
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
+    bool operator!=(const llama_hparams & other) const;
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+};
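For a concrete reading of the three grouped-query-attention helpers, take numbers in the LLaMA-2-70B range (illustrative values, not read from this diff): n_embd = 8192, n_head = 64, n_head_kv = 8.

#include <cstdint>

int main() {
    // illustrative GQA configuration: 8192-wide embeddings,
    // 64 query heads, 8 KV heads
    uint32_t n_embd = 8192, n_head = 64, n_head_kv = 8;

    uint32_t n_gqa       = n_head / n_head_kv; // 8: query heads sharing one KV head
    uint32_t n_embd_head = n_embd / n_head;    // 128: per-head dimension
    uint32_t n_embd_gqa  = n_embd / n_gqa;     // 1024: width of one K (or V) cache row
    return (n_gqa == 8 && n_embd_head == 128 && n_embd_gqa == 1024) ? 0 : 1;
}

n_embd_gqa() is what sizes the KV cache rows, which is why GQA models need far less cache memory than their multi-head equivalents.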
|
||||||
|
struct llama_mlock {
|
||||||
|
void * addr = NULL;
|
||||||
|
size_t size = 0;
|
||||||
|
bool failed_already = false;
|
||||||
|
llama_mlock() ;
|
||||||
|
|
||||||
|
llama_mlock(const llama_mlock &) = delete;
|
||||||
|
~llama_mlock();
|
||||||
|
void init(void * ptr);
|
||||||
|
void grow_to(size_t target_size);
|
||||||
|
#ifdef _POSIX_MEMLOCK_RANGE
|
||||||
|
static constexpr bool SUPPORTED = true;
|
||||||
|
static size_t lock_granularity();
|
||||||
|
#ifdef __APPLE__
|
||||||
|
#define MLOCK_SUGGESTION \
|
||||||
|
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||||
|
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||||
|
#else
|
||||||
|
#define MLOCK_SUGGESTION \
|
||||||
|
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||||
|
#endif
|
||||||
|
bool raw_lock(const void * addr, size_t size) const ;
|
||||||
|
#undef MLOCK_SUGGESTION
|
||||||
|
static void raw_unlock(void * addr, size_t size);
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
static constexpr bool SUPPORTED = true;
|
||||||
|
static size_t lock_granularity();
|
||||||
|
bool raw_lock(void * ptr, size_t len) const ;
|
||||||
|
static void raw_unlock(void * ptr, size_t len);
|
||||||
|
#else
|
||||||
|
static constexpr bool SUPPORTED = false;
|
||||||
|
static size_t lock_granularity();
|
||||||
|
bool raw_lock(const void * addr, size_t len) const;
|
||||||
|
static void raw_unlock(const void * addr, size_t len);
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct llama_model {
|
||||||
|
e_model type = MODEL_UNKNOWN;
|
||||||
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||||
|
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||||
|
|
||||||
|
std::string name = "n/a";
|
||||||
|
|
||||||
|
llama_hparams hparams = {};
|
||||||
|
llama_vocab vocab;
|
||||||
|
|
||||||
|
struct ggml_tensor * tok_embd;
|
||||||
|
struct ggml_tensor * pos_embd;
|
||||||
|
struct ggml_tensor * tok_norm;
|
||||||
|
struct ggml_tensor * tok_norm_b;
|
||||||
|
|
||||||
|
struct ggml_tensor * output_norm;
|
||||||
|
struct ggml_tensor * output_norm_b;
|
||||||
|
struct ggml_tensor * output;
|
||||||
|
|
||||||
|
std::vector<llama_layer> layers;
|
||||||
|
|
||||||
|
int n_gpu_layers;
|
||||||
|
|
||||||
|
// gguf metadata
|
||||||
|
std::unordered_map<std::string, std::string> gguf_kv;
|
||||||
|
|
||||||
|
// context
|
||||||
|
struct ggml_context * ctx = NULL;
|
||||||
|
|
||||||
|
// the model memory buffer
|
||||||
|
llama_buffer buf;
|
||||||
|
|
||||||
|
// model memory mapped file
|
||||||
|
std::unique_ptr<llama_mmap> mapping;
|
||||||
|
|
||||||
|
// objects representing data potentially being locked in memory
|
||||||
|
llama_mlock mlock_buf;
|
||||||
|
llama_mlock mlock_mmap;
|
||||||
|
|
||||||
|
// for quantize-stats only
|
||||||
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||||
|
|
||||||
|
int64_t t_load_us = 0;
|
||||||
|
int64_t t_start_us = 0;
|
||||||
|
|
||||||
|
~llama_model() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
if (ggml_cublas_loaded()) {
|
||||||
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||||
|
ggml_cuda_free_data(tensors_by_name[i].second);
|
||||||
|
}
|
||||||
|
ggml_cuda_free_scratch();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(GGML_USE_CLBLAST)
|
||||||
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||||
|
ggml_cl_free_data(tensors_by_name[i].second);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_context {
|
||||||
|
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
|
||||||
|
~llama_context();
|
||||||
|
|
||||||
|
llama_cparams cparams;
|
||||||
|
|
||||||
|
const llama_model & model;
|
||||||
|
|
||||||
|
// key + value cache for the self attention
|
||||||
|
struct llama_kv_cache kv_self;
|
||||||
|
|
||||||
|
std::mt19937 rng;
|
||||||
|
|
||||||
|
bool has_evaluated_once = false;
|
||||||
|
|
||||||
|
int64_t t_start_us;
|
||||||
|
int64_t t_load_us;
|
||||||
|
int64_t t_sample_us = 0;
|
||||||
|
int64_t t_p_eval_us = 0;
|
||||||
|
int64_t t_eval_us = 0;
|
||||||
|
|
||||||
|
int32_t n_sample = 0; // number of tokens sampled
|
||||||
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||||
|
int32_t n_eval = 0; // number of eval calls
|
||||||
|
|
||||||
|
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||||
|
std::vector<float> logits;
|
||||||
|
bool logits_all = false;
|
||||||
|
|
||||||
|
// input embedding (1-dimensional array: [n_embd])
|
||||||
|
std::vector<float> embedding;
|
||||||
|
|
||||||
|
// reusable buffer for `struct ggml_graph_plan.work_data`
|
||||||
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
|
// memory buffers used to evaluate the model
|
||||||
|
llama_buffer buf_compute;
|
||||||
|
|
||||||
|
llama_buffer buf_alloc;
|
||||||
|
ggml_allocr * alloc = NULL;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
ggml_metal_context * ctx_metal = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
ggml_mpi_context * ctx_mpi = NULL;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct LLM_TN {
|
||||||
|
LLM_TN(llm_arch arch) ;
|
||||||
|
|
||||||
|
llm_arch arch;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor) const;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, int bid) const ;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct llama_file {
|
||||||
|
// use FILE * so we don't have to re-open the file to mmap
|
||||||
|
FILE * fp;
|
||||||
|
size_t size;
|
||||||
|
|
||||||
|
llama_file(const char * fname, const char * mode) ;
|
||||||
|
size_t tell() const;
|
||||||
|
void seek(size_t offset, int whence) const;
|
||||||
|
void read_raw(void * ptr, size_t len) const;
|
||||||
|
uint32_t read_u32() const;
|
||||||
|
void write_raw(const void * ptr, size_t len) const ;
|
||||||
|
void write_u32(std::uint32_t val) const;
|
||||||
|
~llama_file();
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct llama_state {
|
||||||
|
llama_state();
|
||||||
|
// We save the log callback globally
|
||||||
|
ggml_log_callback log_callback;
|
||||||
|
void * log_callback_user_data = nullptr;
|
||||||
|
};

struct llama_model_loader {
    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    int64_t n_elements = 0;
    size_t  n_bytes    = 0;

    bool use_mmap = false;

    llama_file  file;
    llama_ftype ftype;
    llama_fver  fver;

    std::unique_ptr<llama_mmap> mapping;

    struct gguf_context * ctx_gguf = NULL;
    struct ggml_context * ctx_meta = NULL;

    llama_model_loader(const std::string & fname, bool use_mmap);
    ~llama_model_loader();

    std::string get_arch_name() const;
    enum llm_arch get_arch() const;
    const char * get_tensor_name(int i) const;
    struct ggml_tensor * get_tensor_meta(int i) const;

    void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;

    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend);
    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend);

    void done_getting_tensors() const;

    size_t file_offset(const char * name) const;

    void load_data_for(struct ggml_tensor * cur) const;
    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock);
};

struct llama_data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_context() = default;
};

struct llama_data_buffer_context : llama_data_context {
    uint8_t * ptr;
    size_t size_written = 0;

    llama_data_buffer_context(uint8_t * p);

    void write(const void * src, size_t size) override;
    size_t get_size_written() override;
};

struct llama_data_file_context : llama_data_context {
    llama_file * file;
    size_t size_written = 0;

    llama_data_file_context(llama_file * f);

    size_t get_size_written() override;
    void write(const void * src, size_t size) override;
};
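
A plausible sketch of the two writers, consistent with the declarations above: the buffer variant memcpy's into a caller-sized region, the file variant forwards to llama_file::write_raw, and both track the byte count so callers can size or verify the serialized state:

    void llama_data_buffer_context::write(const void * src, size_t size) {
        memcpy(ptr, src, size); // caller guarantees capacity
        ptr          += size;
        size_written += size;
    }

    void llama_data_file_context::write(const void * src, size_t size) {
        file->write_raw(src, size);
        size_written += size;
    }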

struct llama_beam {
    std::vector<llama_token> tokens;
    float p;  // cumulative beam probability (renormalized relative to all beams)
    bool eob; // initialized to false; the beam-search callback sets this to true

    // sort beams by probability; in case of ties, prefer beams at eob
    bool operator<(const llama_beam & rhs) const;

    void shift_tokens(const size_t n);

    llama_beam_view view() const;
};
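
The ordering described above can be expressed as a lexicographic comparison on (p, eob), e.g.:

    bool llama_beam::operator<(const llama_beam & rhs) const {
        // lower probability sorts first; for equal p, eob breaks the tie
        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
    }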

// a struct for calculating logit-related info
struct llama_logit_info {
    const float * const logits;
    const int   n_vocab;
    const float max_l;
    const float normalizer;

    struct sum_exp {
        float max_l;
        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
    };

    llama_logit_info(llama_context * ctx);

    llama_token_data get_token_data(const llama_token token_id) const;
    std::vector<llama_token_data> top_k(size_t k);
    float probability_from_logit(float logit) const;
};
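
Given max_l (the maximum logit) and normalizer (the sum of shifted exponentials accumulated with sum_exp), probability_from_logit is an ordinary numerically stable softmax; a sketch:

    // p(token) = exp(logit - max_l) / sum_i exp(logit_i - max_l)
    float llama_logit_info::probability_from_logit(float logit) const {
        return std::exp(logit - max_l) / normalizer;
    }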

struct llama_beam_search_data {
    llama_context * ctx;
    size_t n_beams;
    int n_past;
    int n_predict;
    std::vector<llama_beam> beams;
    std::vector<llama_beam> next_beams;
    size_t common_prefix_length;
    std::vector<llama_beam_view> beam_views;

    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);

    void collapse_beams(const size_t beam_idx);
    void fill_next_beams_by_top_probabilities(llama_beam & beam);
    size_t find_common_prefix_length();
    llama_beams_state get_beams_state(const bool last_call);
    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams);
    size_t top_beam_index();
    void update_beams_from_beam_views();
};
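
renormalize_beam_probabilities keeps the cumulative probabilities comparable across beams; a sketch (requires <numeric>):

    void llama_beam_search_data::renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
        const float sum = std::accumulate(beams.begin(), beams.end(), 0.0f,
            [](float s, const llama_beam & b) { return s + b.p; });
        for (llama_beam & b : beams) {
            b.p /= sum; // beam probabilities now sum to 1
        }
    }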

using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

enum llm_rope_type {
    LLM_ROPE,
    LLM_ROPE_NEOX,
    LLM_ROPE_GLM,
};

enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
};

enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};

enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
};
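
In pseudo-graph terms, the op/gate enums control how a feed-forward block is wired; mul and act below are placeholders standing in for the corresponding ggml calls, not real API:

    // LLM_FFN_PAR: gate runs parallel to up, outputs are multiplied elementwise
    //   cur = act(mul(ffn_gate, x)) * mul(ffn_up, x);
    // LLM_FFN_SEQ: gate output is fed into up
    //   cur = mul(ffn_up, act(mul(ffn_gate, x)));
    // either way, the result is projected back down:
    //   cur = mul(ffn_down, cur);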

struct llm_build_context {
    const llama_model    & model;
    const llama_hparams  & hparams;
    const llama_cparams  & cparams;
    const llama_batch    & batch;
    const llama_kv_cache & kv_self;

    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head;
    const int64_t n_embd_gqa;

    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int32_t n_tokens;
    const int32_t n_kv;    // size of KV cache to consider (n_kv <= n_ctx)
    const int32_t kv_head; // index of where we store new KV data in the cache
    const int32_t n_orig_ctx;

    const bool do_rope_shift;

    const llm_build_cb & cb;

    llama_buffer & buf_compute;

    struct ggml_context * ctx0 = nullptr;

    // TODO: consider making the entire interface noexcept
    llm_build_context(
        llama_context     & lctx,
        const llama_batch & batch,
        const llm_build_cb & cb,
        bool worst_case);

    void init();
    void free();

    struct ggml_cgraph * build_llama();
    struct ggml_cgraph * build_baichuan();
    struct ggml_cgraph * build_falcon();
    struct ggml_cgraph * build_starcoder();
    struct ggml_cgraph * build_persimmon();
    struct ggml_cgraph * build_refact();
    struct ggml_cgraph * build_bloom();
    struct ggml_cgraph * build_mpt();
    struct ggml_cgraph * build_stablelm();
};

enum llm_offload_func_e {
    OFFLOAD_FUNC_NOP,
    OFFLOAD_FUNC,
    OFFLOAD_FUNC_KQ,
    OFFLOAD_FUNC_V,
    OFFLOAD_FUNC_NR,
    OFFLOAD_FUNC_EMB,
    OFFLOAD_FUNC_OUT,
};

struct llm_offload_trie {
    struct node {
        ~node();

        node * children[256] = { nullptr };
        llm_offload_func_e func = OFFLOAD_FUNC_NOP;
    };

    node * root = nullptr;

    llm_offload_trie();
    llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map);
    ~llm_offload_trie();

    void add(const char * name, llm_offload_func_e func);
    llm_offload_func_e find(const char * name) const;
};
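
The trie maps tensor names to offload functions one byte at a time; add/find might look like this (a sketch consistent with the declaration above):

    void llm_offload_trie::add(const char * name, llm_offload_func_e func) {
        node * cur = root;
        for (const unsigned char * p = (const unsigned char *) name; *p; ++p) {
            if (cur->children[*p] == nullptr) {
                cur->children[*p] = new node;
            }
            cur = cur->children[*p];
        }
        cur->func = func;
    }

    llm_offload_func_e llm_offload_trie::find(const char * name) const {
        const node * cur = root;
        for (const unsigned char * p = (const unsigned char *) name; *p; ++p) {
            cur = cur->children[*p];
            if (cur == nullptr) {
                return OFFLOAD_FUNC_NOP; // unknown tensor name: no offload
            }
        }
        return cur->func;
    }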

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
    };

    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;

    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};

struct llm_tokenizer_spm {
    llm_tokenizer_spm(const llama_vocab & vocab);

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);

private:
    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output);
    void try_add_bigram(int left, int right);

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;

    std::map<std::string, std::pair<int, int>> rev_merge;
};
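
At its core, tokenize() is a greedy merge loop over the priority queue: seed every adjacent symbol pair, then repeatedly apply the best-scoring bigram whose halves are still intact. A condensed sketch of that loop:

    while (!work_queue.empty()) {
        llm_bigram_spm bigram = work_queue.top();
        work_queue.pop();

        llm_symbol & left  = symbols[bigram.left];
        llm_symbol & right = symbols[bigram.right];

        // skip bigrams whose symbols were already merged away
        if (left.n == 0 || right.n == 0 || left.n + right.n != bigram.size) {
            continue;
        }

        // merge right into left and unlink right from the symbol list
        left.n += right.n;
        right.n = 0;
        left.next = right.next;
        if (right.next >= 0) {
            symbols[right.next].prev = bigram.left;
        }

        // the merged symbol forms new candidate bigrams with its neighbors
        try_add_bigram(left.prev, bigram.left);
        try_add_bigram(bigram.left, left.next);
    }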

// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// the unicode handling was simplified, so it most likely does not work 100% correctly!

// TODO: there are a lot of common parts between the spm and bpe tokenizers; they should be refactored and reused

struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const;
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;

    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
    int rank;
    size_t size;
};

struct llm_tokenizer_bpe {
    llm_tokenizer_bpe(const llama_vocab & vocab);

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);

private:
    void add_new_bigram(int left, int right);

    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text);

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;

    llm_bigram_bpe::queue work_queue;
};

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

struct fragment_buffer_variant {
    fragment_buffer_variant(llama_vocab::id _token);
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_vocab::id token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
};

struct llama_partial_utf8 {
    uint32_t value;    // bit value so far (unshifted)
    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
};
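
value and n_remain let grammar matching resume mid-codepoint when a token boundary falls inside a multi-byte UTF-8 sequence. One way to picture the state transitions (utf8_step is a hypothetical helper; decode_utf8 in llama.cpp follows this shape):

    static llama_partial_utf8 utf8_step(llama_partial_utf8 st, uint8_t byte) {
        if (st.n_remain > 0) {
            if ((byte & 0xC0) != 0x80) return { 0, -1 };       // invalid continuation byte
            return { (st.value << 6) | (byte & 0x3Fu), st.n_remain - 1 };
        }
        if ((byte & 0x80) == 0x00) return { byte, 0 };         // ASCII, complete
        if ((byte & 0xE0) == 0xC0) return { byte & 0x1Fu, 1 }; // 2-byte lead
        if ((byte & 0xF0) == 0xE0) return { byte & 0x0Fu, 2 }; // 3-byte lead
        if ((byte & 0xF8) == 0xF0) return { byte & 0x07u, 3 }; // 4-byte lead
        return { 0, -1 };                                      // invalid lead byte
    }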

struct llama_grammar {
    const std::vector<std::vector<llama_grammar_element>>   rules;
    std::vector<std::vector<const llama_grammar_element *>> stacks;

    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8 partial_utf8;
};

struct llama_grammar_candidate {
    size_t index;
    const uint32_t * code_points;
    llama_partial_utf8 partial_utf8;
};

struct quantize_state_internal {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    int i_attention_wv    = 0;
    int i_feed_forward_w2 = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;

    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
        {}
};
8
llama.h
@@ -50,7 +50,9 @@
 #endif
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 extern "C" {
+#endif
 #endif
 
 //
@@ -827,8 +829,10 @@ extern "C" {
 LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 }
 #endif
+#endif
 
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
@@ -844,4 +848,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 
 #endif // LLAMA_API_INTERNAL
+
+
+
+
 
 #endif // LLAMA_H
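
The new CPP_ONLY guard lets a C++ translation unit opt out of the extern "C" wrapper so the declarations keep C++ linkage, which the reflection metadata in print.hpp below appears to rely on. Presumably consumed as:

    // hypothetical consumer: define CPP_ONLY before the include
    #define CPP_ONLY
    #include "llama.h"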
756
print.hpp
Normal file

@@ -0,0 +1,756 @@
#include <iostream>
#include "refl.hpp" // refl-cpp, assumed source of REFL_TYPE/REFL_FIELD/REFL_END if not already included transitively
#include "llama.h"
#include "ggml-internal.hpp"
#include "llama-internal.hpp"

REFL_TYPE(ggml_init_params)
REFL_END

REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END

REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END

REFL_TYPE(ggml_opt_context::ggml_grad)
REFL_END

REFL_TYPE(gpt_params)
REFL_FIELD(seed)
REFL_FIELD(n_threads)
REFL_FIELD(n_threads_batch)
REFL_FIELD(n_predict)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_FIELD(n_keep)
REFL_FIELD(n_draft)
REFL_FIELD(n_chunks)
REFL_FIELD(n_parallel)
REFL_FIELD(n_sequences)
REFL_FIELD(p_accept)
REFL_FIELD(p_split)
REFL_FIELD(n_gpu_layers)
REFL_FIELD(n_gpu_layers_draft)
REFL_FIELD(main_gpu)
REFL_FIELD(tensor_split)
REFL_FIELD(n_beams)
REFL_FIELD(rope_freq_base)
REFL_FIELD(rope_freq_scale)
REFL_FIELD(yarn_ext_factor)
REFL_FIELD(yarn_attn_factor)
REFL_FIELD(yarn_beta_fast)
REFL_FIELD(yarn_beta_slow)
REFL_FIELD(yarn_orig_ctx)
REFL_FIELD(rope_scaling_type)
REFL_FIELD(sparams)
REFL_FIELD(model)
REFL_FIELD(model_draft)
REFL_FIELD(model_alias)
REFL_FIELD(prompt)
REFL_FIELD(prompt_file)
REFL_FIELD(path_prompt_cache)
REFL_FIELD(input_prefix)
REFL_FIELD(input_suffix)
REFL_FIELD(antiprompt)
REFL_FIELD(logdir)
REFL_FIELD(lora_adapter)
REFL_FIELD(lora_base)
REFL_FIELD(ppl_stride)
REFL_FIELD(ppl_output_type)
REFL_FIELD(hellaswag)
REFL_FIELD(hellaswag_tasks)
REFL_FIELD(mul_mat_q)
REFL_FIELD(memory_f16)
REFL_FIELD(random_prompt)
REFL_FIELD(use_color)
REFL_FIELD(interactive)
REFL_FIELD(chatml)
REFL_FIELD(prompt_cache_all)
REFL_FIELD(prompt_cache_ro)
REFL_FIELD(embedding)
REFL_FIELD(escape)
REFL_FIELD(interactive_first)
REFL_FIELD(multiline_input)
REFL_FIELD(simple_io)
REFL_FIELD(cont_batching)
REFL_FIELD(input_prefix_bos)
REFL_FIELD(ignore_eos)
REFL_FIELD(instruct)
REFL_FIELD(logits_all)
REFL_FIELD(use_mmap)
REFL_FIELD(use_mlock)
REFL_FIELD(numa)
REFL_FIELD(verbose_prompt)
REFL_FIELD(infill)
REFL_FIELD(mmproj)
REFL_FIELD(image)
REFL_END

REFL_TYPE(llama_sampling_params)
REFL_END

REFL_TYPE(llm_arch)
REFL_END

REFL_TYPE(llama_sampling_context)
REFL_FIELD(params)
REFL_FIELD(mirostat_mu)
REFL_FIELD(grammar)
REFL_FIELD(parsed_grammar)
REFL_FIELD(prev)
REFL_FIELD(cur)
REFL_END

REFL_TYPE(llama_token_data)
REFL_END

REFL_TYPE(llama_token_data_array)
REFL_END

REFL_TYPE(llama_batch)
REFL_END

REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END

REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END

REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END

REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END

REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END

REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END

REFL_TYPE(gguf_init_params)
REFL_END

REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END

REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD(no_alloc)
REFL_FIELD(no_alloc_save)
REFL_FIELD(n_objects)
REFL_FIELD(objects_begin)
REFL_FIELD(objects_end)
REFL_FIELD(scratch)
REFL_FIELD(scratch_save)
REFL_END

REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END

REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END

REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END

REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END

REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END

REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END

REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END

REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END

REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END

REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END

REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END

REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END

REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END

REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END

REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END

REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END

REFL_TYPE(llama_grammar_element)
REFL_END

REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END

REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END

REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END

REFL_TYPE(ggml_backend)
REFL_END

REFL_TYPE(ggml_backend_buffer)
REFL_END

REFL_TYPE(ggml_allocr)
REFL_END

REFL_TYPE(ggml_tallocr)
REFL_END

REFL_TYPE(ggml_gallocr)
REFL_END

REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END

REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END

REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END

REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END

REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END

REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END

REFL_TYPE(e_model)
REFL_END

REFL_TYPE(llama_ftype)
REFL_END

REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype)

REFL_FIELD(name)

REFL_FIELD(hparams)
REFL_FIELD(vocab)

REFL_FIELD(tok_embd)
REFL_FIELD(pos_embd)
REFL_FIELD(tok_norm)
REFL_FIELD(tok_norm_b)

REFL_FIELD(output_norm)
REFL_FIELD(output_norm_b)
REFL_FIELD(output)

REFL_FIELD(layers)

REFL_FIELD(n_gpu_layers)

REFL_FIELD(gguf_kv)  // unordered map
REFL_FIELD(ctx)
REFL_FIELD(buf)
REFL_FIELD(mapping)  // std::unique_ptr
REFL_FIELD(mlock_buf)
REFL_FIELD(mlock_mmap)
REFL_FIELD(tensors_by_name)
REFL_FIELD(t_load_us)
REFL_FIELD(t_start_us)
REFL_END

REFL_TYPE(llama_vocab)
REFL_END

REFL_TYPE(grammar_parser::parse_state)
REFL_END

REFL_TYPE(llama_context)
REFL_FIELD(cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) // random numbers
REFL_FIELD(has_evaluated_once)
REFL_FIELD(t_start_us)
REFL_FIELD(t_load_us)
REFL_FIELD(t_sample_us)
REFL_FIELD(t_p_eval_us)
REFL_FIELD(t_eval_us)
REFL_FIELD(n_sample)
REFL_FIELD(n_p_eval)
REFL_FIELD(n_eval)
REFL_FIELD(logits)
REFL_FIELD(logits_all)
REFL_FIELD(embedding)
REFL_FIELD(work_buffer)
REFL_FIELD(buf_compute)
REFL_FIELD(buf_alloc)
REFL_FIELD(alloc)

#ifdef GGML_USE_METAL
REFL_FIELD(ctx_metal)
#endif

#ifdef GGML_USE_MPI
REFL_FIELD(ctx_mpi)
#endif
REFL_END

REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END

REFL_TYPE(llm_build_context)
// REFL_FIELD(model)   cannot create pointer to reference member ‘llm_build_context::model’
// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’
REFL_END

REFL_TYPE(llm_offload_trie)
REFL_END

REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END

REFL_TYPE(llm_bigram_spm)
REFL_END

REFL_TYPE(llm_tokenizer_spm)
REFL_END

REFL_TYPE(llm_bigram_bpe)
REFL_END

REFL_TYPE(llm_tokenizer_bpe)
REFL_END

REFL_TYPE(fragment_buffer_variant)
REFL_END

REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END

REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END

REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END

REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END

REFL_TYPE(llama_logit_info)
REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END

REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END

REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD(n_attention_wv)
REFL_FIELD(n_feed_forward_w2)
REFL_FIELD(i_attention_wv)
REFL_FIELD(i_feed_forward_w2)
REFL_FIELD(n_k_quantized)
REFL_FIELD(n_fallback)
REFL_END

REFL_TYPE(llama_data_context)
REFL_END

REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END

REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END

template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
    return t.value_type;
}

namespace runtime2
{
    using namespace refl;
    using namespace refl::descriptor;

    template <typename CharT, typename T>
    void debug(std::basic_ostream<CharT>& os, const T& value, bool compact = false);

    namespace detail
    {
        template <typename CharT, typename T, typename = decltype(std::declval<std::basic_ostream<CharT>&>() << std::declval<T>())>
        std::true_type is_ostream_printable_test(int);

        template <typename CharT, typename T>
        std::false_type is_ostream_printable_test(...);

        template <typename CharT, typename T>
        constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test<CharT, T>(0))::value };

        namespace
        {
            [[maybe_unused]] int next_depth(int depth)
            {
                return depth == -1 || depth > 8
                    ? -1
                    : depth + 1;
            }
        }

        template <typename CharT>
        void indent(std::basic_ostream<CharT>& os, int depth)
        {
            for (int i = 0; i < depth; i++) {
                os << "    ";
            }
        }

        template <typename CharT, typename T>
        void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth);

        template <typename CharT, typename T>
        void debug_detailed(std::basic_ostream<CharT>& os, const T& value, int depth)
        {
            using type_descriptor = type_descriptor<T>;
            bool compact = depth == -1;
            // print type with members enclosed in braces
            os << type_descriptor::name << " { ";
            if (!compact) os << '\n';

            constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); });
            for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) {
                int new_depth = next_depth(depth);

                indent(os, new_depth);
                os << get_display_name(member) << " = ";

                if constexpr (util::contains_instance<attr::debug>(member.attributes)) {
                    // use the debug attribute to print
                    auto debug_attr = util::get_instance<attr::debug>(member.attributes);
                    debug_attr.write(os, value);
                }
                else {
                    debug_impl(os, member(value), new_depth);
                }

                if (!compact || index + 1 != readable_members.size) {
                    os << ", ";
                }
                if (!compact) {
                    indent(os, depth);
                    os << '\n';
                }
            });

            if (compact) os << ' ';
            indent(os, depth);
            os << '}';
        }

        template <typename CharT, typename T>
        void debug_reflectable(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
        {
            using type_descriptor = type_descriptor<T>;
            if constexpr (trait::contains_instance_v<attr::debug, typename type_descriptor::attribute_types>) {
                // use the debug attribute to print
                auto debug_attr = util::get_instance<attr::debug>(type_descriptor::attributes);
                debug_attr.write(os, value);
            }
            else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
                // type supports printing natively, just use that
                os << value;
            }
            else {
                debug_detailed(os, value, depth);
            }
        }

        template <typename CharT, typename T>
        void debug_container(std::basic_ostream<CharT>& os, const T& value, int depth)
        {
            bool compact = depth == -1;
            os << "[";

            auto end = value.end();
            for (auto it = value.begin(); it != end; ++it)
            {
                if (!compact) os << '\n';
                int new_depth = next_depth(depth);
                indent(os, new_depth);

                debug_impl(os, *it, new_depth);
                if (std::next(it, 1) != end) {
                    os << ", ";
                }
                else if (!compact) {
                    os << '\n';
                }
            }

            indent(os, depth);
            os << "]";
        }

        template <typename CharT, typename T>
        void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
        {
            using no_pointer_t = std::remove_pointer_t<T>;

            if constexpr (std::is_same_v<bool, T>) {
                os << (value ? "true" : "false");
            }
            else if constexpr (std::is_pointer_v<T> && !std::is_void_v<no_pointer_t> && trait::is_reflectable_v<no_pointer_t>) {
                if (value == nullptr) {
                    os << "nullptr";
                }
                else {
                    os << '&';
                    debug_impl(os, *value, -1);
                }
            }
            else if constexpr (trait::is_reflectable_v<T>) {
                debug_reflectable(os, value, depth);
            }
            else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
                os << value;
            }
            else if constexpr (trait::is_container_v<T>) {
                debug_container(os, value, depth);
            }
            else {
                os << "(not printable)";
            }
        }
    }

    /**
     * Writes the debug representation of value to the given std::ostream.
     * Calls the function specified by the debug<F> attribute whenever possible,
     * before falling back to recursively iterating the members and printing them.
     * Takes an optional argument specifying whether to print a compact representation.
     * The compact representation contains no newlines.
     */
    template <typename CharT, typename T>
    void debug(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] bool compact)
    {
        static_assert(trait::is_reflectable_v<T> || trait::is_container_v<T> || detail::is_ostream_printable_v<CharT, T>,
            "Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!");

        detail::debug_impl(os, value, compact ? -1 : 0);
    }

    /**
     * Writes the compact debug representation of the provided values to the given std::ostream.
     */
    template <typename CharT, typename... Ts>
    void debug_all(std::basic_ostream<CharT>& os, const Ts&... values)
    {
        refl::runtime::debug(os, std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
    }

    /**
     * Writes the debug representation of the provided value to an std::string and returns it.
     * Takes an optional argument specifying whether to print a compact representation.
     * The compact representation contains no newlines.
     */
    template <typename CharT = char, typename T>
    std::basic_string<CharT> debug_str(const T& value, bool compact = false)
    {
        std::basic_stringstream<CharT> ss;
        debug(ss, value, compact);
        return ss.str();
    }

    /**
     * Writes the compact debug representation of the provided values to an std::string and returns it.
     */
    template <typename CharT = char, typename... Ts>
    std::basic_string<CharT> debug_all_str(const Ts&... values)
    {
        return refl::runtime::debug_str(std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
    }
}

// a generic function to print out the fields of any reflected object
template<typename T>
void print_fields(const T& t) {
    runtime2::debug(std::cout, t);

    constexpr auto type       = refl::reflect<T>();
    constexpr auto membertype = refl::member_list<T>();
    constexpr auto members    = get_members(type);

    std::cout << "DEBUG Type: "  << type.name.c_str()           << "\n";
    std::cout << "DEBUG Type2: " << typeid(membertype).name()   << "\n";
    std::cout << "DEBUG Type3: " << typeid(members).name()      << "\n";

    refl::util::for_each(members, [&](auto member) {
        std::cout << "Auto:" << member.name << "\n";
    });
    std::cout << "\n";
}
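
With the metadata above registered, any listed type can be dumped generically. A minimal usage sketch (assuming the usual gpt_params defaults from common.h):

    #include "print.hpp"

    int main() {
        gpt_params params;
        print_fields(params);   // prints the type name, then each REFL_FIELD member

        llama_partial_utf8 st = { 0, 0 };
        print_fields(st);
        return 0;
    }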
@@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
 llama_build_and_test_executable(test-rope.cpp)
 
 # dummy executable - not installed
-get_filename_component(TEST_TARGET test-c.c NAME_WE)
-add_executable(${TEST_TARGET} test-c.c)
+get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
+add_executable(${TEST_TARGET} test-c.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)