rebased and trimmed down

now compiling again,

now it might even run

adding debug notes

update

diff

improvement

update

update

working

simplify

running

moving to using refl-cpp for llama as well

now working

compiling and running

debugging

adding the print module with type information

the first type names are being printed

adding binding generator

bindings

refl now working, not on pointers but on the types

update

now has a model

adding new header for llama internal

demonstrate crash

remove crash

now starting to refactor the code

now the debug print is working
mike dupont 2023-11-21 11:25:37 -05:00
parent 52c8bc3cf3
commit 2b6ff2ec54
42 changed files with 6898 additions and 3518 deletions

.gitignore (vendored): 29 lines changed

@ -47,7 +47,6 @@ models-mnt
/libllama.so /libllama.so
/llama-bench /llama-bench
/llava-cli /llava-cli
/lookahead
/main /main
/metal /metal
/perplexity /perplexity
@ -88,16 +87,18 @@ poetry.lock
poetry.toml poetry.toml
# Test binaries # Test binaries
/tests/test-grammar-parser tests/test-grammar-parser
/tests/test-llama-grammar tests/test-llama-grammar
/tests/test-double-float tests/test-double-float
/tests/test-grad0 tests/test-grad0
/tests/test-opt tests/test-opt
/tests/test-quantize-fns tests/test-quantize-fns
/tests/test-quantize-perf tests/test-quantize-perf
/tests/test-sampling tests/test-sampling
/tests/test-tokenizer-0-llama tests/test-tokenizer-0-llama
/tests/test-tokenizer-0-falcon tests/test-tokenizer-0-falcon
/tests/test-tokenizer-1-llama tests/test-tokenizer-1-llama
/tests/test-tokenizer-1-bpe tests/test-tokenizer-1-bpe
/tests/test-rope /#llama.cpp#
#*
\\#*


@ -1,8 +1,34 @@
cmake_minimum_required(VERSION 3.13) # for add_link_options cmake_minimum_required(VERSION 3.13) # for add_link_options
project("llama.cpp" C CXX) project("llama.cpp" C CXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT MSVC)
set(cuda_flags -Wno-pedantic)
endif()
set(LLAMA_CUBLAS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(LLAMA_CUDA_F16 ON)
set(LLAMA_ACCELERATE ON)
set(LLAMA_K_QUANTS ON)
#-DLLAMA_NATIVE=off
set(LLAMA_AVX ON)
set(LLAMA_AVX2 OFF)
set(LLAMA_AVX512 OFF)
set(LLAMA_FMA OFF)
set(LLAMA_F16C OFF)
set(CMAKE_CUDA_FLAGS "--verbose") #
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
set(CUDACXX /usr/local/cuda-12.3/bin/nvcc)
set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc)
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3)
#GGML_USE_CUBLAS
#set(CMAKE_EXE_LINKER_FLAGS -pg)
#set(CMAKE_SHARED_LINKER_FLAGS -pg)
set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@ -45,7 +71,7 @@ endif()
# general # general
option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug # debug
@ -78,9 +104,9 @@ endif()
# 3rd party libs # 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF) option(LLAMA_BLAS "llama: use BLAS" ON)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA" ON)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
@ -108,7 +134,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
# Compile flags # Compile flags
# #
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD_REQUIRED true)
@ -239,7 +265,12 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
# from https://github.com/NVIDIA/cutlass
make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
# add_compile_definitions(GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL) add_compile_definitions(GGML_BLAS_USE_MKL)
endif() endif()
@ -281,6 +312,7 @@ if (LLAMA_CUBLAS)
endif() endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y) if (DEFINED LLAMA_CUDA_DMMV_Y)
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
endif() endif()
@ -321,7 +353,7 @@ if (LLAMA_MPI)
if (MPI_C_FOUND) if (MPI_C_FOUND)
message(STATUS "MPI found") message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h) set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI) add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC) if (NOT MSVC)
@ -399,14 +431,15 @@ endif()
if (LLAMA_ALL_WARNINGS) if (LLAMA_ALL_WARNINGS)
if (NOT MSVC) if (NOT MSVC)
set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) # -Wpedantic
set(warning_flags -Wall -Wextra -Wcast-qual -Wno-unused-function)
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive)
set(host_cxx_flags "") set(host_cxx_flags "")
if (CMAKE_C_COMPILER_ID MATCHES "Clang") if (CMAKE_C_COMPILER_ID MATCHES "Clang")
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi) set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive)
if ( if (
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@ -416,30 +449,27 @@ if (LLAMA_ALL_WARNINGS)
endif() endif()
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
set(c_flags ${c_flags} -Wdouble-promotion) set(c_flags ${c_flags} -Wdouble-promotion)
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds) set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation) set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive)
endif() endif()
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi) set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive)
endif() endif()
endif() endif()
else() else()
# todo : msvc # todo : msvc
endif() endif()
set(c_flags ${c_flags} ${warning_flags}) set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
set(cxx_flags ${cxx_flags} ${warning_flags}) set(cxx_flags ${cxx_flags} -fpermissive -save-temps --verbose ${warning_flags})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>" add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>" "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>") "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
endif() endif()
if (NOT MSVC)
set(cuda_flags -Wno-pedantic)
endif()
set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags}) set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
@ -447,6 +477,9 @@ if (NOT cuda_host_flags STREQUAL "")
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif() endif()
#
set(cuda_flags --verbose -G ${cuda_flags})
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>") add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
if (WIN32) if (WIN32)
@ -494,8 +527,10 @@ if (NOT MSVC)
add_link_options(-static-libgcc -static-libstdc++) add_link_options(-static-libgcc -static-libstdc++)
endif() endif()
endif() endif()
add_link_options("-Wl,-Map=${TARGET}.map")
if (LLAMA_GPROF) if (LLAMA_GPROF)
add_compile_options(-pg) add_compile_options(-pg)
endif() endif()
endif() endif()
@ -654,13 +689,16 @@ if (GGML_USE_CPU_HBM)
endif() endif()
add_library(ggml OBJECT add_library(ggml OBJECT
ggml.c ggml.cpp
ggml.h ggml.h
ggml-alloc.c print.hpp
ggml-internal.hpp
llama-internal.hpp
ggml-alloc.cpp
ggml-alloc.h ggml-alloc.h
ggml-backend.c ggml-backend.cpp
ggml-backend.h ggml-backend.h
ggml-quants.c ggml-quants.cpp
ggml-quants.h ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@ -692,7 +730,7 @@ add_library(llama
) )
target_include_directories(llama PUBLIC .) target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_compile_features(llama PUBLIC cxx_std_20) # don't bump
target_link_libraries(llama PRIVATE target_link_libraries(llama PRIVATE
ggml ggml
${LLAMA_EXTRA_LIBS} ${LLAMA_EXTRA_LIBS}


@ -1,3 +1,4 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = \ BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
@ -116,7 +117,7 @@ endif
# keep standard at C11 and C++11 # keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive
# -Ofast tends to produce faster code, but may not be available for some compilers. # -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST ifdef LLAMA_FAST
@ -506,7 +507,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
endif # LLAMA_METAL endif # LLAMA_METAL
ifdef LLAMA_MPI ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI endif # LLAMA_MPI
@ -541,17 +542,17 @@ $(info )
# Build library # Build library
# #
ggml.o: ggml.c ggml.h ggml-cuda.h ggml.o: ggml.cpp ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@ -586,7 +587,7 @@ clean:
# Examples # Examples
# #
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo @echo
@echo '==== Run ./main -h for help. ====' @echo '==== Run ./main -h for help. ===='
@ -685,6 +686,9 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
build-info.o: common/build-info.cpp build-info.o: common/build-info.cpp
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
#print.o: print.cpp # print.hpp
# $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
# #
# Tests # Tests
# #
@ -744,5 +748,5 @@ tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h tests/test-c.o: tests/test-c.cpp llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ $(CC) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@


@ -701,7 +701,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets ygou write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruction mode with Alpaca ### Instruction mode with Alpaca

README.org: new file, 1097 lines (file diff suppressed because it is too large)

binding.py: new file, 334 lines

@ -0,0 +1,334 @@
import os
import json
import re

import clang.cindex

# configurable part

CLANG_VERSION='13.0.1'

# homebrew installs for llvm (brew info llvm gives details):
# x64: /usr/local/opt/llvm/lib
# arm64: /opt/homebrew/opt/llvm/lib
llvmLibPath = "/usr/lib/llvm-15/lib/"

cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"

fileList = [
    "ggml.cpp",
    "llama.cpp"
]

typeList = [
]

# end of configurable part

clang.cindex.Config.set_library_path(llvmLibPath)


def list_headers_in_dir(path):
    # enumerates a folder but keeps the full pathing for the files returned
    # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)

    # list all the files in the folder
    files = os.listdir(path)

    # only include .hxx files
    files = list(filter(lambda x: x.endswith('.hxx'), files))

    # add the folder path back on
    files = list(map(lambda x: path + x, files))

    return files


# parse through the list of files specified and expand wildcards
fullFileList = []
for filePath in fileList:
    if "*" in filePath:
        # wildcard path
        basePath = filePath[:-1]
        if "*" in basePath:
            # if there is still a wildcard, we have an issue...
            raise NotImplementedError(
                "wildcard only supported at end of file path")
        files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
        fullFileList = fullFileList + files
    else:
        # normal path
        ff = os.path.join(cxxClientRoot, filePath)
        fullFileList.append(ff)
        print("DBUG",ff)

# exclude _json.hxx files
fullFileList = list(
    filter(lambda x: not x.endswith('_json.hxx'), fullFileList))

# exclude _fmt.hxx files
fullFileList = list(
    filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))


# generate a list of regexps from the type list (for handling wildcards)
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))


def is_included_type(name, with_durability=False):
    # TODO(brett19): This should be generalized somehow...
    if "is_compound_operation" in name:
        return False
    if "replica_context" in name:
        return False
    if with_durability is True and '_with_legacy_durability' not in name:
        return False
    for x in typeListRe:
        if re.fullmatch(x, name):
            return True
    return False


opTypes = []
opEnums = []


def parse_type(type):
    typeStr = type.get_canonical().spelling
    return parse_type_str(typeStr)


std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]


def parse_type_str(typeStr):
    if typeStr == "std::mutex":
        return {"name": "std::mutex"}
    if typeStr == "std::string":
        return {"name": "std::string"}
    if typeStr == "std::chrono::duration<long long>":
        return {"name": "std::chrono::seconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
        return {"name": "std::chrono::milliseconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
        return {"name": "std::chrono::microseconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
        return {"name": "std::chrono::nanoseconds"}
    if typeStr == "std::error_code":
        return {"name": "std::error_code"}
    if typeStr == "std::monostate":
        return {"name": "std::monostate"}
    if typeStr == "std::byte":
        return {"name": "std::byte"}
    if typeStr == "unsigned long":
        return {"name": "std::size_t"}
    if typeStr == "char":
        return {"name": "std::int8_t"}
    if typeStr == "unsigned char":
        return {"name": "std::uint8_t"}
    if typeStr == "short":
        return {"name": "std::int16_t"}
    if typeStr == "unsigned short":
        return {"name": "std::uint16_t"}
    if typeStr == "int":
        return {"name": "std::int32_t"}
    if typeStr == "unsigned int":
        return {"name": "std::uint32_t"}
    if typeStr == "long long":
        return {"name": "std::int64_t"}
    if typeStr == "unsigned long long":
        return {"name": "std::uint64_t"}
    if typeStr == "bool":
        return {"name": "std::bool"}
    if typeStr == "float":
        return {"name": "std::float"}
    if typeStr == "double":
        return {"name": "std::double"}
    if typeStr == "std::nullptr_t":
        return {"name": "std::nullptr_t"}
    if typeStr in std_comparators:
        return {"name": typeStr}

    tplParts = typeStr.split("<", 1)
    if len(tplParts) > 1:
        tplClassName = tplParts[0]
        tplParams = tplParts[1][:-1]
        if tplClassName == "std::function":
            return {
                "name": "std::function"
            }
        if tplClassName == "std::optional":
            return {
                "name": "std::optional",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::vector":
            return {
                "name": "std::vector",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::set":
            return {
                "name": "std::set",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::variant":
            variantParts = tplParams.split(", ")
            variantTypes = []
            for variantPart in variantParts:
                variantTypes.append(parse_type_str(variantPart))
            return {
                "name": "std::variant",
                "of": variantTypes
            }
        if tplClassName == "std::array":
            variantParts = tplParams.split(", ")
            if len(variantParts) != 2:
                print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
                return {"name": "unknown", "str": typeStr}
            return {
                "name": "std::array",
                "of": parse_type_str(variantParts[0]),
                "size": int(variantParts[1])
            }
        if tplClassName == "std::map":
            variantParts = tplParams.split(", ")
            if len(variantParts) < 2 or len(variantParts) > 3:
                print("FAILED TO PARSE MAP TYPES: " + typeStr)
                return {"name": "unknown", "str": typeStr}
            if len(variantParts) == 2:
                return {
                    "name": "std::map",
                    "of": parse_type_str(variantParts[0]),
                    "to": parse_type_str(variantParts[1])
                }
            else:
                return {
                    "name": "std::map",
                    "of": parse_type_str(variantParts[0]),
                    "to": parse_type_str(variantParts[1]),
                    "comparator": parse_type_str(variantParts[2])
                }
        if tplClassName == "std::shared_ptr":
            return {
                "name": "std::shared_ptr",
                "of": parse_type_str(tplParams)
            }

    #return {"name": "unknown", "str": typeStr}
    if 'unnamed struct' in typeStr:
        print("WARNING: Found unnamed struct: " + typeStr)
    return {"name": typeStr}


internal_structs = []
UNNAMED_STRUCT_DELIM = '::(unnamed struct'


def traverse(node, namespace, main_file):
    # only scan the elements of the file we parsed
    #print("FILE", node.location.file )

    if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
        fullStructName = "::".join([*namespace, node.displayname])
        print("REFL_TYPE(" + fullStructName + ")")
        structFields = []
        for child in node.get_children():
            if child.kind == clang.cindex.CursorKind.FIELD_DECL:
                struct_type = parse_type(child.type)
                type_str = child.type.get_canonical().spelling
                print(" REFL_FIELD(" + child.displayname + ")")
                if 'unnamed' in type_str:
                    name_tokens = type_str.split('::')
                    name_override = '::'.join(name_tokens[:-1] + [child.displayname])
                    struct_type['name'] = name_override
                    internal_structs.append(name_override)
                structFields.append({
                    "name": child.displayname,
                    "type": struct_type,
                })
        # replica read changes introduced duplicate get requests
        if any(map(lambda op: op['name'] == fullStructName, opTypes)):
            return
        opTypes.append({
            "name": fullStructName,
            "fields": structFields,
        })
        print("REFL_END")

    if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
        fullStructName = "::".join([*namespace, node.displayname])
        if is_included_type(fullStructName, with_durability=True):
            type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
            if type_ref:
                base_request_name = type_ref.displayname.replace('struct', '').strip()
                base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
                if base_request:
                    new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
                    new_fields.extend([
                        {"name":"persist_to", "type":{"name":"couchbase::persist_to"}},
                        {"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}}
                    ])
                    opTypes.append({
                        "name": fullStructName,
                        "fields": new_fields
                    })

    if node.kind == clang.cindex.CursorKind.ENUM_DECL:
        fullEnumName = "::".join([*namespace, node.displayname])
        if is_included_type(fullEnumName):
            enumValues = []
            for child in node.get_children():
                if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
                    enumValues.append({
                        "name": child.displayname,
                        "value": child.enum_value,
                    })
            opEnums.append({
                "name": fullEnumName,
                "type": parse_type(node.enum_type),
                "values": enumValues,
            })

    if node.kind == clang.cindex.CursorKind.NAMESPACE:
        namespace = [*namespace, node.displayname]
    if node.kind == clang.cindex.CursorKind.CLASS_DECL:
        namespace = [*namespace, node.displayname]
    if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
        namespace = [*namespace, node.displayname]

    for child in node.get_children():
        traverse(child, namespace, main_file)


for headerPath in fullFileList:
    print("processing " + headerPath)
    index = clang.cindex.Index.create()
    args = [
        '-std=c++17',
    ]
    try:
        translation_unit = index.parse(headerPath, args=args)
    except Exception as e:
        print(e)
        import pdb
        pdb.set_trace()
        raise e

    # output clang compiler diagnostics information (for debugging)
    for diagnostic in translation_unit.diagnostics:
        diagnosticMsg = diagnostic.format()
        print(diagnostic)

    traverse(translation_unit.cursor, [], headerPath)

jsonData = json.dumps({
    'op_structs': opTypes,
    'op_enums': opEnums
})

f = open("bindings.json", "w")
f.write(jsonData)
f.close()
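
The macro block the generator prints for each struct is the manual registration syntax of refl-cpp. As a hypothetical illustration of what that output looks like and how it gets consumed, here is an invented stand-in struct (my_params is not a real llama.cpp type; the real targets are the structs found in ggml.cpp and llama.cpp):

#include <iostream>
#include "refl.hpp"   // refl-cpp single header

// Invented example struct, standing in for a reflected llama.cpp type.
struct my_params {
    int   n_ctx;
    float temp;
};

// Shape of the block binding.py prints for each struct it visits:
REFL_TYPE(my_params)
    REFL_FIELD(n_ctx)
    REFL_FIELD(temp)
REFL_END

int main() {
    my_params p{512, 0.8f};
    // Once registered, refl-cpp can enumerate the members at compile time.
    refl::util::for_each(refl::reflect<my_params>().members, [&](auto member) {
        std::cout << member.name.c_str() << " = " << member(p) << "\n";
    });
    return 0;
}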


@ -144,7 +144,7 @@ namespace grammar_parser {
while (*pos != '"') { while (*pos != '"') {
auto char_pair = parse_char(pos); auto char_pair = parse_char(pos);
pos = char_pair.second; pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_CHAR, char_pair.first));
} }
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s) } else if (*pos == '[') { // char range(s)
@ -162,11 +162,11 @@ namespace grammar_parser {
? LLAMA_GRETYPE_CHAR_ALT ? LLAMA_GRETYPE_CHAR_ALT
: start_type; : start_type;
out_elements.push_back({type, char_pair.first}); out_elements.push_back(llama_grammar_element(type, char_pair.first));
if (pos[0] == '-' && pos[1] != ']') { if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1); auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second; pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first));
} }
} }
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
@ -175,7 +175,7 @@ namespace grammar_parser {
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested); pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size(); last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_RULE_REF, ref_rule_id));
} else if (*pos == '(') { // grouping } else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule // parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true); pos = parse_space(pos + 1, true);
@ -183,7 +183,7 @@ namespace grammar_parser {
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size(); last_sym_start = out_elements.size();
// output reference to synthesized rule // output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_RULE_REF, sub_rule_id));
if (*pos != ')') { if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos); throw std::runtime_error(std::string("expecting ')' at ") + pos);
} }
@ -219,7 +219,8 @@ namespace grammar_parser {
// in original rule, replace previous symbol with reference to generated rule // in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start); out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); llama_grammar_element a(LLAMA_GRETYPE_RULE_REF, sub_rule_id);
out_elements.push_back(a);
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else { } else {
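
The constructor-style calls above, such as llama_grammar_element(LLAMA_GRETYPE_CHAR, char_pair.first), only compile if the struct gained a matching two-argument constructor. A minimal sketch of the assumed change, using the two fields llama_grammar_element has in llama.h; the constructor itself is an assumption, not part of the hunks shown here:

#include <cstdint>

enum llama_gretype {
    LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT, LLAMA_GRETYPE_RULE_REF,
    LLAMA_GRETYPE_CHAR, LLAMA_GRETYPE_CHAR_NOT,
    LLAMA_GRETYPE_CHAR_RNG_UPPER, LLAMA_GRETYPE_CHAR_ALT
};

struct llama_grammar_element {
    llama_gretype type;   // element kind
    uint32_t      value;  // Unicode code point or rule id

    llama_grammar_element() : type(LLAMA_GRETYPE_END), value(0) {}
    llama_grammar_element(llama_gretype t, uint32_t v) : type(t), value(v) {}
};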


@ -181,7 +181,7 @@ llama_token llama_sampling_sample(
cur.clear(); cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); cur.emplace_back(llama_token_data(token_id, logits[token_id], 0.0f));
} }
llama_token_data_array cur_p = { cur.data(), cur.size(), false }; llama_token_data_array cur_p = { cur.data(), cur.size(), false };


@ -1527,11 +1527,14 @@ int main(int argc, char ** argv) {
std::vector<uint8_t> work_buffer; std::vector<uint8_t> work_buffer;
for (int ex=0; ex<n_examples; ++ex) { for (int ex=0; ex<n_examples; ++ex) {
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size =*/ compute_size, //.mem_size =
/*.mem_buffer =*/ compute_addr, compute_size,
/*.no_alloc =*/ false, //.mem_buffer =
}; compute_addr,
//.no_alloc =
false
);
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
@ -1602,11 +1605,14 @@ int main(int argc, char ** argv) {
} }
printf("---\n"); printf("---\n");
for (int i=0; i<n_gen; ++i) { for (int i=0; i<n_gen; ++i) {
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size =*/ compute_size, //.mem_size =
/*.mem_buffer =*/ compute_addr, compute_size,
/*.no_alloc =*/ false, //.mem_buffer =
}; compute_addr,
//.no_alloc =
false
);
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph gf = {}; ggml_cgraph gf = {};
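
The rewritten ggml_init_params initializations above (and the similar hunks later in this diff) replace brace initialization with a three-argument constructor call. A minimal sketch of the constructor those call sites assume, using the field names ggml_init_params has in ggml.h; the constructor is an assumption, since its declaration is not shown in this diff:

#include <cstddef>

struct ggml_init_params {
    size_t mem_size;    // bytes reserved for the context
    void * mem_buffer;  // NULL => let ggml allocate the buffer itself
    bool   no_alloc;    // true => do not allocate memory for tensor data

    ggml_init_params(size_t size, void * buffer, bool noalloc)
        : mem_size(size), mem_buffer(buffer), no_alloc(noalloc) {}
};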


@ -121,16 +121,18 @@ int main(int argc, char ** argv) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = { llama_batch batch_view(
n_tokens, /* .n_tokens= */ n_tokens,
batch.token + i, /* .token= */ batch.token + i,
nullptr, /* .embd= */ nullptr,
batch.pos + i, /* .pos= */ batch.pos + i,
batch.n_seq_id + i, /* .n_seq_id= */ batch.n_seq_id + i,
batch.seq_id + i, /* .seq_id= */ batch.seq_id + i,
batch.logits + i, /* .logits= */ batch.logits + i,
0, 0, 0, // unused /* .all_pos_0= */0,
}; /* .all_pos_1= */0,
/* .all_seq_id= */0 // unused
);
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
if (ret != 0) { if (ret != 0) {
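
Likewise, the batch_view construction above passes ten positional arguments, which presumes a llama_batch constructor in that exact field order. A hedged sketch under that assumption (the typedefs and constructor below are stand-ins, not the real llama.h declarations):

#include <cstdint>

typedef int32_t llama_pos;
typedef int32_t llama_token;
typedef int32_t llama_seq_id;

struct llama_batch {
    int32_t         n_tokens;
    llama_token  *  token;
    float        *  embd;
    llama_pos    *  pos;
    int32_t      *  n_seq_id;
    llama_seq_id ** seq_id;
    int8_t       *  logits;
    llama_pos       all_pos_0;
    llama_pos       all_pos_1;
    llama_seq_id    all_seq_id;

    // Parameter order mirrors the ten arguments used for batch_view above.
    llama_batch(int32_t n_tokens_, llama_token * token_, float * embd_,
                llama_pos * pos_, int32_t * n_seq_id_, llama_seq_id ** seq_id_,
                int8_t * logits_, llama_pos all_pos_0_, llama_pos all_pos_1_,
                llama_seq_id all_seq_id_)
        : n_tokens(n_tokens_), token(token_), embd(embd_), pos(pos_),
          n_seq_id(n_seq_id_), seq_id(seq_id_), logits(logits_),
          all_pos_0(all_pos_0_), all_pos_1(all_pos_1_), all_seq_id(all_seq_id_) {}
};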


@ -169,10 +169,13 @@ int main(int argc, char ** argv) {
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); candidates.emplace_back(llama_token_data(
token_id,
logits[token_id],
0.0f ));
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p (candidates.data(), candidates.size(), false );
const int top_k = 40; const int top_k = 40;
const float top_p = 0.9f; const float top_p = 0.9f;


@ -140,11 +140,14 @@ int main(int argc, char ** argv) {
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size =*/ ctx_size, //.mem_size =
/*.mem_buffer =*/ NULL, ctx_size,
/* no_alloc =*/ 0 //.mem_buffer =
}; NULL,
//.no_alloc =
0
);
ctx = ggml_init(params); ctx = ggml_init(params);
if (!ctx) { if (!ctx) {


@ -553,10 +553,12 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params(
/*.no_alloc = */ false, //.no_alloc =
/*.ctx = */ &ctx_data, false,
}; //.ctx =
&ctx_data
);
struct gguf_context * ctx = gguf_init_from_file(filename, params); struct gguf_context * ctx = gguf_init_from_file(filename, params);
GGML_ASSERT(ctx != NULL); GGML_ASSERT(ctx != NULL);


@ -389,9 +389,11 @@ static void export_lora(struct export_lora_params * params) {
// open base model gguf, read tensors without their data // open base model gguf, read tensors without their data
struct ggml_context * ctx_in; struct ggml_context * ctx_in;
struct gguf_init_params params_gguf; struct gguf_init_params params_gguf(
params_gguf.no_alloc = true; //params_gguf.no_alloc =
params_gguf.ctx = &ctx_in; true,
//params_gguf.ctx =
&ctx_in);
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf); struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
// create new gguf // create new gguf


@ -294,10 +294,12 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
// get parameters directly from gguf file // get parameters directly from gguf file
{ {
struct gguf_init_params params = { struct gguf_init_params params(
/*.no_alloc = */ false, //.no_alloc =
/*.ctx = */ NULL, false,
}; //.ctx =
NULL
);
struct gguf_context * mctx = gguf_init_from_file(fn_model, params); struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
load_model_hparams_gguf(mctx, &hparams, "llama"); load_model_hparams_gguf(mctx, &hparams, "llama");
@ -598,7 +600,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const bool enable_flash_attn, const bool enable_flash_attn,
const bool enable_checkpointing) { const bool enable_checkpointing) {
ggml_set_scratch(ctx, { 0, 0, nullptr, }); //FIXME
assert(0);
//ggml_set_scratch(ctx, { 0, 0, nullptr, });
const int n_past = 0; const int n_past = 0;
const int N = n_tokens; const int N = n_tokens;
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
@ -989,9 +993,11 @@ static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llam
static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
struct ggml_context * f_ggml_ctx; struct ggml_context * f_ggml_ctx;
struct gguf_init_params params; struct gguf_init_params params(
params.no_alloc = false; //params.no_alloc =
params.ctx = &f_ggml_ctx; false,
//params.ctx =
&f_ggml_ctx);
struct gguf_context * fctx = gguf_init_from_file(filename, params); struct gguf_context * fctx = gguf_init_from_file(filename, params);
if (fctx == NULL) { if (fctx == NULL) {
return false; return false;
@ -1706,11 +1712,14 @@ int main(int argc, char ** argv) {
std::vector<uint8_t> mem_compute_data; std::vector<uint8_t> mem_compute_data;
// context for input tensors without their data // context for input tensors without their data
struct ggml_init_params ctx_input_params = { struct ggml_init_params ctx_input_params(
ggml_tensor_overhead() * 2, // mem_size //.mem_size=
NULL, // mem_buffer ggml_tensor_overhead() * 2, // mem_size
true, // no_alloc //.mem_buffer=
}; NULL, // mem_buffer
//.no_alloc=
true // no_alloc
);
struct ggml_context * ctx_input = ggml_init(ctx_input_params); struct ggml_context * ctx_input = ggml_init(ctx_input_params);
// the input tensors // the input tensors
@ -1735,11 +1744,14 @@ int main(int argc, char ** argv) {
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
); );
struct ggml_init_params ctx_compute_params = { struct ggml_init_params ctx_compute_params(
estimated_compute_size_wo_data, // mem_size //.mem_size=
NULL, // mem_buffer estimated_compute_size_wo_data, // mem_size
true, // no_alloc //.mem_buffer=
}; NULL, // mem_buffer
//.no_alloc=
true // no_alloc
);
struct ggml_context * ctx_compute = NULL; struct ggml_context * ctx_compute = NULL;
struct ggml_tensor * loss = NULL; struct ggml_tensor * loss = NULL;
@ -1902,11 +1914,14 @@ int main(int argc, char ** argv) {
printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
// context for work buffer // context for work buffer
struct ggml_init_params ctx_work_params = { struct ggml_init_params ctx_work_params(
max_work_size, // mem_size //.mem_size=
NULL, // mem_buffer max_work_size, // mem_size
false, // no_alloc //.mem_buffer =
}; NULL, // mem_buffer
//.no_alloc =
false // no_alloc
);
struct ggml_context * ctx_work = ggml_init(ctx_work_params); struct ggml_context * ctx_work = ggml_init(ctx_work_params);
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();


@ -40,11 +40,14 @@ static bool gguf_ex_write(const std::string & fname) {
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3); gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3); gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size =*/ 128ull*1024ull*1024ull, //.mem_size =
/*.mem_buffer =*/ NULL, 128ull*1024ull*1024ull,
/*.no_alloc =*/ false, //.mem_buffer =
}; NULL,
//.no_alloc =
false
);
struct ggml_context * ctx_data = ggml_init(params); struct ggml_context * ctx_data = ggml_init(params);
@ -86,10 +89,12 @@ static bool gguf_ex_write(const std::string & fname) {
// just read tensor info // just read tensor info
static bool gguf_ex_read_0(const std::string & fname) { static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = { struct gguf_init_params params (
/*.no_alloc = */ false, //.no_alloc =
/*.ctx = */ NULL, false,
}; //.ctx =
NULL
);
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@ -146,10 +151,12 @@ static bool gguf_ex_read_0(const std::string & fname) {
static bool gguf_ex_read_1(const std::string & fname) { static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params (
/*.no_alloc = */ false, //.no_alloc =
/*.ctx = */ &ctx_data, false,
}; //.ctx =
&ctx_data
);
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);


@ -255,11 +255,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
const auto & buf_compute = ctx->buf_compute; const auto & buf_compute = ctx->buf_compute;
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size =*/ buf_compute.size, //.mem_size =
/*.mem_buffer =*/ buf_compute.data, buf_compute.size,
/*.no_alloc =*/ false, //.mem_buffer =
}; buf_compute.data,
//.no_alloc =
false
);
params.no_alloc = true; params.no_alloc = true;
@ -455,10 +458,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
struct ggml_context * meta = NULL; struct ggml_context * meta = NULL;
struct gguf_init_params params = { struct gguf_init_params params(
/*.no_alloc = */ true, //.no_alloc =
/*.ctx = */ &meta, true,
}; //.ctx =
&meta);
struct gguf_context * ctx = gguf_init_from_file(fname, params); struct gguf_context * ctx = gguf_init_from_file(fname, params);
if (!ctx) { if (!ctx) {
@ -552,11 +557,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// load tensors // load tensors
{ {
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size =*/ ctx_size, //.mem_size =
/*.mem_buffer =*/ NULL, ctx_size,
/*.no_alloc =*/ false, //.mem_buffer =
}; NULL,
//.no_alloc =
false
);
new_clip->ctx = ggml_init(params); new_clip->ctx = ggml_init(params);
if (!new_clip->ctx) { if (!new_clip->ctx) {


@ -75,7 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
if (n_eval > n_batch) { if (n_eval > n_batch) {
n_eval = n_batch; n_eval = n_batch;
} }
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; llama_batch batch(
/* .n_tokens= */int32_t(n_eval),
/* .token= */nullptr,
/* .embd= */(image_embed->embed+i*n_embd),
/* .pos= */nullptr,
/* .n_seq_id= */nullptr,
/* .seq_id= */nullptr,
/* .logits= */nullptr,
/* .all_pos_0= */*n_past,
/* .all_pos_1= */1,
/* .all_seq_id= */0
);
if (llama_decode(ctx_llama, batch)) { if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return false; return false;


@ -31,6 +31,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#include "print.hpp"
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static gpt_params * g_params; static gpt_params * g_params;
@ -99,6 +101,7 @@ static void sigint_handler(int signo) {
} }
} }
#endif #endif
using namespace refl;
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level; (void) level;
@ -110,6 +113,9 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
g_params = &params; g_params = &params;
//using Td = type_descriptor<gpt_params>;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
@ -124,7 +130,8 @@ int main(int argc, char ** argv) {
// TODO: Dump params ? // TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
print_fields(params);
// save choice to use color for later // save choice to use color for later
// (note for later: this is a slightly awkward choice) // (note for later: this is a slightly awkward choice)
console::init(params.simple_io, params.use_color); console::init(params.simple_io, params.use_color);
@ -181,7 +188,7 @@ int main(int argc, char ** argv) {
llama_context * ctx_guidance = NULL; llama_context * ctx_guidance = NULL;
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
@ -241,6 +248,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
print_fields(*model);
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n"); LOG("tokenize the prompt\n");
if (params.chatml) { if (params.chatml) {
@ -284,7 +293,8 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
} }
print_fields(*ctx);
//print_fields(session_tokens);
// debug message about similarity of saved session, if applicable // debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0; size_t n_matching_session_tokens = 0;
if (!session_tokens.empty()) { if (!session_tokens.empty()) {
@ -372,6 +382,10 @@ int main(int argc, char ** argv) {
for (int i = 0; i < (int) guidance_inp.size(); i++) { for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
} }
print_fields(*ctx_guidance);
} }
if (params.n_keep > 0) { if (params.n_keep > 0) {
@ -481,7 +495,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
print_fields(*ctx_sampling);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
if (!embd.empty()) { if (!embd.empty()) {
@ -490,6 +505,7 @@ int main(int argc, char ** argv) {
int max_embd_size = n_ctx - 4; int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary. // Ensure the input doesn't exceed the context size by truncating embd if necessary.
//print_fields(embd);
if ((int) embd.size() > max_embd_size) { if ((int) embd.size() > max_embd_size) {
const int skipped_tokens = (int) embd.size() - max_embd_size; const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size); embd.resize(max_embd_size);
@ -516,6 +532,7 @@ int main(int argc, char ** argv) {
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard); n_past, n_left, n_ctx, params.n_keep, n_discard);
print_fields(*ctx);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@ -632,7 +649,7 @@ int main(int argc, char ** argv) {
} }
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
print_fields(id);
llama_sampling_accept(ctx_sampling, ctx, id, true); llama_sampling_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@ -667,7 +684,7 @@ int main(int argc, char ** argv) {
if (input_echo) { if (input_echo) {
for (auto id : embd) { for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str()); printf("TOKEN:%s\n", token_str.c_str());
if (embd.size() > 1) { if (embd.size() > 1) {
input_tokens.push_back(id); input_tokens.push_back(id);
@ -858,6 +875,9 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx); llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
// dont dump core
//int *ptr = 0; *ptr = 1;
if (ctx_guidance) { llama_free(ctx_guidance); } if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
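
The print.hpp header included above is not itself shown in this diff. A minimal sketch of what a print_fields helper could look like on top of refl-cpp, assuming the structs passed to it (gpt_params, llama_model, llama_context, and so on) have been registered with REFL_TYPE/REFL_FIELD and that each reflected member is itself printable with operator<<:

#include <iostream>
#include "refl.hpp"  // refl-cpp single header

template <typename T>
void print_fields(const T & obj) {
    constexpr auto type = refl::reflect<T>();
    std::cout << type.name.c_str() << " {\n";
    refl::util::for_each(type.members, [&](auto member) {
        // member.name is a compile-time string; member(obj) reads the field.
        std::cout << "    " << member.name.c_str()
                  << " = " << member(obj) << "\n";
    });
    std::cout << "}\n";
}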


@ -67,9 +67,12 @@ int main(int argc, char ** argv) {
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); candidates.emplace_back(llama_token_data(
token_id,
logits[token_id],
0.0f));
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p(candidates.data(), candidates.size(), false );
auto next_token = llama_sample_token(ctx, &candidates_p); auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx, next_token); auto next_token_str = llama_token_to_piece(ctx, next_token);


@ -33,8 +33,16 @@
using json = nlohmann::json; using json = nlohmann::json;
struct server_params struct server_params : refl::attr::usage::type
{ {
server_params():
hostname( "127.0.0.1"),
public_path(public_path),
port(port),
read_timeout(read_timeout),
write_timeout( 600) {};
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public"; std::string public_path = "examples/server/public";
int32_t port = 8080; int32_t port = 8080;
@ -543,6 +551,28 @@ struct llama_server_context
std::vector<task_multi> queue_multitasks; std::vector<task_multi> queue_multitasks;
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
std::mutex mutex_results; std::mutex mutex_results;
llama_server_context():
model(nullptr),
ctx(nullptr),
clp_ctx(nullptr),
params(params),
batch(batch),
multimodal(false),
clean_kv_cache( true),
all_slots_are_idle( false),
add_bos_token( true),
//int32_t id_gen;
//int32_t n_ctx; // total context for all clients / slots
system_need_update(false){}
//std::string system_prompt;
//std::vector<llama_token> system_tokens;
//std::string name_user; // this should be the antiprompt
//std::string name_assistant;
//std::vector<llama_client_slot> slots;
//std::vector<task_server> queue_tasks;
//std::vector<task_result> queue_results;
//std::mutex mutex_tasks;
//std::mutex mutex_results;
~llama_server_context() ~llama_server_context()
{ {
@ -1402,7 +1432,7 @@ struct llama_server_context
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{ {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = { llama_batch batch_view(
n_tokens, n_tokens,
batch.token + i, batch.token + i,
nullptr, nullptr,
@ -1410,8 +1440,8 @@ struct llama_server_context
batch.n_seq_id + i, batch.n_seq_id + i,
batch.seq_id + i, batch.seq_id + i,
batch.logits + i, batch.logits + i,
0, 0, 0, // unused 0, 0, 0 // unused
}; );
if (llama_decode(ctx, batch_view)) if (llama_decode(ctx, batch_view))
{ {
LOG_TEE("%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
@ -1818,17 +1848,18 @@ struct llama_server_context
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{ {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = llama_batch batch_view(
{ /* .n_tokens= */n_tokens,
n_tokens, /* .token= */batch.token + i,
batch.token + i, /* .embd= */nullptr,
nullptr, /* .pos= */batch.pos + i,
batch.pos + i, /* .n_seq_id= */batch.n_seq_id + i,
batch.n_seq_id + i, /* .seq_id= */batch.seq_id + i,
batch.seq_id + i, /* .logits= */batch.logits + i,
batch.logits + i, /* .all_pos_0= */.0,
0, 0, 0, // unused /* .all_pos_1= */0,
}; /* .all_seq_id= */0 // unused
);
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
if (ret != 0) if (ret != 0)
@ -1875,7 +1906,10 @@ struct llama_server_context
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
} }
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; llama_token_data_array cur_p(
slot.ctx_sampling->cur.data(),
slot.ctx_sampling->cur.size(),
false );
result.tok = id; result.tok = id;
const int32_t n_probs = slot.sparams.n_probs; const int32_t n_probs = slot.sparams.n_probs;
@ -3067,4 +3101,4 @@ int main(int argc, char **argv)
llama_backend_free(); llama_backend_free();
return 0; return 0;
} }


@ -124,10 +124,15 @@ int main(int argc, char ** argv) {
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); candidates.emplace_back(llama_token_data( token_id,
logits[token_id],
0.0f ));
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p(
candidates.data(),
candidates.size(),
false );
// sample the most likely token // sample the most likely token
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);


@ -311,7 +311,8 @@ static struct ggml_tensor * llama_build_train_graphs(
const bool enable_flash_attn, const bool enable_flash_attn,
const bool enable_checkpointing) { const bool enable_checkpointing) {
ggml_set_scratch(ctx, { 0, 0, nullptr, }); assert(0);
//ggml_set_scratch(ctx, { 0, 0, nullptr, });
const int n_past = 0; const int n_past = 0;
const int N = n_tokens; const int N = n_tokens;
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
@ -599,10 +600,12 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
// set vocab by copying from vocab_model gguf file // set vocab by copying from vocab_model gguf file
{ {
struct gguf_init_params params = { struct gguf_init_params params(
/*.no_alloc = */ false, //.no_alloc =
/*.ctx = */ NULL, false,
}; //.ctx =
NULL
);
struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params); struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params);
const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
@ -744,9 +747,11 @@ static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_voc
static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) { static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
struct ggml_context * f_ggml_ctx; struct ggml_context * f_ggml_ctx;
struct gguf_init_params params; struct gguf_init_params params(
params.no_alloc = false; //params.no_alloc =
params.ctx = &f_ggml_ctx; false,
//params.ctx =
&f_ggml_ctx);
struct gguf_context * fctx = gguf_init_from_file(filename, params); struct gguf_context * fctx = gguf_init_from_file(filename, params);
if (fctx == NULL) { if (fctx == NULL) {
return false; return false;
@ -1084,11 +1089,14 @@ int main(int argc, char ** argv) {
ggml_allocr * alloc = NULL; ggml_allocr * alloc = NULL;
// context for input tensors without their data // context for input tensors without their data
struct ggml_init_params ctx_input_params = { struct ggml_init_params ctx_input_params (
ggml_tensor_overhead() * 2, // mem_size //.mem_size =
NULL, // mem_buffer ggml_tensor_overhead() * 2, // mem_size
true, // no_alloc // .mem_buffer =
}; NULL, // mem_buffer
// .no_alloc =
true // no_alloc
);
struct ggml_context * ctx_input = ggml_init(ctx_input_params); struct ggml_context * ctx_input = ggml_init(ctx_input_params);
// the input tensors // the input tensors
@ -1113,11 +1121,14 @@ int main(int argc, char ** argv) {
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
); );
struct ggml_init_params ctx_compute_params = { struct ggml_init_params ctx_compute_params(
estimated_compute_size_wo_data, // mem_size // .mem_size =
NULL, // mem_buffer estimated_compute_size_wo_data, // mem_size
true, // no_alloc //.mem_buffer=
}; NULL, // mem_buffer
//.no_alloc =
true // no_alloc
);
struct ggml_context * ctx_compute = NULL; struct ggml_context * ctx_compute = NULL;
struct ggml_tensor * loss = NULL; struct ggml_tensor * loss = NULL;
@ -1266,11 +1277,14 @@ int main(int argc, char ** argv) {
printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
// context for work buffer // context for work buffer
struct ggml_init_params ctx_work_params = { struct ggml_init_params ctx_work_params(
max_work_size, // mem_size //.mem_size=
NULL, // mem_buffer max_work_size, //
false, // no_alloc //.mem_buffer=
}; NULL, //
//.no_alloc=
false //
);
struct ggml_context * ctx_work = ggml_init(ctx_work_params); struct ggml_context * ctx_work = ggml_init(ctx_work_params);
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
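Note: every hunk in this file follows the same pattern — the C99 designated-initializer form of struct ggml_init_params is replaced by the three-argument constructor this commit adds to ggml.h. A minimal sketch of the new call-site shape (hypothetical helper, not part of the diff):

    #include "ggml.h"

    // Same shape as ctx_work_params above: the arguments map to
    // mem_size, mem_buffer and no_alloc, in declaration order.
    static struct ggml_context * make_work_ctx(size_t work_size) {
        struct ggml_init_params params(
            /* mem_size   */ work_size,
            /* mem_buffer */ NULL,    // let ggml allocate the pool itself
            /* no_alloc   */ false);  // tensor data lives in the pool
        return ggml_init(params);
    }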


@ -8,9 +8,9 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "ggml-internal.hpp"
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256
//#define GGML_ALLOCATOR_DEBUG //#define GGML_ALLOCATOR_DEBUG
@ -24,28 +24,7 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
return offset + align; return offset + align;
} }
struct free_block {
void * addr;
size_t size;
};
struct ggml_tallocr {
struct ggml_backend_buffer * buffer;
bool buffer_owned;
void * base;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
bool measure;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
@ -333,33 +312,20 @@ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
// graph allocator // graph allocator
struct hash_node {
int n_children;
int n_views;
};
struct ggml_gallocr {
ggml_tallocr_t talloc;
struct ggml_hash_set hash_set;
struct hash_node * hash_values;
size_t hash_values_size;
ggml_tallocr_t * hash_allocs;
int * parse_seq;
int parse_seq_len;
};
ggml_gallocr_t ggml_gallocr_new(void) { ggml_gallocr_t ggml_gallocr_new(void) {
ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
ggml_hash_set hs = {.size=0, .keys=NULL};
*galloc = (struct ggml_gallocr) { *galloc = (struct ggml_gallocr) {
/*.talloc = */ NULL, .talloc = NULL,
/*.hash_set = */ {0}, .hash_set =hs,
/*.hash_values = */ NULL, .hash_values = NULL,
/*.hash_values_size = */ 0, .hash_values_size = 0,
/*.hash_allocs = */ NULL, .hash_allocs = NULL,
/*.parse_seq = */ NULL, .parse_seq = NULL,
/*.parse_seq_len = */ 0, .parse_seq_len = 0,
}; };
//((*galloc).hash_set)[0] = 0;
return galloc; return galloc;
} }
@ -698,16 +664,12 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
// legacy API wrapper // legacy API wrapper
struct ggml_allocr {
ggml_tallocr_t talloc;
ggml_gallocr_t galloc;
};
static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
*alloc = (struct ggml_allocr) { *alloc = (struct ggml_allocr) {
/*.talloc = */ talloc, .talloc = talloc,
/*.galloc = */ ggml_gallocr_new(), .galloc = ggml_gallocr_new(),
}; };
return alloc; return alloc;
} }


@ -25,10 +25,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
GGML_ASSERT(iface.get_base != NULL); GGML_ASSERT(iface.get_base != NULL);
(*buffer) = (struct ggml_backend_buffer) { (*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface, .iface = iface,
/* .backend = */ backend, .backend = backend,
/* .context = */ context, .context = context,
/* .size = */ size, .size = size,
}; };
return buffer; return buffer;
@ -586,11 +586,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
sched->n_splits = 0; sched->n_splits = 0;
struct ggml_init_params params = { struct ggml_init_params params(
/*.mem_size = */ sizeof(sched->context_buffer), //.mem_size =
/*.mem_buffer = */ sched->context_buffer, sizeof(sched->context_buffer),
/*.no_alloc = */ true //.mem_buffer =
}; sched->context_buffer,
//.no_alloc =
true
);
if (sched->ctx != NULL) { if (sched->ctx != NULL) {
ggml_free(sched->ctx); ggml_free(sched->ctx);


@ -7675,12 +7675,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
#endif #endif
// debug helpers // debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); // printf("JSON: { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n",
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); // src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); // ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name,
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); // src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name,
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); // dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); // );
if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch // KQ single-batch


@ -22,7 +22,7 @@ extern "C" {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg) #define static_assert(cond, msg) _Static_assert(cond, msg)
#else #else
#define static_assert(cond, msg) struct global_scope_noop_trick //#define static_assert(cond, msg) struct global_scope_noop_trick
#endif #endif
#endif #endif

ggml-internal.hpp (new file, 258 lines)

@ -0,0 +1,258 @@
struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
ggml_context():
mem_size(0),
mem_buffer(0),
mem_buffer_owned(0),
no_alloc(0),
no_alloc_save(0),
n_objects(0),
objects_begin(0),
objects_end(0),
scratch(),
scratch_save()
{
}
};
struct ggml_context_container {
bool used;
struct ggml_context context;
ggml_context_container(): used(0),context(){
}
};
typedef double ggml_float;
typedef void * thread_ret_t;
#define MAX_FREE_BLOCKS 256
struct free_block {
void * addr;
size_t size;
};
struct ggml_tallocr {
struct ggml_backend_buffer * buffer;
bool buffer_owned;
void * base;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
bool measure;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
struct hash_node {
int n_children;
int n_views;
};
typedef struct ggml_tallocr * ggml_tallocr_t;
typedef struct ggml_gallocr * ggml_gallocr_t;
struct ggml_gallocr {
ggml_tallocr_t talloc;
struct ggml_hash_set hash_set;
struct hash_node * hash_values;
size_t hash_values_size;
ggml_tallocr_t * hash_allocs;
int * parse_seq;
int parse_seq_len;
};
struct ggml_allocr {
ggml_tallocr_t talloc;
ggml_gallocr_t galloc;
};
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
ggml_state():contexts(), numa()
{
}
};
struct gguf_str {
uint64_t n; // GGUFv2
char * data;
};
struct ggml_map_custom1_op_params {
ggml_custom1_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun;
int n_tasks;
void * userdata;
};
struct hash_map {
struct ggml_hash_set set;
struct ggml_tensor ** vals;
};
#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include<atomic>
using namespace std;
#endif
struct ggml_compute_state_shared {
const struct ggml_cgraph * cgraph;
const struct ggml_cplan * cplan;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
const int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
};
typedef pthread_t ggml_thread_t;
struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
struct ggml_compute_state_shared * shared;
};
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct gguf_array_T {
enum gguf_type type;
uint64_t n; // GGUFv2
void * data;
} arr;
};
struct ggml_lbfgs_iteration_data {
float alpha;
float ys;
float * s;
float * y;
};
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
};
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void * data;
size_t size;
};
struct gguf_context {
struct gguf_header header;
struct gguf_kv * kv;
struct gguf_tensor_info * infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
//uint8_t * padding;
void * data;
};
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
#include "ggml-backend-impl.h"

File diff suppressed because it is too large.


@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization // Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
// Dequantization // Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); //void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
// Dot product // Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);

File diff suppressed because it is too large.

ggml.h (57 changed lines)

@ -1,5 +1,6 @@
#pragma once #pragma once
#include<refl-cpp/refl.hpp>
// //
// GGML Tensor Library // GGML Tensor Library
// //
@ -284,7 +285,7 @@
GGML_UNUSED(prefix##3); GGML_UNUSED(prefix##3);
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { //extern "C" {
#endif #endif
#if defined(__ARM_NEON) && defined(__CUDACC__) #if defined(__ARM_NEON) && defined(__CUDACC__)
@ -464,7 +465,7 @@ extern "C" {
}; };
// ggml object // ggml object
struct ggml_object { struct ggml_object : refl::attr::usage::type {
size_t offs; size_t offs;
size_t size; size_t size;
@ -478,7 +479,7 @@ extern "C" {
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// n-dimensional tensor // n-dimensional tensor
struct ggml_tensor { struct ggml_tensor : refl::attr::usage::type{
enum ggml_type type; enum ggml_type type;
enum ggml_backend_type backend; enum ggml_backend_type backend;
@ -523,7 +524,7 @@ extern "C" {
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan : refl::attr::usage::type{
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
@ -540,13 +541,13 @@ extern "C" {
GGML_CGRAPH_EVAL_ORDER_COUNT GGML_CGRAPH_EVAL_ORDER_COUNT
}; };
struct ggml_hash_set { struct ggml_hash_set : refl::attr::usage::type{
size_t size; size_t size;
struct ggml_tensor ** keys; struct ggml_tensor ** keys;
}; };
// computation graph // computation graph
struct ggml_cgraph { struct ggml_cgraph : refl::attr::usage::type{
int size; int size;
int n_nodes; int n_nodes;
int n_leafs; int n_leafs;
@ -566,13 +567,31 @@ extern "C" {
}; };
// scratch buffer // scratch buffer
struct ggml_scratch { struct ggml_scratch : refl::attr::usage::type{
size_t offs; size_t offs;
size_t size; size_t size;
void * data; void * data;
ggml_scratch()
: offs(0),
size(0),
data(0)
{}
}; };
struct ggml_init_params { struct ggml_init_params : refl::attr::usage::type{
ggml_init_params(size_t mem_size,
void * mem_buffer,
bool no_alloc):
mem_size( mem_size),
mem_buffer(mem_buffer),
no_alloc(no_alloc){}
ggml_init_params():
mem_size(0),
mem_buffer(0),
no_alloc(0){}
// memory pool // memory pool
size_t mem_size; // bytes size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally void * mem_buffer; // if NULL, memory will be allocated internally
@ -590,7 +609,7 @@ extern "C" {
GGML_TASK_FINALIZE, GGML_TASK_FINALIZE,
}; };
struct ggml_compute_params { struct ggml_compute_params : refl::attr::usage::type{
enum ggml_task_type type; enum ggml_task_type type;
// ith = thread index, nth = number of threads // ith = thread index, nth = number of threads
@ -1836,7 +1855,7 @@ extern "C" {
// //
// see ggml.c (ggml_opt_default_params) for default values // see ggml.c (ggml_opt_default_params) for default values
// //
struct ggml_opt_params { struct ggml_opt_params : refl::attr::usage::type{
enum ggml_opt_type type; enum ggml_opt_type type;
size_t graph_size; size_t graph_size;
@ -1866,7 +1885,7 @@ extern "C" {
int n_gradient_accumulation; int n_gradient_accumulation;
// ADAM parameters // ADAM parameters
struct { struct ggml_adam: refl::attr::usage::type{
int n_iter; int n_iter;
float sched; // schedule multiplier (fixed, decay or warmup) float sched; // schedule multiplier (fixed, decay or warmup)
@ -1882,7 +1901,7 @@ extern "C" {
} adam; } adam;
// LBFGS parameters // LBFGS parameters
struct { struct ggml_lbfgs: refl::attr::usage::type{
int m; // number of corrections to approximate the inv. Hessian int m; // number of corrections to approximate the inv. Hessian
int n_iter; int n_iter;
int max_linesearch; int max_linesearch;
@ -1897,7 +1916,7 @@ extern "C" {
} lbfgs; } lbfgs;
}; };
struct ggml_opt_context { struct ggml_opt_context : refl::attr::usage::type{
struct ggml_context * ctx; struct ggml_context * ctx;
struct ggml_opt_params params; struct ggml_opt_params params;
@ -1909,7 +1928,7 @@ extern "C" {
float loss_before; float loss_before;
float loss_after; float loss_after;
struct { struct ggml_grad : refl::attr::usage::type{
struct ggml_tensor * g; // current gradient struct ggml_tensor * g; // current gradient
struct ggml_tensor * m; // first moment struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment struct ggml_tensor * v; // second moment
@ -1919,7 +1938,7 @@ extern "C" {
int n_no_improvement; int n_no_improvement;
} adam; } adam;
struct { struct ggml_params : refl::attr::usage::type{
struct ggml_tensor * x; // current parameters struct ggml_tensor * x; // current parameters
struct ggml_tensor * xp; // previous parameters struct ggml_tensor * xp; // previous parameters
struct ggml_tensor * g; // current gradient struct ggml_tensor * g; // current gradient
@ -2012,7 +2031,9 @@ extern "C" {
struct gguf_context; struct gguf_context;
struct gguf_init_params { struct gguf_init_params : refl::attr::usage::type{
gguf_init_params(bool no_alloc, struct ggml_context ** ctx): no_alloc(no_alloc),ctx(ctx){}
bool no_alloc; bool no_alloc;
// if not NULL, create a ggml_context and allocate the tensor data in it // if not NULL, create a ggml_context and allocate the tensor data in it
@ -2149,7 +2170,7 @@ extern "C" {
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct { typedef struct ggml_something : refl::attr::usage::type{
const char * type_name; const char * type_name;
int blck_size; int blck_size;
size_t type_size; size_t type_size;
@ -2164,5 +2185,5 @@ extern "C" {
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus #ifdef __cplusplus
} //}
#endif #endif
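With extern "C" commented out and the structs above inheriting refl::attr::usage::type, ggml.h becomes a C++-only header that refl-cpp can reflect over. A minimal sketch of the registration/iteration mechanism that print.hpp (below) applies to the real ggml and llama types; the struct here is illustrative, not part of ggml:

    #include <refl-cpp/refl.hpp>
    #include <iostream>

    struct demo_scratch : refl::attr::usage::type {  // same base the ggml structs gain above
        size_t offs = 0;
        size_t size = 0;
    };

    REFL_TYPE(demo_scratch)   // same macros print.hpp uses
    REFL_FIELD(offs)
    REFL_FIELD(size)
    REFL_END

    int main() {
        demo_scratch s;
        // iterate the registered fields and print name/value pairs
        refl::util::for_each(refl::reflect<demo_scratch>().members, [&](auto member) {
            std::cout << member.name.c_str() << " = " << member(s) << "\n";
        });
        return 0;
    }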

llama-internal.hpp (new file, 896 lines)

@ -0,0 +1,896 @@
#include <set>
#include <queue>
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GPT2,
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_UNKNOWN,
};
enum llm_kv {
LLM_KV_GENERAL_ARCHITECTURE,
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_URL,
LLM_KV_GENERAL_DESCRIPTION,
LLM_KV_GENERAL_LICENSE,
LLM_KV_GENERAL_SOURCE_URL,
LLM_KV_GENERAL_SOURCE_HF_REPO,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_TOKENIZER_MODEL,
LLM_KV_TOKENIZER_LIST,
LLM_KV_TOKENIZER_TOKEN_TYPE,
LLM_KV_TOKENIZER_SCORES,
LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID,
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
LLM_KV_TOKENIZER_ADD_BOS,
LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
};
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
};
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
GGUF_FILE_VERSION_V3 = 3,
};
struct LLM_KV {
LLM_KV(llm_arch arch) : arch(arch) {}
llm_arch arch;
std::string operator()(llm_kv kv) const; // moved to llama.cpp file
};
enum llm_tensor {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_POS_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
uint32_t n_yarn_orig_ctx;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
bool mul_mat_q;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_norm_b;
struct ggml_tensor * attn_norm_2;
struct ggml_tensor * attn_norm_2_b;
struct ggml_tensor * attn_q_norm;
struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
};
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
};
struct llama_buffer {
void * data = NULL;
size_t size = 0;
// fallback to malloc / free
// useful in cases where CUDA can try to allocate PINNED memory
bool fallback = false;
void resize(size_t n) ;
~llama_buffer();
};
// ring-buffer of cached KV data
struct llama_kv_cache {
bool has_shift = false;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
// computed before each graph build
uint32_t n = 0;
std::vector<llama_kv_cell> cells;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
llama_buffer buf;
~llama_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
}
#endif
}
};
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
GGML_ASSERT(token_left.find(" ") == std::string::npos);
GGML_ASSERT(token_left.find("\n") == std::string::npos);
GGML_ASSERT(token_right.find(" ") == std::string::npos);
GGML_ASSERT(token_right.find("\n") == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
return -1;
}
return it->second;
}
};
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
~llama_mmap();
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
#else
static constexpr bool SUPPORTED = false;
#endif
};
struct llama_hparams {
bool vocab_only;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;
float f_norm_eps;
float f_norm_rms_eps;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
int8_t rope_scaling_type_train : 3;
bool rope_finetuned : 1;
float f_clamp_kqv;
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const;
uint32_t n_gqa() const {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
}
};
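// Worked example of the three helpers above (illustrative 7B-style values, not from this commit):
//   n_embd = 4096, n_head = 32, n_head_kv = 8
//   n_gqa()       = 32 / 8    = 4     query heads per KV head
//   n_embd_head() = 4096 / 32 = 128   dimensions per attention head
//   n_embd_gqa()  = 4096 / 4  = 1024  width of the K and V projections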
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() ;
llama_mlock(const llama_mlock &) = delete;
~llama_mlock();
void init(void * ptr);
void grow_to(size_t target_size);
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const ;
#undef MLOCK_SUGGESTION
static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
bool raw_lock(void * ptr, size_t len) const ;
static void raw_unlock(void * ptr, size_t len);
#else
static constexpr bool SUPPORTED = false;
static size_t lock_granularity();
bool raw_lock(const void * addr, size_t len) const;
static void raw_unlock(const void * addr, size_t len);
#endif
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embd;
struct ggml_tensor * pos_embd;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
std::vector<llama_layer> layers;
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// context
struct ggml_context * ctx = NULL;
// the model memory buffer
llama_buffer buf;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
// objects representing data potentially being locked in memory
llama_mlock mlock_buf;
llama_mlock mlock_mmap;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
~llama_model() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
ggml_cuda_free_scratch();
}
#endif
#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
#endif
}
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
~llama_context();
llama_cparams cparams;
const llama_model & model;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
std::mt19937 rng;
bool has_evaluated_once = false;
int64_t t_start_us;
int64_t t_load_us;
int64_t t_sample_us = 0;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int32_t n_sample = 0; // number of tokens sampled
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// reusable buffer for `struct ggml_graph_plan.work_data`
std::vector<uint8_t> work_buffer;
// memory buffers used to evaluate the model
llama_buffer buf_compute;
llama_buffer buf_alloc;
ggml_allocr * alloc = NULL;
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
#endif
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
};
struct LLM_TN {
LLM_TN(llm_arch arch) ;
llm_arch arch;
std::string operator()(llm_tensor tensor) const;
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
std::string operator()(llm_tensor tensor, int bid) const ;
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
};
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) ;
size_t tell() const;
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const ;
void write_u32(std::uint32_t val) const;
~llama_file();
};
struct llama_state {
llama_state();
// We save the log callback globally
ggml_log_callback log_callback;
void * log_callback_user_data = nullptr;
};
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
int n_created = 0;
int64_t n_elements = 0;
size_t n_bytes = 0;
bool use_mmap = false;
llama_file file;
llama_ftype ftype;
llama_fver fver;
std::unique_ptr<llama_mmap> mapping;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;
llama_model_loader(const std::string & fname, bool use_mmap) ;
~llama_model_loader();
std::string get_arch_name() const;
enum llm_arch get_arch() const ;
const char * get_tensor_name(int i) const;
struct ggml_tensor * get_tensor_meta(int i) const;
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ;
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) ;
void done_getting_tensors() const;
size_t file_offset(const char * name) const;
void load_data_for(struct ggml_tensor * cur) const ;
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ;
};
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t * ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) ;
void write(const void * src, size_t size) override ;
size_t get_size_written() override ;
};
struct llama_data_file_context : llama_data_context {
llama_file * file;
size_t size_written = 0;
llama_data_file_context(llama_file * f);
size_t get_size_written() override ;
void write(const void * src, size_t size);
};
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const ;
void shift_tokens(const size_t n) ;
llama_beam_view view() const;
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx);
llama_token_data get_token_data(const llama_token token_id) const ;
std::vector<llama_token_data> top_k(size_t k) ;
float probability_from_logit(float logit) const ;
};
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
size_t common_prefix_length;
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);
void collapse_beams(const size_t beam_idx) ;
void fill_next_beams_by_top_probabilities(llama_beam & beam) ;
size_t find_common_prefix_length() ;
llama_beams_state get_beams_state(const bool last_call) ;
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) ;
size_t top_beam_index();
void update_beams_from_beam_views();
};
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
enum llm_rope_type {
LLM_ROPE,
LLM_ROPE_NEOX,
LLM_ROPE_GLM,
};
enum llm_ffn_op_type {
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
};
enum llm_ffn_gate_type {
LLM_FFN_SEQ,
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
enum llm_norm_type {
LLM_NORM,
LLM_NORM_RMS,
};
struct llm_build_context {
const llama_model & model;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
const llama_kv_cache & kv_self;
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head;
const int64_t n_embd_gqa;
const float freq_base;
const float freq_scale;
const float ext_factor;
const float attn_factor;
const float beta_fast;
const float beta_slow;
const float norm_eps;
const float norm_rms_eps;
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_orig_ctx;
const bool do_rope_shift;
const llm_build_cb & cb;
llama_buffer & buf_compute;
struct ggml_context * ctx0 = nullptr;
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_batch & batch,
const llm_build_cb & cb,
bool worst_case);
void init() ;
void free() ;
struct ggml_cgraph * build_llama() ;
struct ggml_cgraph * build_baichuan() ;
struct ggml_cgraph * build_falcon() ;
struct ggml_cgraph * build_starcoder() ;
struct ggml_cgraph * build_persimmon() ;
struct ggml_cgraph * build_refact() ;
struct ggml_cgraph * build_bloom() ;
struct ggml_cgraph * build_mpt() ;
struct ggml_cgraph * build_stablelm();
};
enum llm_offload_func_e {
OFFLOAD_FUNC_NOP,
OFFLOAD_FUNC,
OFFLOAD_FUNC_KQ,
OFFLOAD_FUNC_V,
OFFLOAD_FUNC_NR,
OFFLOAD_FUNC_EMB,
OFFLOAD_FUNC_OUT,
};
struct llm_offload_trie {
struct node {
~node() ;
node * children[256] = { nullptr };
llm_offload_func_e func = OFFLOAD_FUNC_NOP;
};
node * root = nullptr;
llm_offload_trie();
llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) ;
~llm_offload_trie();
void add(const char * name, llm_offload_func_e func);
llm_offload_func_e find(const char * name) const;
};
struct llm_symbol {
using index = int;
index prev;
index next;
const char * text;
size_t n;
};
struct llm_bigram_spm {
struct comparator {
bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
};
using queue_storage = std::vector<llm_bigram_spm>;
using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
float score;
size_t size;
};
struct llm_tokenizer_spm {
llm_tokenizer_spm(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) ;
void try_add_bigram(int left, int right) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
llm_bigram_spm::queue work_queue;
std::map<std::string, std::pair<int, int>> rev_merge;
};
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
struct llm_bigram_bpe {
struct comparator {
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ;
};
using queue_storage = std::vector<llm_bigram_bpe>;
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
std::string text;
int rank;
size_t size;
};
struct llm_tokenizer_bpe {
llm_tokenizer_bpe(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void add_new_bigram(int left, int right) ;
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final;
llm_bigram_bpe::queue work_queue;
};
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant{
fragment_buffer_variant(llama_vocab::id _token);
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);
const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
struct quantize_state_internal {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
int n_k_quantized = 0;
int n_fallback = 0;
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};

llama.cpp (4304 changed lines)

File diff suppressed because it is too large.

llama.h (45 changed lines)

@ -50,7 +50,7 @@
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { //extern "C" {
#endif #endif
// //
@ -115,12 +115,20 @@ extern "C" {
}; };
typedef struct llama_token_data { typedef struct llama_token_data {
llama_token_data( llama_token id, float logit, float p):
id( id),logit(logit),p(p){ }
llama_token id; // token id llama_token id; // token id
float logit; // log-odds of the token float logit; // log-odds of the token
float p; // probability of the token float p; // probability of the token
} llama_token_data; } llama_token_data;
typedef struct llama_token_data_array { typedef struct llama_token_data_array {
llama_token_data_array(llama_token_data * data,
size_t size,
bool sorted):
data(data),
size(size),
sorted(sorted){}
llama_token_data * data; llama_token_data * data;
size_t size; size_t size;
bool sorted; bool sorted;
@ -139,6 +147,29 @@ extern "C" {
// - logits : if zero, the logits for the respective token will not be output // - logits : if zero, the logits for the respective token will not be output
// //
typedef struct llama_batch { typedef struct llama_batch {
llama_batch(int32_t n_tokens,
llama_token * token,
float * embd,
llama_pos * pos,
int32_t * n_seq_id,
llama_seq_id ** seq_id,
int8_t * logits,
llama_pos all_pos_0,
llama_pos all_pos_1,
llama_seq_id all_seq_id
) :
n_tokens(n_tokens),
token(token),
embd(embd),
pos(pos),
n_seq_id(n_seq_id),
seq_id(seq_id),
logits(logits),
all_pos_0(all_pos_0),
all_pos_1(all_pos_1),
all_seq_id(all_seq_id) {}
int32_t n_tokens; int32_t n_tokens;
llama_token * token; llama_token * token;
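The constructor added above takes the fields in declaration order. A hedged sketch of building a batch over an already-tokenized prompt (hypothetical helper, not part of the diff; it relies on the existing llama.h convention that NULL pos/seq_id arrays fall back to all_pos_0 + i*all_pos_1 and all_seq_id):

    #include <vector>
    #include "llama.h"

    static llama_batch make_prompt_batch(std::vector<llama_token> & toks) {
        return llama_batch(
            /* n_tokens   */ (int32_t) toks.size(),
            /* token      */ toks.data(),
            /* embd       */ nullptr,   // unused when token ids are given
            /* pos        */ nullptr,   // derived from all_pos_0 + i*all_pos_1
            /* n_seq_id   */ nullptr,
            /* seq_id     */ nullptr,   // every token goes to all_seq_id
            /* logits     */ nullptr,   // per-token output flags (optional)
            /* all_pos_0  */ 0,
            /* all_pos_1  */ 1,
            /* all_seq_id */ 0);
    }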
@ -174,7 +205,7 @@ extern "C" {
bool use_mlock; // force system to keep model in RAM bool use_mlock; // force system to keep model in RAM
}; };
struct llama_context_params { struct llama_context_params{
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size uint32_t n_batch; // prompt processing maximum batch size
@ -238,6 +269,10 @@ extern "C" {
}; };
typedef struct llama_grammar_element { typedef struct llama_grammar_element {
llama_grammar_element( enum llama_gretype type,
uint32_t value // Unicode code point or rule ID
):type(type), value(value){}
llama_grammar_element( ):type(llama_gretype(0)), value(0){}
enum llama_gretype type; enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID uint32_t value; // Unicode code point or rule ID
} llama_grammar_element; } llama_grammar_element;
@ -827,7 +862,7 @@ extern "C" {
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus #ifdef __cplusplus
} //}
#endif #endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
@ -844,4 +879,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL
#endif // LLAMA_H #endif // LLAMA_H

print.hpp (new file, 553 lines)

@ -0,0 +1,553 @@
#include <iostream>
#include "llama.h"
#include "ggml-internal.hpp"
#include "llama-internal.hpp"
REFL_TYPE(ggml_init_params )
REFL_END
REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END
REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END
REFL_TYPE(ggml_opt_context::ggml_grad )
REFL_END
REFL_TYPE(gpt_params )
REFL_FIELD( seed )
REFL_FIELD( n_threads)
REFL_FIELD( n_threads_batch)
REFL_FIELD( n_predict )
REFL_FIELD( n_ctx )
REFL_FIELD( n_batch)
REFL_FIELD( n_keep )
REFL_FIELD( n_draft)
REFL_FIELD( n_chunks )
REFL_FIELD( n_parallel)
REFL_FIELD( n_sequences)
REFL_FIELD( p_accept )
REFL_FIELD( p_split )
REFL_FIELD( n_gpu_layers)
REFL_FIELD( n_gpu_layers_draft)
REFL_FIELD( main_gpu )
REFL_FIELD( tensor_split)
REFL_FIELD( n_beams )
REFL_FIELD(rope_freq_base)
REFL_FIELD( rope_freq_scale )
REFL_FIELD( yarn_ext_factor )
REFL_FIELD( yarn_attn_factor )
REFL_FIELD( yarn_beta_fast )
REFL_FIELD( yarn_beta_slow )
REFL_FIELD( yarn_orig_ctx)
REFL_FIELD( rope_scaling_type)
REFL_FIELD( sparams)
REFL_FIELD(model )
REFL_FIELD(model_draft )
REFL_FIELD(model_alias)
REFL_FIELD(prompt )
REFL_FIELD(prompt_file )
REFL_FIELD(path_prompt_cache )
REFL_FIELD(input_prefix )
REFL_FIELD(input_suffix )
REFL_FIELD( antiprompt)
REFL_FIELD(logdir )
REFL_FIELD( lora_adapter)
REFL_FIELD(lora_base )
REFL_FIELD( ppl_stride )
REFL_FIELD( ppl_output_type )
REFL_FIELD( hellaswag )
REFL_FIELD( hellaswag_tasks )
REFL_FIELD( mul_mat_q )
REFL_FIELD( memory_f16)
REFL_FIELD( random_prompt )
REFL_FIELD( use_color )
REFL_FIELD( interactive )
REFL_FIELD( chatml )
REFL_FIELD( prompt_cache_all )
REFL_FIELD( prompt_cache_ro )
REFL_FIELD( embedding )
REFL_FIELD( escape )
REFL_FIELD( interactive_first )
REFL_FIELD( multiline_input )
REFL_FIELD( simple_io )
REFL_FIELD( cont_batching )
REFL_FIELD( input_prefix_bos )
REFL_FIELD( ignore_eos )
REFL_FIELD( instruct )
REFL_FIELD( logits_all )
REFL_FIELD( use_mmap)
REFL_FIELD( use_mlock )
REFL_FIELD( numa )
REFL_FIELD( verbose_prompt )
REFL_FIELD( infill )
REFL_FIELD(mmproj )
REFL_FIELD( image)
REFL_END
REFL_TYPE(llama_sampling_params)
REFL_END
REFL_TYPE(llm_arch)
REFL_END
REFL_TYPE(llama_sampling_context )
REFL_FIELD( params)
REFL_FIELD( mirostat_mu)
REFL_FIELD( grammar)
REFL_FIELD( parsed_grammar)
REFL_FIELD( prev)
REFL_FIELD( cur)
REFL_END
REFL_TYPE(llama_token_data )
REFL_END
REFL_TYPE(llama_token_data_array )
REFL_END
REFL_TYPE(llama_batch )
REFL_END
REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END
REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END
REFL_TYPE(gguf_init_params)
REFL_END
REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END
REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD( no_alloc)
REFL_FIELD( no_alloc_save)
REFL_FIELD( n_objects)
REFL_FIELD( objects_begin)
REFL_FIELD( objects_end)
REFL_FIELD( scratch)
REFL_FIELD( scratch_save)
REFL_END
REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END
REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END
REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END
REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END
REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END
REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END
REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END
REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END
REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END
REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END
REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END
REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END
REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END
REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END
REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END
REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END
REFL_TYPE(llama_grammar_element)
REFL_END
REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END
REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END
REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END
REFL_TYPE(ggml_backend)
REFL_END
REFL_TYPE(ggml_backend_buffer)
REFL_END
REFL_TYPE(ggml_allocr)
REFL_END
REFL_TYPE(ggml_tallocr)
REFL_END
REFL_TYPE(ggml_gallocr)
REFL_END
REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END
REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END
REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END
REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END
REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END
REFL_TYPE(e_model)
REFL_END
REFL_TYPE(llama_ftype)
REFL_END
REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype)
REFL_FIELD(name)
REFL_FIELD(hparams)
REFL_FIELD(vocab)
REFL_FIELD(tok_embd)
REFL_FIELD(pos_embd)
REFL_FIELD(tok_norm)
REFL_FIELD(tok_norm_b)
REFL_FIELD(output_norm)
REFL_FIELD(output_norm_b)
REFL_FIELD(output)
REFL_FIELD(layers)
REFL_FIELD(n_gpu_layers)
REFL_FIELD(gguf_kv) // unordered map
REFL_FIELD(ctx)
REFL_FIELD(buf)
REFL_FIELD(mapping) // std::unique_ptr
REFL_FIELD(mlock_buf)
REFL_FIELD(mlock_mmap)
REFL_FIELD(tensors_by_name)
REFL_FIELD(t_load_us)
REFL_FIELD(t_start_us)
REFL_END
REFL_TYPE(llama_vocab)
REFL_END
REFL_TYPE(grammar_parser::parse_state)
REFL_END
REFL_TYPE(llama_context)
REFL_FIELD(cparams)
// REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) // random numbers
REFL_FIELD(has_evaluated_once)
REFL_FIELD(t_start_us)
REFL_FIELD(t_load_us)
REFL_FIELD(t_sample_us)
REFL_FIELD(t_p_eval_us)
REFL_FIELD(t_eval_us)
REFL_FIELD(n_sample)
REFL_FIELD(n_p_eval)
REFL_FIELD(n_eval)
REFL_FIELD(logits)
REFL_FIELD(logits_all)
REFL_FIELD(embedding)
REFL_FIELD(work_buffer)
REFL_FIELD(buf_compute)
REFL_FIELD(buf_alloc)
REFL_FIELD(alloc)
#ifdef GGML_USE_METAL
REFL_FIELD(ctx_metal)
#endif
#ifdef GGML_USE_MPI
REFL_FIELD(ctx_mpi)
#endif
REFL_END
REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END
REFL_TYPE(llm_build_context)
// REFL_FIELD(model) cannot create pointer to reference member llm_build_context::model
// REFL_FIELD(hparams) cannot create pointer to reference member llm_build_context::hparams
REFL_END
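The commented-out fields above (and the REFL_FIELD(model) lines skipped in llama_context and quantize_state_internal) are left out because REFL_FIELD ultimately takes the address of the member, and C++ forbids pointers to reference members. A minimal sketch of the restriction, using a toy struct rather than the real llm_build_context:

struct toy_ctx {                        // illustrative only, not part of llama.cpp
    const int & hparams;                // reference member, like llm_build_context::hparams
    int         n_layer;                // ordinary data member
};

int toy_ctx::*ok = &toy_ctx::n_layer;   // fine: pointer to a normal data member
// &toy_ctx::hparams does not compile ("cannot create a pointer to a reference member"),
// which is why those REFL_FIELD lines stay commented out.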
REFL_TYPE(llm_offload_trie)
REFL_END
REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END
REFL_TYPE(llm_bigram_spm)
REFL_END
REFL_TYPE(llm_tokenizer_spm)
REFL_END
REFL_TYPE(llm_bigram_bpe)
REFL_END
REFL_TYPE(llm_tokenizer_bpe)
REFL_END
REFL_TYPE(fragment_buffer_variant)
REFL_END
REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END
REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END
REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END
REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END
REFL_TYPE(llama_logit_info)
REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END
REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD(n_attention_wv)
REFL_FIELD(n_feed_forward_w2)
REFL_FIELD(i_attention_wv)
REFL_FIELD(i_feed_forward_w2)
REFL_FIELD(n_k_quantized)
REFL_FIELD(n_fallback)
REFL_END
REFL_TYPE(llama_data_context)
REFL_END
REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END
REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END
template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
    return t.value_type;
}

// A generic function to print out the reflected fields of any registered type.
template<typename T>
void print_fields(const T& t) {
    // Let refl-cpp pretty-print the whole object first.
    refl::runtime::debug(std::cout, t);

    constexpr auto type       = refl::reflect<T>();
    constexpr auto membertype = refl::member_list<T>();
    constexpr auto members    = get_members(type);

    std::cout << "DEBUG Type: "  << type.name.c_str()         << "\n";
    std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n";
    std::cout << "DEBUG Type3: " << typeid(members).name()    << "\n";

    // Then print the name of each reflected member on its own line.
    refl::util::for_each(members, [&](auto member) {
        std::cout << "Auto:" << member.name << "\n";
    });
    std::cout << "\n";
}
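A quick usage sketch (not part of the commit itself): assuming this header is included together with common.h, the bindings above can be exercised as below. The name print.hpp is a placeholder for whatever this reflection header is actually called.

#include "common.h"      // gpt_params
// #include "print.hpp"  // placeholder for the header that defines print_fields above

int main() {
    gpt_params params;              // fields registered above (model, prompt, sparams, ...)
    print_fields(params);           // dumps the type name and each reflected member
    print_fields(params.sparams);   // llama_sampling_params is registered above as well
    return 0;
}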

View file

@@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
 llama_build_and_test_executable(test-rope.cpp)
 # dummy executable - not installed
-get_filename_component(TEST_TARGET test-c.c NAME_WE)
-add_executable(${TEST_TARGET} test-c.c)
+get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
+add_executable(${TEST_TARGET} test-c.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)

View file

@@ -115,11 +115,11 @@ int main(int argc, char * argv[]) {
     generate_data(1.0, test_data2.size(), test_data2.data());
     // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
+    struct ggml_init_params ggml_params(
         /* .mem_size = */ 1*1024,
         /* .mem_buffer = */ NULL,
-        /* .no_alloc = */ true,
-    };
+        /* .no_alloc = */ true
+    );
     struct ggml_context * ctx = ggml_init(ggml_params);
     int num_failed = 0;

View file

@@ -261,11 +261,11 @@ int main(int argc, char * argv[]) {
     // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
+    struct ggml_init_params ggml_params(
         /* .mem_size = */ 1*1024,
         /* .mem_buffer = */ NULL,
-        /* .no_alloc = */ true,
-    };
+        /* .no_alloc = */ true
+    );
     struct ggml_context * ctx = ggml_init(ggml_params);
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {

View file

@@ -124,11 +124,11 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 int main(int /*argc*/, const char ** /*argv*/) {
-    struct ggml_init_params params = {
+    struct ggml_init_params params(
         /* .mem_size = */ 128*1024*1024,
         /* .mem_buffer = */ NULL,
-        /* .no_alloc = */ false,
-    };
+        /* .no_alloc = */ false
+    );
     std::vector<uint8_t> work_buffer;
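The last three hunks all make the same mechanical change at the test call sites: ggml_init_params is no longer brace-initialized but constructed with parentheses. That only works if the commit also adds a matching constructor to ggml_init_params, which is an assumption here; under that assumption the change looks like this:

// Before: C-style aggregate initialization
struct ggml_init_params params = {
    /* .mem_size = */ 128*1024*1024,
    /* .mem_buffer = */ NULL,
    /* .no_alloc = */ false,
};

// After: constructor call, assuming something like
//   ggml_init_params(size_t mem_size, void * mem_buffer, bool no_alloc)
// is declared on the struct by this commit.
struct ggml_init_params params(
    /* .mem_size = */ 128*1024*1024,
    /* .mem_buffer = */ NULL,
    /* .no_alloc = */ false
);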