rebased and trimmed down

Squashed work log:
- now compiling again, and it might even run; added debug notes
- incremental diff improvements and simplification; now running
- moved llama over to refl-cpp as well; now compiling and running, then debugging
- added the print module with type information; the first type names are being printed
- added the binding generator; refl bindings now work on the types, though not yet on pointers
- now has a model; added a new header for llama internals
- demonstrated a crash, then removed it
- started refactoring the code; the debug print is working
This commit is contained in:
parent 52c8bc3cf3
commit 2b6ff2ec54
42 changed files with 6898 additions and 3518 deletions
.gitignore (vendored): 29 changes
@ -47,7 +47,6 @@ models-mnt
/libllama.so
/llama-bench
/llava-cli
/lookahead
/main
/metal
/perplexity
@ -88,16 +87,18 @@ poetry.lock
poetry.toml

# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-sampling
/tests/test-tokenizer-0-llama
/tests/test-tokenizer-0-falcon
/tests/test-tokenizer-1-llama
/tests/test-tokenizer-1-bpe
/tests/test-rope
tests/test-grammar-parser
tests/test-llama-grammar
tests/test-double-float
tests/test-grad0
tests/test-opt
tests/test-quantize-fns
tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
/#llama.cpp#
#*
\\#*
@ -1,7 +1,33 @@
cmake_minimum_required(VERSION 3.13) # for add_link_options
project("llama.cpp" C CXX)

if (NOT MSVC)
set(cuda_flags -Wno-pedantic)
endif()

set(LLAMA_CUBLAS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(LLAMA_CUDA_F16 ON)
set(LLAMA_ACCELERATE ON)
set(LLAMA_K_QUANTS ON)

#-DLLAMA_NATIVE=off
set(LLAMA_AVX ON)
set(LLAMA_AVX2 OFF)
set(LLAMA_AVX512 OFF)
set(LLAMA_FMA OFF)
set(LLAMA_F16C OFF)
set(CMAKE_CUDA_FLAGS "--verbose") #
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
set(CUDACXX /usr/local/cuda-12.3/bin/nvcc)
set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc)
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3)
#GGML_USE_CUBLAS

#set(CMAKE_EXE_LINKER_FLAGS -pg)
#set(CMAKE_SHARED_LINKER_FLAGS -pg)

set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@ -45,7 +71,7 @@ endif()
|
|||
# general
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
|
||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||
|
||||
# debug
|
||||
|
@ -78,9 +104,9 @@ endif()
|
|||
|
||||
# 3rd party libs
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||
option(LLAMA_BLAS "llama: use BLAS" ON)
|
||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
|
||||
option(LLAMA_CUBLAS "llama: use CUDA" ON)
|
||||
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
|
||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
||||
|
@ -108,7 +134,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
|||
# Compile flags
|
||||
#
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
|
@ -239,7 +265,12 @@ if (LLAMA_BLAS)
|
|||
|
||||
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
|
||||
add_compile_options(${BLAS_LINKER_FLAGS})
|
||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
|
||||
# from https://github.com/NVIDIA/cutlass
|
||||
make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
|
||||
set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
|
||||
|
||||
# add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
|
||||
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||
endif()
|
||||
|
@ -281,6 +312,7 @@ if (LLAMA_CUBLAS)
|
|||
endif()
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
|
||||
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
|
||||
endif()
|
||||
|
@ -321,7 +353,7 @@ if (LLAMA_MPI)
|
|||
if (MPI_C_FOUND)
|
||||
message(STATUS "MPI found")
|
||||
set(GGML_HEADERS_MPI ggml-mpi.h)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
|
||||
add_compile_definitions(GGML_USE_MPI)
|
||||
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
|
||||
if (NOT MSVC)
|
||||
|
@ -399,14 +431,15 @@ endif()
|
|||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
# -Wpedantic
|
||||
set(warning_flags -Wall -Wextra -Wcast-qual -Wno-unused-function)
|
||||
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
|
||||
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive)
|
||||
set(host_cxx_flags "")
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive)
|
||||
|
||||
if (
|
||||
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
|
@ -416,30 +449,27 @@ if (LLAMA_ALL_WARNINGS)
|
|||
endif()
|
||||
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
|
||||
set(c_flags ${c_flags} -Wdouble-promotion)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive)
|
||||
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive)
|
||||
endif()
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
|
||||
set(c_flags ${c_flags} ${warning_flags})
|
||||
set(cxx_flags ${cxx_flags} ${warning_flags})
|
||||
set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
|
||||
set(cxx_flags ${cxx_flags} -fpermissive -save-temps --verbose ${warning_flags})
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
|
||||
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
set(cuda_flags -Wno-pedantic)
|
||||
endif()
|
||||
set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
|
||||
|
||||
list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
|
||||
|
@ -447,6 +477,9 @@ if (NOT cuda_host_flags STREQUAL "")
|
|||
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
|
||||
endif()
|
||||
|
||||
#
|
||||
set(cuda_flags --verbose -G ${cuda_flags})
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
|
||||
|
||||
if (WIN32)
|
||||
|
@ -494,6 +527,8 @@ if (NOT MSVC)
|
|||
add_link_options(-static-libgcc -static-libstdc++)
|
||||
endif()
|
||||
endif()
|
||||
add_link_options("-Wl,-Map=${TARGET}.map")
|
||||
|
||||
if (LLAMA_GPROF)
|
||||
add_compile_options(-pg)
|
||||
endif()
|
||||
|
@ -654,13 +689,16 @@ if (GGML_USE_CPU_HBM)
|
|||
endif()
|
||||
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml.cpp
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
print.hpp
|
||||
ggml-internal.hpp
|
||||
llama-internal.hpp
|
||||
ggml-alloc.cpp
|
||||
ggml-alloc.h
|
||||
ggml-backend.c
|
||||
ggml-backend.cpp
|
||||
ggml-backend.h
|
||||
ggml-quants.c
|
||||
ggml-quants.cpp
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
|
@ -692,7 +730,7 @@ add_library(llama
|
|||
)
|
||||
|
||||
target_include_directories(llama PUBLIC .)
|
||||
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
|
||||
target_compile_features(llama PUBLIC cxx_std_20) # don't bump
|
||||
target_link_libraries(llama PRIVATE
|
||||
ggml
|
||||
${LLAMA_EXTRA_LIBS}
|
||||
|
|
Makefile: 28 changes
|
@ -1,3 +1,4 @@
|
|||
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
|
@ -116,7 +117,7 @@ endif
|
|||
# keep standard at C11 and C++11
|
||||
MK_CPPFLAGS = -I. -Icommon
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive
|
||||
|
||||
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
||||
ifdef LLAMA_FAST
|
||||
|
@ -506,7 +507,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
|
|||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_MPI
|
||||
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
|
@ -541,17 +542,17 @@ $(info )
|
|||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml.o: ggml.cpp ggml.h ggml-cuda.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
|
||||
|
||||
|
@ -685,6 +686,9 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
|
|||
build-info.o: common/build-info.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
#print.o: print.cpp # print.hpp
|
||||
# $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
@ -744,5 +748,5 @@ tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(
|
|||
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
tests/test-c.o: tests/test-c.cpp llama.h
|
||||
$(CC) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
|
@ -701,7 +701,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets ygou write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.

### Instruction mode with Alpaca

README.org (new file): 1097 lines. File diff suppressed because it is too large.
binding.py (new file): 334 lines
|
@ -0,0 +1,334 @@
|
|||
import os
|
||||
import json
|
||||
import re
|
||||
import clang.cindex
|
||||
|
||||
# configurable part
|
||||
|
||||
CLANG_VERSION='13.0.1'
|
||||
# homebrew installs for llvm (brew info llvm gives details):
|
||||
# x64: /usr/local/opt/llvm/lib
|
||||
# arm64: /opt/homebrew/opt/llvm/lib
|
||||
llvmLibPath = "/usr/lib/llvm-15/lib/"
|
||||
|
||||
cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"
|
||||
|
||||
fileList = [
|
||||
"ggml.cpp",
|
||||
"llama.cpp"
|
||||
]
|
||||
|
||||
typeList = [
|
||||
]
|
||||
|
||||
# end of configurable part
|
||||
|
||||
clang.cindex.Config.set_library_path(llvmLibPath)
|
||||
|
||||
|
||||
def list_headers_in_dir(path):
|
||||
# enumerates a folder but keeps the full pathing for the files returned
|
||||
# and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)
|
||||
|
||||
# list all the files in the folder
|
||||
files = os.listdir(path)
|
||||
# only include .hxx files
|
||||
files = list(filter(lambda x: x.endswith('.hxx'), files))
|
||||
# add the folder path back on
|
||||
files = list(map(lambda x: path + x, files))
|
||||
return files
|
||||
|
||||
|
||||
# parse through the list of files specified and expand wildcards
|
||||
fullFileList = []
|
||||
for filePath in fileList:
|
||||
if "*" in filePath:
|
||||
# wildcard path
|
||||
basePath = filePath[:-1]
|
||||
if "*" in basePath:
|
||||
# if there is still a wildcard, we have an issue...
|
||||
raise NotImplementedError(
|
||||
"wildcard only supported at end of file path")
|
||||
files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
|
||||
fullFileList = fullFileList + files
|
||||
else:
|
||||
# normal path
|
||||
ff = os.path.join(cxxClientRoot, filePath)
|
||||
fullFileList.append(ff)
|
||||
print("DBUG",ff)
|
||||
# exclude _json.hxx files
|
||||
fullFileList = list(
|
||||
filter(lambda x: not x.endswith('_json.hxx'), fullFileList))
|
||||
# exclude _fmt.hxx files
|
||||
fullFileList = list(
|
||||
filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))
|
||||
|
||||
|
||||
# generate a list of regexps from the type list (for handling wildcards)
|
||||
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))
|
||||
|
||||
|
||||
def is_included_type(name, with_durability=False):
|
||||
|
||||
# TODO(brett19): This should be generalized somehow...
|
||||
if "is_compound_operation" in name:
|
||||
return False
|
||||
|
||||
if "replica_context" in name:
|
||||
return False
|
||||
|
||||
if with_durability is True and '_with_legacy_durability' not in name:
|
||||
return False
|
||||
|
||||
for x in typeListRe:
|
||||
if re.fullmatch(x, name):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
opTypes = []
|
||||
opEnums = []
|
||||
|
||||
|
||||
def parse_type(type):
|
||||
typeStr = type.get_canonical().spelling
|
||||
return parse_type_str(typeStr)
|
||||
|
||||
std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]
|
||||
|
||||
def parse_type_str(typeStr):
|
||||
if typeStr == "std::mutex":
|
||||
return {"name": "std::mutex"}
|
||||
if typeStr == "std::string":
|
||||
return {"name": "std::string"}
|
||||
if typeStr == "std::chrono::duration<long long>":
|
||||
return {"name": "std::chrono::seconds"}
|
||||
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
|
||||
return {"name": "std::chrono::milliseconds"}
|
||||
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
|
||||
return {"name": "std::chrono::microseconds"}
|
||||
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
|
||||
return {"name": "std::chrono::nanoseconds"}
|
||||
if typeStr == "std::error_code":
|
||||
return {"name": "std::error_code"}
|
||||
if typeStr == "std::monostate":
|
||||
return {"name": "std::monostate"}
|
||||
if typeStr == "std::byte":
|
||||
return {"name": "std::byte"}
|
||||
if typeStr == "unsigned long":
|
||||
return {"name": "std::size_t"}
|
||||
if typeStr == "char":
|
||||
return {"name": "std::int8_t"}
|
||||
if typeStr == "unsigned char":
|
||||
return {"name": "std::uint8_t"}
|
||||
if typeStr == "short":
|
||||
return {"name": "std::int16_t"}
|
||||
if typeStr == "unsigned short":
|
||||
return {"name": "std::uint16_t"}
|
||||
if typeStr == "int":
|
||||
return {"name": "std::int32_t"}
|
||||
if typeStr == "unsigned int":
|
||||
return {"name": "std::uint32_t"}
|
||||
if typeStr == "long long":
|
||||
return {"name": "std::int64_t"}
|
||||
if typeStr == "unsigned long long":
|
||||
return {"name": "std::uint64_t"}
|
||||
if typeStr == "bool":
|
||||
return {"name": "std::bool"}
|
||||
if typeStr == "float":
|
||||
return {"name": "std::float"}
|
||||
if typeStr == "double":
|
||||
return {"name": "std::double"}
|
||||
if typeStr == "std::nullptr_t":
|
||||
return {"name": "std::nullptr_t"}
|
||||
if typeStr in std_comparators:
|
||||
return {"name": typeStr}
|
||||
|
||||
tplParts = typeStr.split("<", 1)
|
||||
if len(tplParts) > 1:
|
||||
tplClassName = tplParts[0]
|
||||
tplParams = tplParts[1][:-1]
|
||||
if tplClassName == "std::function":
|
||||
return {
|
||||
"name": "std::function"
|
||||
}
|
||||
if tplClassName == "std::optional":
|
||||
return {
|
||||
"name": "std::optional",
|
||||
"of": parse_type_str(tplParams)
|
||||
}
|
||||
if tplClassName == "std::vector":
|
||||
return {
|
||||
"name": "std::vector",
|
||||
"of": parse_type_str(tplParams)
|
||||
}
|
||||
if tplClassName == "std::set":
|
||||
return {
|
||||
"name": "std::set",
|
||||
"of": parse_type_str(tplParams)
|
||||
}
|
||||
if tplClassName == "std::variant":
|
||||
variantParts = tplParams.split(", ")
|
||||
variantTypes = []
|
||||
for variantPart in variantParts:
|
||||
variantTypes.append(parse_type_str(variantPart))
|
||||
return {
|
||||
"name": "std::variant",
|
||||
"of": variantTypes
|
||||
}
|
||||
if tplClassName == "std::array":
|
||||
variantParts = tplParams.split(", ")
|
||||
if len(variantParts) != 2:
|
||||
print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
|
||||
return {"name": "unknown", "str": typeStr}
|
||||
return {
|
||||
"name": "std::array",
|
||||
"of": parse_type_str(variantParts[0]),
|
||||
"size": int(variantParts[1])
|
||||
}
|
||||
if tplClassName == "std::map":
|
||||
variantParts = tplParams.split(", ")
|
||||
if len(variantParts) < 2 or len(variantParts) > 3:
|
||||
print("FAILED TO PARSE MAP TYPES: " + typeStr)
|
||||
return {"name": "unknown", "str": typeStr}
|
||||
|
||||
if len(variantParts) == 2:
|
||||
return {
|
||||
"name": "std::map",
|
||||
"of": parse_type_str(variantParts[0]),
|
||||
"to": parse_type_str(variantParts[1])
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"name": "std::map",
|
||||
"of": parse_type_str(variantParts[0]),
|
||||
"to": parse_type_str(variantParts[1]),
|
||||
"comparator": parse_type_str(variantParts[2])
|
||||
}
|
||||
|
||||
if tplClassName == "std::shared_ptr":
|
||||
return {
|
||||
"name": "std::shared_ptr",
|
||||
"of": parse_type_str(tplParams)
|
||||
}
|
||||
|
||||
#return {"name": "unknown", "str": typeStr}
|
||||
|
||||
if 'unnamed struct' in typeStr:
|
||||
print("WARNING: Found unnamed struct: " + typeStr)
|
||||
|
||||
return {"name": typeStr}
|
||||
|
||||
internal_structs = []
|
||||
UNNAMED_STRUCT_DELIM = '::(unnamed struct'
|
||||
|
||||
def traverse(node, namespace, main_file):
|
||||
# only scan the elements of the file we parsed
|
||||
#print("FILE", node.location.file )
|
||||
|
||||
if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
|
||||
fullStructName = "::".join([*namespace, node.displayname])
|
||||
print("REFL_TYPE(" + fullStructName + ")")
|
||||
|
||||
structFields = []
|
||||
for child in node.get_children():
|
||||
if child.kind == clang.cindex.CursorKind.FIELD_DECL:
|
||||
struct_type = parse_type(child.type)
|
||||
type_str = child.type.get_canonical().spelling
|
||||
print(" REFL_FIELD(" + child.displayname + ")")
|
||||
if 'unnamed' in type_str:
|
||||
name_tokens = type_str.split('::')
|
||||
name_override = '::'.join(name_tokens[:-1] + [child.displayname])
|
||||
struct_type['name'] = name_override
|
||||
internal_structs.append(name_override)
|
||||
|
||||
structFields.append({
|
||||
"name": child.displayname,
|
||||
"type": struct_type,
|
||||
})
|
||||
# replica read changes introduced duplicate get requests
|
||||
if any(map(lambda op: op['name'] == fullStructName, opTypes)):
|
||||
return
|
||||
|
||||
opTypes.append({
|
||||
"name": fullStructName,
|
||||
"fields": structFields,
|
||||
})
|
||||
print("REFL_END")
|
||||
|
||||
if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
|
||||
fullStructName = "::".join([*namespace, node.displayname])
|
||||
if is_included_type(fullStructName, with_durability=True):
|
||||
type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
|
||||
if type_ref:
|
||||
base_request_name = type_ref.displayname.replace('struct', '').strip()
|
||||
base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
|
||||
if base_request:
|
||||
new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
|
||||
new_fields.extend([
|
||||
{"name":"persist_to", "type":{"name":"couchbase::persist_to"}},
|
||||
{"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}}
|
||||
])
|
||||
|
||||
opTypes.append({
|
||||
"name": fullStructName,
|
||||
"fields": new_fields
|
||||
})
|
||||
if node.kind == clang.cindex.CursorKind.ENUM_DECL:
|
||||
fullEnumName = "::".join([*namespace, node.displayname])
|
||||
if is_included_type(fullEnumName):
|
||||
enumValues = []
|
||||
|
||||
for child in node.get_children():
|
||||
if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
|
||||
enumValues.append({
|
||||
"name": child.displayname,
|
||||
"value": child.enum_value,
|
||||
})
|
||||
opEnums.append({
|
||||
"name": fullEnumName,
|
||||
"type": parse_type(node.enum_type),
|
||||
"values": enumValues,
|
||||
})
|
||||
|
||||
if node.kind == clang.cindex.CursorKind.NAMESPACE:
|
||||
namespace = [*namespace, node.displayname]
|
||||
if node.kind == clang.cindex.CursorKind.CLASS_DECL:
|
||||
namespace = [*namespace, node.displayname]
|
||||
if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
|
||||
namespace = [*namespace, node.displayname]
|
||||
|
||||
for child in node.get_children():
|
||||
traverse(child, namespace, main_file)
|
||||
|
||||
for headerPath in fullFileList:
|
||||
print("processing " + headerPath)
|
||||
index = clang.cindex.Index.create()
|
||||
args = [
|
||||
'-std=c++17',
|
||||
]
|
||||
|
||||
try:
|
||||
translation_unit = index.parse(headerPath, args=args)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
import pdb
|
||||
pdb.set_trace()
|
||||
raise e
|
||||
|
||||
# output clang compiler diagnostics information (for debugging)
|
||||
|
||||
for diagnostic in translation_unit.diagnostics:
|
||||
diagnosticMsg = diagnostic.format()
|
||||
print(diagnostic)
|
||||
|
||||
traverse(translation_unit.cursor, [], headerPath)
|
||||
|
||||
jsonData = json.dumps({
|
||||
'op_structs': opTypes,
|
||||
'op_enums': opEnums
|
||||
})
|
||||
|
||||
f = open("bindings.json", "w")
|
||||
f.write(jsonData)
|
||||
f.close()
|
|
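The generator above walks ggml.cpp and llama.cpp with libclang and prints refl-cpp registration blocks (REFL_TYPE / REFL_FIELD / REFL_END) for the structs and classes it visits; the print_fields() calls added to main.cpp later in this commit then use that metadata to dump field names and values at runtime. The following is a rough, hypothetical sketch of how the two pieces fit together; the struct sample_opts and this particular print_fields() body are illustrative assumptions, not the commit's actual print.hpp.

```cpp
#include <iostream>
#include <refl.hpp>   // refl-cpp, header-only

// Hypothetical stand-in for one of the reflected llama.cpp structs.
struct sample_opts {
    int   top_k = 40;
    float top_p = 0.9f;
};

// The kind of registration block binding.py emits for each struct it visits.
REFL_TYPE(sample_opts)
    REFL_FIELD(top_k)
    REFL_FIELD(top_p)
REFL_END

// Minimal field dumper in the spirit of print.hpp's print_fields().
template <typename T>
void print_fields(const T & obj) {
    constexpr auto td = refl::reflect<T>();
    std::cout << refl::descriptor::get_display_name(td) << "\n";
    refl::util::for_each(td.members, [&](auto member) {
        if constexpr (refl::descriptor::is_readable(member)) {
            std::cout << "  " << refl::descriptor::get_display_name(member)
                      << " = " << member(obj) << "\n";
        }
    });
}

int main() {
    sample_opts opts;
    print_fields(opts);   // prints each registered field name and its value
}
```

Types that are only reachable through pointers are the part the commit message flags as not working yet; this sketch sidesteps that by reflecting a value type directly.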
@ -144,7 +144,7 @@ namespace grammar_parser {
while (*pos != '"') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_CHAR, char_pair.first));
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
@ -162,11 +162,11 @@ namespace grammar_parser {
? LLAMA_GRETYPE_CHAR_ALT
: start_type;

out_elements.push_back({type, char_pair.first});
out_elements.push_back(llama_grammar_element(type, char_pair.first));
if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first));
}
}
pos = parse_space(pos + 1, is_nested);
@ -175,7 +175,7 @@ namespace grammar_parser {
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_RULE_REF, ref_rule_id));
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
@ -183,7 +183,7 @@ namespace grammar_parser {
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_RULE_REF, sub_rule_id));
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
@ -219,7 +219,8 @@ namespace grammar_parser {

// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
llama_grammar_element a(LLAMA_GRETYPE_RULE_REF, sub_rule_id);
out_elements.push_back(a);

pos = parse_space(pos + 1, is_nested);
} else {
|
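The change in these hunks, replacing the braced aggregate {LLAMA_GRETYPE_CHAR, ...} with an explicit llama_grammar_element(...) call, is a pattern that repeats throughout the commit for ggml_init_params, gguf_init_params, llama_token_data and llama_batch as well. The sketch below illustrates the two forms on a hypothetical stand-in struct; note that for a plain aggregate the parenthesized form relies on C++20 parenthesized aggregate initialization, which lines up with the cxx_std_20 bump in the CMakeLists diff, and it also keeps compiling if the struct later gains a matching constructor (as server_params and llama_server_context do elsewhere in this commit).

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a plain C struct such as llama_grammar_element.
struct grammar_element {
    int      type;   // element kind
    uint32_t value;  // code point or rule id
};

int main() {
    std::vector<grammar_element> elements;

    // Aggregate brace-initialization (the form being replaced).
    elements.push_back({1, 42});

    // Parenthesized initialization (the form this commit switches to).
    // For a plain aggregate this needs C++20 parenthesized aggregate
    // initialization; it stays valid if the struct later gains a
    // constructor taking the same arguments.
    elements.push_back(grammar_element(1, 42));

    return (int) elements.size();
}
```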
@ -181,7 +181,7 @@ llama_token llama_sampling_sample(
|
|||
cur.clear();
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
cur.emplace_back(llama_token_data(token_id, logits[token_id], 0.0f));
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||
|
|
|
@ -1527,11 +1527,14 @@ int main(int argc, char ** argv) {
|
|||
std::vector<uint8_t> work_buffer;
|
||||
|
||||
for (int ex=0; ex<n_examples; ++ex) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ compute_size,
|
||||
/*.mem_buffer =*/ compute_addr,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
compute_size,
|
||||
//.mem_buffer =
|
||||
compute_addr,
|
||||
//.no_alloc =
|
||||
false
|
||||
);
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
|
@ -1602,11 +1605,14 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
printf("---\n");
|
||||
for (int i=0; i<n_gen; ++i) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ compute_size,
|
||||
/*.mem_buffer =*/ compute_addr,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
compute_size,
|
||||
//.mem_buffer =
|
||||
compute_addr,
|
||||
//.no_alloc =
|
||||
false
|
||||
);
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph gf = {};
|
||||
|
|
|
@ -121,16 +121,18 @@ int main(int argc, char ** argv) {
|
|||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
llama_batch batch_view(
|
||||
/* .n_tokens= */ n_tokens,
|
||||
/* .token= */ batch.token + i,
|
||||
/* .embd= */ nullptr,
|
||||
/* .pos= */ batch.pos + i,
|
||||
/* .n_seq_id= */ batch.n_seq_id + i,
|
||||
/* .seq_id= */ batch.seq_id + i,
|
||||
/* .logits= */ batch.logits + i,
|
||||
/* .all_pos_0= */0,
|
||||
/* .all_pos_1= */0,
|
||||
/* .all_seq_id= */0 // unused
|
||||
);
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0) {
|
||||
|
|
|
@ -169,10 +169,13 @@ int main(int argc, char ** argv) {
|
|||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
candidates.emplace_back(llama_token_data(
|
||||
token_id,
|
||||
logits[token_id],
|
||||
0.0f ));
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_token_data_array candidates_p (candidates.data(), candidates.size(), false );
|
||||
|
||||
const int top_k = 40;
|
||||
const float top_p = 0.9f;
|
||||
|
|
|
@ -140,11 +140,14 @@ int main(int argc, char ** argv) {
|
|||
|
||||
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/* no_alloc =*/ 0
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
ctx_size,
|
||||
//.mem_buffer =
|
||||
NULL,
|
||||
//.no_alloc =
|
||||
0
|
||||
);
|
||||
|
||||
ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
|
|
|
@ -553,10 +553,12 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
|||
if (is_ggml_file(filename)) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
struct gguf_init_params params(
|
||||
//.no_alloc =
|
||||
false,
|
||||
//.ctx =
|
||||
&ctx_data
|
||||
);
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(filename, params);
|
||||
GGML_ASSERT(ctx != NULL);
|
||||
|
|
|
@ -389,9 +389,11 @@ static void export_lora(struct export_lora_params * params) {
|
|||
|
||||
// open base model gguf, read tensors without their data
|
||||
struct ggml_context * ctx_in;
|
||||
struct gguf_init_params params_gguf;
|
||||
params_gguf.no_alloc = true;
|
||||
params_gguf.ctx = &ctx_in;
|
||||
struct gguf_init_params params_gguf(
|
||||
//params_gguf.no_alloc =
|
||||
true,
|
||||
//params_gguf.ctx =
|
||||
&ctx_in);
|
||||
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
|
||||
|
||||
// create new gguf
|
||||
|
|
|
@ -294,10 +294,12 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
|
|||
|
||||
// get parameters directly from gguf file
|
||||
{
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
struct gguf_init_params params(
|
||||
//.no_alloc =
|
||||
false,
|
||||
//.ctx =
|
||||
NULL
|
||||
);
|
||||
struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
|
||||
|
||||
load_model_hparams_gguf(mctx, &hparams, "llama");
|
||||
|
@ -598,7 +600,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
|||
const bool enable_flash_attn,
|
||||
const bool enable_checkpointing) {
|
||||
|
||||
ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
||||
//FIXME
|
||||
assert(0);
|
||||
//ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
||||
const int n_past = 0;
|
||||
const int N = n_tokens;
|
||||
const auto & hparams = model->hparams;
|
||||
|
@ -989,9 +993,11 @@ static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llam
|
|||
|
||||
static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
|
||||
struct ggml_context * f_ggml_ctx;
|
||||
struct gguf_init_params params;
|
||||
params.no_alloc = false;
|
||||
params.ctx = &f_ggml_ctx;
|
||||
struct gguf_init_params params(
|
||||
//params.no_alloc =
|
||||
false,
|
||||
//params.ctx =
|
||||
&f_ggml_ctx);
|
||||
struct gguf_context * fctx = gguf_init_from_file(filename, params);
|
||||
if (fctx == NULL) {
|
||||
return false;
|
||||
|
@ -1706,11 +1712,14 @@ int main(int argc, char ** argv) {
|
|||
std::vector<uint8_t> mem_compute_data;
|
||||
|
||||
// context for input tensors without their data
|
||||
struct ggml_init_params ctx_input_params = {
|
||||
struct ggml_init_params ctx_input_params(
|
||||
//.mem_size=
|
||||
ggml_tensor_overhead() * 2, // mem_size
|
||||
//.mem_buffer=
|
||||
NULL, // mem_buffer
|
||||
true, // no_alloc
|
||||
};
|
||||
//.no_alloc=
|
||||
true // no_alloc
|
||||
);
|
||||
struct ggml_context * ctx_input = ggml_init(ctx_input_params);
|
||||
|
||||
// the input tensors
|
||||
|
@ -1735,11 +1744,14 @@ int main(int argc, char ** argv) {
|
|||
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
|
||||
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
|
||||
);
|
||||
struct ggml_init_params ctx_compute_params = {
|
||||
struct ggml_init_params ctx_compute_params(
|
||||
//.mem_size=
|
||||
estimated_compute_size_wo_data, // mem_size
|
||||
//.mem_buffer=
|
||||
NULL, // mem_buffer
|
||||
true, // no_alloc
|
||||
};
|
||||
//.no_alloc=
|
||||
true // no_alloc
|
||||
);
|
||||
struct ggml_context * ctx_compute = NULL;
|
||||
|
||||
struct ggml_tensor * loss = NULL;
|
||||
|
@ -1902,11 +1914,14 @@ int main(int argc, char ** argv) {
|
|||
printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
|
||||
|
||||
// context for work buffer
|
||||
struct ggml_init_params ctx_work_params = {
|
||||
struct ggml_init_params ctx_work_params(
|
||||
//.mem_size=
|
||||
max_work_size, // mem_size
|
||||
//.mem_buffer =
|
||||
NULL, // mem_buffer
|
||||
false, // no_alloc
|
||||
};
|
||||
//.no_alloc =
|
||||
false // no_alloc
|
||||
);
|
||||
struct ggml_context * ctx_work = ggml_init(ctx_work_params);
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
|
|
@ -40,11 +40,14 @@ static bool gguf_ex_write(const std::string & fname) {
|
|||
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
|
||||
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ 128ull*1024ull*1024ull,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
128ull*1024ull*1024ull,
|
||||
//.mem_buffer =
|
||||
NULL,
|
||||
//.no_alloc =
|
||||
false
|
||||
);
|
||||
|
||||
struct ggml_context * ctx_data = ggml_init(params);
|
||||
|
||||
|
@ -86,10 +89,12 @@ static bool gguf_ex_write(const std::string & fname) {
|
|||
|
||||
// just read tensor info
|
||||
static bool gguf_ex_read_0(const std::string & fname) {
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
struct gguf_init_params params (
|
||||
//.no_alloc =
|
||||
false,
|
||||
//.ctx =
|
||||
NULL
|
||||
);
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
|
@ -146,10 +151,12 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
|||
static bool gguf_ex_read_1(const std::string & fname) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
struct gguf_init_params params (
|
||||
//.no_alloc =
|
||||
false,
|
||||
//.ctx =
|
||||
&ctx_data
|
||||
);
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
|
|
|
@ -255,11 +255,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
|
||||
const auto & buf_compute = ctx->buf_compute;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
buf_compute.size,
|
||||
//.mem_buffer =
|
||||
buf_compute.data,
|
||||
//.no_alloc =
|
||||
false
|
||||
);
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
|
@ -455,10 +458,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
struct ggml_context * meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &meta,
|
||||
};
|
||||
struct gguf_init_params params(
|
||||
//.no_alloc =
|
||||
true,
|
||||
//.ctx =
|
||||
&meta);
|
||||
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname, params);
|
||||
if (!ctx) {
|
||||
|
@ -552,11 +557,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
// load tensors
|
||||
{
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
ctx_size,
|
||||
//.mem_buffer =
|
||||
NULL,
|
||||
//.no_alloc =
|
||||
false
|
||||
);
|
||||
|
||||
new_clip->ctx = ggml_init(params);
|
||||
if (!new_clip->ctx) {
|
||||
|
|
|
@ -75,7 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
|||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
llama_batch batch(
|
||||
/* .n_tokens= */int32_t(n_eval),
|
||||
/* .token= */nullptr,
|
||||
/* .embd= */(image_embed->embed+i*n_embd),
|
||||
/* .pos= */nullptr,
|
||||
/* .n_seq_id= */nullptr,
|
||||
/* .seq_id= */nullptr,
|
||||
/* .logits= */nullptr,
|
||||
/* .all_pos_0= */*n_past,
|
||||
/* .all_pos_1= */1,
|
||||
/* .all_seq_id= */0
|
||||
);
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
|
|
|
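The same conversion is applied to the llama_batch views built in the examples and server code above: the brace-initialized batch_view is rewritten as a parenthesized construction with each argument annotated with the field it fills. Below is a hedged sketch of that slicing pattern on a hypothetical stand-in struct; the field order simply mirrors the comments in the hunks (n_tokens, token, embd, pos, n_seq_id, seq_id, logits, all_pos_0, all_pos_1, all_seq_id) and is not taken from llama.h.

```cpp
#include <cstdint>

// Hypothetical stand-in mirroring the llama_batch fields named in the hunks.
struct batch_view_t {
    int32_t        n_tokens;
    const int32_t *token;
    const float   *embd;
    const int32_t *pos;
    const int32_t *n_seq_id;
    int32_t      **seq_id;
    const int8_t  *logits;
    int32_t        all_pos_0;
    int32_t        all_pos_1;
    int32_t        all_seq_id;
};

// Builds a view over tokens [i, i + n) of a larger batch, in the same
// argument order the diff uses, with each argument annotated by field name.
static batch_view_t make_view(const batch_view_t & batch, int32_t i, int32_t n) {
    return batch_view_t(
        /* .n_tokens   = */ n,
        /* .token      = */ batch.token    + i,
        /* .embd       = */ nullptr,
        /* .pos        = */ batch.pos      + i,
        /* .n_seq_id   = */ batch.n_seq_id + i,
        /* .seq_id     = */ batch.seq_id   + i,
        /* .logits     = */ batch.logits   + i,
        /* .all_pos_0  = */ 0,
        /* .all_pos_1  = */ 0,
        /* .all_seq_id = */ 0);  // unused for per-range views
}

int main() {
    batch_view_t whole {};                      // zero-initialized placeholder batch
    batch_view_t view = make_view(whole, 0, 0); // slice starting at offset 0
    return view.n_tokens;
}
```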
@ -31,6 +31,8 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#include "print.hpp"
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static gpt_params * g_params;
|
||||
|
@ -99,6 +101,7 @@ static void sigint_handler(int signo) {
|
|||
}
|
||||
}
|
||||
#endif
|
||||
using namespace refl;
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
|
@ -110,6 +113,9 @@ int main(int argc, char ** argv) {
|
|||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
//using Td = type_descriptor<gpt_params>;
|
||||
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
|
@ -124,6 +130,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// TODO: Dump params ?
|
||||
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
|
||||
print_fields(params);
|
||||
|
||||
// save choice to use color for later
|
||||
// (note for later: this is a slightly awkward choice)
|
||||
|
@ -241,6 +248,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
print_fields(*model);
|
||||
|
||||
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
if (params.chatml) {
|
||||
|
@ -284,7 +293,8 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_fields(*ctx);
|
||||
//print_fields(session_tokens);
|
||||
// debug message about similarity of saved session, if applicable
|
||||
size_t n_matching_session_tokens = 0;
|
||||
if (!session_tokens.empty()) {
|
||||
|
@ -372,6 +382,10 @@ int main(int argc, char ** argv) {
|
|||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
|
||||
print_fields(*ctx_guidance);
|
||||
|
||||
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
|
@ -481,6 +495,7 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
print_fields(*ctx_sampling);
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
|
@ -490,6 +505,7 @@ int main(int argc, char ** argv) {
|
|||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
//print_fields(embd);
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
@ -516,6 +532,7 @@ int main(int argc, char ** argv) {
|
|||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
print_fields(*ctx);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
|
@ -632,7 +649,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
print_fields(id);
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
@ -667,7 +684,7 @@ int main(int argc, char ** argv) {
|
|||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
printf("TOKEN:%s\n", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
|
@ -858,6 +875,9 @@ int main(int argc, char ** argv) {
|
|||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
// dont dump core
|
||||
//int *ptr = 0; *ptr = 1;
|
||||
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
|
|
@ -67,9 +67,12 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
candidates.emplace_back(llama_token_data(
|
||||
token_id,
|
||||
logits[token_id],
|
||||
0.0f));
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_token_data_array candidates_p(candidates.data(), candidates.size(), false );
|
||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||
|
||||
|
|
|
@ -33,8 +33,16 @@
|
|||
|
||||
using json = nlohmann::json;
|
||||
|
||||
struct server_params
|
||||
struct server_params : refl::attr::usage::type
|
||||
{
|
||||
|
||||
server_params():
|
||||
hostname( "127.0.0.1"),
|
||||
public_path(public_path),
|
||||
port(port),
|
||||
read_timeout(read_timeout),
|
||||
write_timeout( 600) {};
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "examples/server/public";
|
||||
int32_t port = 8080;
|
||||
|
@ -543,6 +551,28 @@ struct llama_server_context
|
|||
std::vector<task_multi> queue_multitasks;
|
||||
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||
std::mutex mutex_results;
|
||||
llama_server_context():
|
||||
model(nullptr),
|
||||
ctx(nullptr),
|
||||
clp_ctx(nullptr),
|
||||
params(params),
|
||||
batch(batch),
|
||||
multimodal(false),
|
||||
clean_kv_cache( true),
|
||||
all_slots_are_idle( false),
|
||||
add_bos_token( true),
|
||||
//int32_t id_gen;
|
||||
//int32_t n_ctx; // total context for all clients / slots
|
||||
system_need_update(false){}
|
||||
//std::string system_prompt;
|
||||
//std::vector<llama_token> system_tokens;
|
||||
//std::string name_user; // this should be the antiprompt
|
||||
//std::string name_assistant;
|
||||
//std::vector<llama_client_slot> slots;
|
||||
//std::vector<task_server> queue_tasks;
|
||||
//std::vector<task_result> queue_results;
|
||||
//std::mutex mutex_tasks;
|
||||
//std::mutex mutex_results;
|
||||
|
||||
~llama_server_context()
|
||||
{
|
||||
|
@ -1402,7 +1432,7 @@ struct llama_server_context
|
|||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
|
||||
{
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
llama_batch batch_view(
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
|
@ -1410,8 +1440,8 @@ struct llama_server_context
|
|||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
0, 0, 0 // unused
|
||||
);
|
||||
if (llama_decode(ctx, batch_view))
|
||||
{
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
|
@ -1818,17 +1848,18 @@ struct llama_server_context
|
|||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
|
||||
{
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view =
|
||||
{
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
llama_batch batch_view(
|
||||
/* .n_tokens= */n_tokens,
|
||||
/* .token= */batch.token + i,
|
||||
/* .embd= */nullptr,
|
||||
/* .pos= */batch.pos + i,
|
||||
/* .n_seq_id= */batch.n_seq_id + i,
|
||||
/* .seq_id= */batch.seq_id + i,
|
||||
/* .logits= */batch.logits + i,
|
||||
/* .all_pos_0= */.0,
|
||||
/* .all_pos_1= */0,
|
||||
/* .all_seq_id= */0 // unused
|
||||
);
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0)
|
||||
|
@ -1875,7 +1906,10 @@ struct llama_server_context
|
|||
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
llama_token_data_array cur_p(
|
||||
slot.ctx_sampling->cur.data(),
|
||||
slot.ctx_sampling->cur.size(),
|
||||
false );
|
||||
result.tok = id;
|
||||
|
||||
const int32_t n_probs = slot.sparams.n_probs;
|
||||
|
|
|
@ -124,10 +124,15 @@ int main(int argc, char ** argv) {
|
|||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
candidates.emplace_back(llama_token_data( token_id,
|
||||
logits[token_id],
|
||||
0.0f ));
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_token_data_array candidates_p(
|
||||
candidates.data(),
|
||||
candidates.size(),
|
||||
false );
|
||||
|
||||
// sample the most likely token
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
|
|
@ -311,7 +311,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
|||
const bool enable_flash_attn,
|
||||
const bool enable_checkpointing) {
|
||||
|
||||
ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
||||
assert(0);
|
||||
//ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
||||
const int n_past = 0;
|
||||
const int N = n_tokens;
|
||||
const auto & hparams = model->hparams;
|
||||
|
@ -599,10 +600,12 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
|
|||
|
||||
// set vocab by copying from vocab_model gguf file
|
||||
{
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
struct gguf_init_params params(
|
||||
//.no_alloc =
|
||||
false,
|
||||
//.ctx =
|
||||
NULL
|
||||
);
|
||||
struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params);
|
||||
|
||||
const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
|
||||
|
@ -744,9 +747,11 @@ static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_voc
|
|||
|
||||
static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
|
||||
struct ggml_context * f_ggml_ctx;
|
||||
struct gguf_init_params params;
|
||||
params.no_alloc = false;
|
||||
params.ctx = &f_ggml_ctx;
|
||||
struct gguf_init_params params(
|
||||
//params.no_alloc =
|
||||
false,
|
||||
//params.ctx =
|
||||
&f_ggml_ctx);
|
||||
struct gguf_context * fctx = gguf_init_from_file(filename, params);
|
||||
if (fctx == NULL) {
|
||||
return false;
|
||||
|
@ -1084,11 +1089,14 @@ int main(int argc, char ** argv) {
|
|||
ggml_allocr * alloc = NULL;
|
||||
|
||||
// context for input tensors without their data
|
||||
struct ggml_init_params ctx_input_params = {
|
||||
struct ggml_init_params ctx_input_params (
|
||||
//.mem_size =
|
||||
ggml_tensor_overhead() * 2, // mem_size
|
||||
// .mem_buffer =
|
||||
NULL, // mem_buffer
|
||||
true, // no_alloc
|
||||
};
|
||||
// .no_alloc =
|
||||
true // no_alloc
|
||||
);
|
||||
struct ggml_context * ctx_input = ggml_init(ctx_input_params);
|
||||
|
||||
// the input tensors
|
||||
|
@ -1113,11 +1121,14 @@ int main(int argc, char ** argv) {
|
|||
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
|
||||
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
|
||||
);
|
||||
struct ggml_init_params ctx_compute_params = {
|
||||
struct ggml_init_params ctx_compute_params(
|
||||
// .mem_size =
|
||||
estimated_compute_size_wo_data, // mem_size
|
||||
//.mem_buffer=
|
||||
NULL, // mem_buffer
|
||||
true, // no_alloc
|
||||
};
|
||||
//.no_alloc =
|
||||
true // no_alloc
|
||||
);
|
||||
struct ggml_context * ctx_compute = NULL;
|
||||
|
||||
struct ggml_tensor * loss = NULL;
|
||||
|
@ -1266,11 +1277,14 @@ int main(int argc, char ** argv) {
|
|||
printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
|
||||
|
||||
// context for work buffer
|
||||
struct ggml_init_params ctx_work_params = {
|
||||
max_work_size, // mem_size
|
||||
NULL, // mem_buffer
|
||||
false, // no_alloc
|
||||
};
|
||||
struct ggml_init_params ctx_work_params(
|
||||
//.mem_size=
|
||||
max_work_size, //
|
||||
//.mem_buffer=
|
||||
NULL, //
|
||||
//.no_alloc=
|
||||
false //
|
||||
);
|
||||
struct ggml_context * ctx_work = ggml_init(ctx_work_params);
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
|
|
@ -8,9 +8,9 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml-internal.hpp"
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MAX_FREE_BLOCKS 256
|
||||
|
||||
|
||||
//#define GGML_ALLOCATOR_DEBUG
|
||||
|
||||
|
@ -24,28 +24,7 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
|
|||
return offset + align;
|
||||
}
|
||||
|
||||
struct free_block {
|
||||
void * addr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct ggml_tallocr {
|
||||
struct ggml_backend_buffer * buffer;
|
||||
bool buffer_owned;
|
||||
void * base;
|
||||
size_t alignment;
|
||||
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
|
||||
size_t max_size;
|
||||
|
||||
bool measure;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
struct ggml_tensor * allocated_tensors[1024];
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||
|
@ -333,33 +312,20 @@ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
|
|||
|
||||
// graph allocator
|
||||
|
||||
struct hash_node {
|
||||
int n_children;
|
||||
int n_views;
|
||||
};
|
||||
|
||||
struct ggml_gallocr {
|
||||
ggml_tallocr_t talloc;
|
||||
struct ggml_hash_set hash_set;
|
||||
struct hash_node * hash_values;
|
||||
size_t hash_values_size;
|
||||
ggml_tallocr_t * hash_allocs;
|
||||
int * parse_seq;
|
||||
int parse_seq_len;
|
||||
};
|
||||
|
||||
ggml_gallocr_t ggml_gallocr_new(void) {
|
||||
ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
|
||||
|
||||
ggml_hash_set hs = {.size=0, .keys=NULL};
|
||||
*galloc = (struct ggml_gallocr) {
|
||||
/*.talloc = */ NULL,
|
||||
/*.hash_set = */ {0},
|
||||
/*.hash_values = */ NULL,
|
||||
/*.hash_values_size = */ 0,
|
||||
/*.hash_allocs = */ NULL,
|
||||
/*.parse_seq = */ NULL,
|
||||
/*.parse_seq_len = */ 0,
|
||||
.talloc = NULL,
|
||||
.hash_set =hs,
|
||||
.hash_values = NULL,
|
||||
.hash_values_size = 0,
|
||||
.hash_allocs = NULL,
|
||||
.parse_seq = NULL,
|
||||
.parse_seq_len = 0,
|
||||
};
|
||||
//((*galloc).hash_set)[0] = 0;
|
||||
|
||||
return galloc;
|
||||
}
|
||||
|
@ -698,16 +664,12 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
|
|||
|
||||
// legacy API wrapper
|
||||
|
||||
struct ggml_allocr {
|
||||
ggml_tallocr_t talloc;
|
||||
ggml_gallocr_t galloc;
|
||||
};
|
||||
|
||||
static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
|
||||
ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
|
||||
*alloc = (struct ggml_allocr) {
|
||||
/*.talloc = */ talloc,
|
||||
/*.galloc = */ ggml_gallocr_new(),
|
||||
.talloc = talloc,
|
||||
.galloc = ggml_gallocr_new(),
|
||||
};
|
||||
return alloc;
|
||||
}
|
|
@ -25,10 +25,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|||
GGML_ASSERT(iface.get_base != NULL);
|
||||
|
||||
(*buffer) = (struct ggml_backend_buffer) {
|
||||
/* .interface = */ iface,
|
||||
/* .backend = */ backend,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
.iface = iface,
|
||||
.backend = backend,
|
||||
.context = context,
|
||||
.size = size,
|
||||
};
|
||||
|
||||
return buffer;
|
||||
|
@ -586,11 +586,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
||||
sched->n_splits = 0;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size = */ sizeof(sched->context_buffer),
|
||||
/*.mem_buffer = */ sched->context_buffer,
|
||||
/*.no_alloc = */ true
|
||||
};
|
||||
struct ggml_init_params params(
|
||||
//.mem_size =
|
||||
sizeof(sched->context_buffer),
|
||||
//.mem_buffer =
|
||||
sched->context_buffer,
|
||||
//.no_alloc =
|
||||
true
|
||||
);
|
||||
|
||||
if (sched->ctx != NULL) {
|
||||
ggml_free(sched->ctx);
|
ggml-cuda.cu: 12 changes
|
@ -7675,12 +7675,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|||
#endif
|
||||
|
||||
// debug helpers
|
||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
||||
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
// printf("JSON: { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n",
|
||||
// src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
// ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name,
|
||||
// src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name,
|
||||
// dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name
|
||||
// );
|
||||
|
||||
if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// KQ single-batch
|
||||
|
|
|
@ -22,7 +22,7 @@ extern "C" {
|
|||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
//#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
258 ggml-internal.hpp
Normal file
|
@ -0,0 +1,258 @@
|
|||
struct ggml_context {
|
||||
size_t mem_size;
|
||||
void * mem_buffer;
|
||||
bool mem_buffer_owned;
|
||||
bool no_alloc;
|
||||
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
||||
|
||||
int n_objects;
|
||||
|
||||
struct ggml_object * objects_begin;
|
||||
struct ggml_object * objects_end;
|
||||
|
||||
struct ggml_scratch scratch;
|
||||
struct ggml_scratch scratch_save;
|
||||
|
||||
ggml_context():
|
||||
mem_size(0),
|
||||
mem_buffer(0),
|
||||
mem_buffer_owned(0),
|
||||
no_alloc(0),
|
||||
no_alloc_save(0),
|
||||
n_objects(0),
|
||||
objects_begin(0),
|
||||
objects_end(0),
|
||||
scratch(),
|
||||
scratch_save()
|
||||
{
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_context_container {
|
||||
bool used;
|
||||
|
||||
struct ggml_context context;
|
||||
|
||||
ggml_context_container(): used(0),context(){
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
typedef double ggml_float;
|
||||
typedef void * thread_ret_t;
|
||||
|
||||
#define MAX_FREE_BLOCKS 256
|
||||
|
||||
struct free_block {
|
||||
void * addr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct ggml_tallocr {
|
||||
struct ggml_backend_buffer * buffer;
|
||||
bool buffer_owned;
|
||||
void * base;
|
||||
size_t alignment;
|
||||
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
|
||||
size_t max_size;
|
||||
|
||||
bool measure;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
struct ggml_tensor * allocated_tensors[1024];
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
struct hash_node {
|
||||
int n_children;
|
||||
int n_views;
|
||||
};
|
||||
|
||||
typedef struct ggml_tallocr * ggml_tallocr_t;
|
||||
typedef struct ggml_gallocr * ggml_gallocr_t;
|
||||
|
||||
struct ggml_gallocr {
|
||||
ggml_tallocr_t talloc;
|
||||
struct ggml_hash_set hash_set;
|
||||
struct hash_node * hash_values;
|
||||
size_t hash_values_size;
|
||||
ggml_tallocr_t * hash_allocs;
|
||||
int * parse_seq;
|
||||
int parse_seq_len;
|
||||
};
|
||||
|
||||
struct ggml_allocr {
|
||||
ggml_tallocr_t talloc;
|
||||
ggml_gallocr_t galloc;
|
||||
};
|
||||
|
||||
#define GGML_NUMA_MAX_NODES 8
|
||||
#define GGML_NUMA_MAX_CPUS 512
|
||||
|
||||
struct ggml_numa_node {
|
||||
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
|
||||
uint32_t n_cpus;
|
||||
};
|
||||
|
||||
struct ggml_numa_nodes {
|
||||
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
||||
uint32_t n_nodes;
|
||||
uint32_t total_cpus; // hardware threads on system
|
||||
};
|
||||
|
||||
struct ggml_state {
|
||||
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
||||
struct ggml_numa_nodes numa;
|
||||
|
||||
ggml_state():contexts(), numa()
|
||||
{
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
struct gguf_str {
|
||||
uint64_t n; // GGUFv2
|
||||
char * data;
|
||||
};
|
||||
|
||||
struct ggml_map_custom1_op_params {
|
||||
ggml_custom1_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
struct ggml_map_custom2_op_params {
|
||||
ggml_custom2_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
struct ggml_map_custom3_op_params {
|
||||
ggml_custom3_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
struct hash_map {
|
||||
struct ggml_hash_set set;
|
||||
struct ggml_tensor ** vals;
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
typedef volatile LONG atomic_int;
|
||||
typedef atomic_int atomic_bool;
|
||||
#else
|
||||
#include<atomic>
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
struct ggml_compute_state_shared {
|
||||
const struct ggml_cgraph * cgraph;
|
||||
const struct ggml_cplan * cplan;
|
||||
|
||||
int64_t perf_node_start_cycles;
|
||||
int64_t perf_node_start_time_us;
|
||||
|
||||
const int n_threads;
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_active; // num active threads
|
||||
atomic_int node_n; // active graph node
|
||||
|
||||
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
||||
void * abort_callback_data;
|
||||
};
|
||||
typedef pthread_t ggml_thread_t;
|
||||
struct ggml_compute_state {
|
||||
ggml_thread_t thrd;
|
||||
int ith;
|
||||
struct ggml_compute_state_shared * shared;
|
||||
};
|
||||
|
||||
union gguf_value {
|
||||
uint8_t uint8;
|
||||
int8_t int8;
|
||||
uint16_t uint16;
|
||||
int16_t int16;
|
||||
uint32_t uint32;
|
||||
int32_t int32;
|
||||
float float32;
|
||||
uint64_t uint64;
|
||||
int64_t int64;
|
||||
double float64;
|
||||
bool bool_;
|
||||
|
||||
struct gguf_str str;
|
||||
|
||||
struct gguf_array_T {
|
||||
enum gguf_type type;
|
||||
|
||||
uint64_t n; // GGUFv2
|
||||
void * data;
|
||||
} arr;
|
||||
};
|
||||
|
||||
struct ggml_lbfgs_iteration_data {
|
||||
float alpha;
|
||||
float ys;
|
||||
float * s;
|
||||
float * y;
|
||||
};
|
||||
|
||||
struct gguf_kv {
|
||||
struct gguf_str key;
|
||||
|
||||
enum gguf_type type;
|
||||
union gguf_value value;
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct gguf_header {
|
||||
char magic[4];
|
||||
uint32_t version;
|
||||
uint64_t n_tensors; // GGUFv2
|
||||
uint64_t n_kv; // GGUFv2
|
||||
};
|
||||
|
||||
struct gguf_tensor_info {
|
||||
struct gguf_str name;
|
||||
|
||||
uint32_t n_dims;
|
||||
uint64_t ne[GGML_MAX_DIMS];
|
||||
|
||||
enum ggml_type type;
|
||||
|
||||
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
|
||||
|
||||
// for writing API
|
||||
const void * data;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct gguf_context {
|
||||
struct gguf_header header;
|
||||
|
||||
struct gguf_kv * kv;
|
||||
struct gguf_tensor_info * infos;
|
||||
|
||||
size_t alignment;
|
||||
size_t offset; // offset of `data` from beginning of file
|
||||
size_t size; // size of `data` in bytes
|
||||
|
||||
//uint8_t * padding;
|
||||
void * data;
|
||||
};
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
|
||||
|
||||
#include "ggml-backend-impl.h"
|
File diff suppressed because it is too large
|
@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
|
|||
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
||||
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
|
||||
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
|
||||
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
|
||||
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
|
||||
void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
|
||||
void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
|
||||
void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
|
||||
void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
|
||||
void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
|
||||
void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
|
||||
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
|
||||
void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
|
||||
void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
|
||||
void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
|
||||
void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
|
||||
void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
|
||||
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
|
||||
|
|
1489 ggml.c → ggml.cpp
File diff suppressed because it is too large
57 ggml.h
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include<refl-cpp/refl.hpp>
|
||||
//
|
||||
// GGML Tensor Library
|
||||
//
|
||||
|
@ -284,7 +285,7 @@
|
|||
GGML_UNUSED(prefix##3);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
//extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
|
@ -464,7 +465,7 @@ extern "C" {
|
|||
};
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
struct ggml_object : refl::attr::usage::type {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
|
@ -478,7 +479,7 @@ extern "C" {
|
|||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
struct ggml_tensor : refl::attr::usage::type{
|
||||
enum ggml_type type;
|
||||
enum ggml_backend_type backend;
|
||||
|
||||
|
@ -523,7 +524,7 @@ extern "C" {
|
|||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
struct ggml_cplan : refl::attr::usage::type{
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
|
@ -540,13 +541,13 @@ extern "C" {
|
|||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
struct ggml_hash_set {
|
||||
struct ggml_hash_set : refl::attr::usage::type{
|
||||
size_t size;
|
||||
struct ggml_tensor ** keys;
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
struct ggml_cgraph : refl::attr::usage::type{
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
@ -566,13 +567,31 @@ extern "C" {
|
|||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
struct ggml_scratch : refl::attr::usage::type{
|
||||
size_t offs;
|
||||
size_t size;
|
||||
void * data;
|
||||
|
||||
ggml_scratch()
|
||||
: offs(0),
|
||||
size(0),
|
||||
data(0)
|
||||
{}
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
struct ggml_init_params : refl::attr::usage::type{
|
||||
|
||||
ggml_init_params(size_t mem_size,
|
||||
void * mem_buffer,
|
||||
bool no_alloc):
|
||||
mem_size( mem_size),
|
||||
mem_buffer(mem_buffer),
|
||||
no_alloc(no_alloc){}
|
||||
ggml_init_params():
|
||||
mem_size(0),
|
||||
mem_buffer(0),
|
||||
no_alloc(0){}
|
||||
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
|
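Editor's note: since ggml_init_params now has an explicit constructor (see the hunk above), call sites such as the scheduler change earlier in this diff can pass the fields positionally instead of using C99 designated initializers, which pre-C++20 compilers reject. A minimal, hypothetical usage sketch (the 16 MiB size is just an example value):

// Build the params with the new constructor and hand them to ggml_init().
struct ggml_init_params params(
    /* mem_size   */ 16u * 1024u * 1024u, // example: 16 MiB pool
    /* mem_buffer */ nullptr,             // NULL -> ggml allocates internally
    /* no_alloc   */ false);
struct ggml_context * ctx = ggml_init(params);
// ... build tensors and graphs with ctx ...
ggml_free(ctx);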
@ -590,7 +609,7 @@ extern "C" {
|
|||
GGML_TASK_FINALIZE,
|
||||
};
|
||||
|
||||
struct ggml_compute_params {
|
||||
struct ggml_compute_params : refl::attr::usage::type{
|
||||
enum ggml_task_type type;
|
||||
|
||||
// ith = thread index, nth = number of threads
|
||||
|
@ -1836,7 +1855,7 @@ extern "C" {
|
|||
//
|
||||
// see ggml.c (ggml_opt_default_params) for default values
|
||||
//
|
||||
struct ggml_opt_params {
|
||||
struct ggml_opt_params : refl::attr::usage::type{
|
||||
enum ggml_opt_type type;
|
||||
|
||||
size_t graph_size;
|
||||
|
@ -1866,7 +1885,7 @@ extern "C" {
|
|||
int n_gradient_accumulation;
|
||||
|
||||
// ADAM parameters
|
||||
struct {
|
||||
struct ggml_adam: refl::attr::usage::type{
|
||||
int n_iter;
|
||||
|
||||
float sched; // schedule multiplier (fixed, decay or warmup)
|
||||
|
@ -1882,7 +1901,7 @@ extern "C" {
|
|||
} adam;
|
||||
|
||||
// LBFGS parameters
|
||||
struct {
|
||||
struct ggml_lbfgs: refl::attr::usage::type{
|
||||
int m; // number of corrections to approximate the inv. Hessian
|
||||
int n_iter;
|
||||
int max_linesearch;
|
||||
|
@ -1897,7 +1916,7 @@ extern "C" {
|
|||
} lbfgs;
|
||||
};
|
||||
|
||||
struct ggml_opt_context {
|
||||
struct ggml_opt_context : refl::attr::usage::type{
|
||||
struct ggml_context * ctx;
|
||||
struct ggml_opt_params params;
|
||||
|
||||
|
@ -1909,7 +1928,7 @@ extern "C" {
|
|||
float loss_before;
|
||||
float loss_after;
|
||||
|
||||
struct {
|
||||
struct ggml_grad : refl::attr::usage::type{
|
||||
struct ggml_tensor * g; // current gradient
|
||||
struct ggml_tensor * m; // first moment
|
||||
struct ggml_tensor * v; // second moment
|
||||
|
@ -1919,7 +1938,7 @@ extern "C" {
|
|||
int n_no_improvement;
|
||||
} adam;
|
||||
|
||||
struct {
|
||||
struct ggml_params : refl::attr::usage::type{
|
||||
struct ggml_tensor * x; // current parameters
|
||||
struct ggml_tensor * xp; // previous parameters
|
||||
struct ggml_tensor * g; // current gradient
|
||||
|
@ -2012,7 +2031,9 @@ extern "C" {
|
|||
|
||||
struct gguf_context;
|
||||
|
||||
struct gguf_init_params {
|
||||
struct gguf_init_params : refl::attr::usage::type{
|
||||
gguf_init_params(bool no_alloc, struct ggml_context ** ctx): no_alloc(no_alloc),ctx(ctx){}
|
||||
|
||||
bool no_alloc;
|
||||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
|
@ -2149,7 +2170,7 @@ extern "C" {
|
|||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||
|
||||
typedef struct {
|
||||
typedef struct ggml_something : refl::attr::usage::type{
|
||||
const char * type_name;
|
||||
int blck_size;
|
||||
size_t type_size;
|
||||
|
@ -2164,5 +2185,5 @@ extern "C" {
|
|||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
|
|
896 llama-internal.hpp
Normal file
|
@ -0,0 +1,896 @@
|
|||
#include <set>
|
||||
#include <queue>
|
||||
enum llm_arch {
|
||||
LLM_ARCH_LLAMA,
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
LLM_ARCH_GPT2,
|
||||
LLM_ARCH_GPTJ,
|
||||
LLM_ARCH_GPTNEOX,
|
||||
LLM_ARCH_MPT,
|
||||
LLM_ARCH_STARCODER,
|
||||
LLM_ARCH_PERSIMMON,
|
||||
LLM_ARCH_REFACT,
|
||||
LLM_ARCH_BLOOM,
|
||||
LLM_ARCH_STABLELM,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
LLM_KV_GENERAL_ARCHITECTURE,
|
||||
LLM_KV_GENERAL_QUANTIZATION_VERSION,
|
||||
LLM_KV_GENERAL_ALIGNMENT,
|
||||
LLM_KV_GENERAL_NAME,
|
||||
LLM_KV_GENERAL_AUTHOR,
|
||||
LLM_KV_GENERAL_URL,
|
||||
LLM_KV_GENERAL_DESCRIPTION,
|
||||
LLM_KV_GENERAL_LICENSE,
|
||||
LLM_KV_GENERAL_SOURCE_URL,
|
||||
LLM_KV_GENERAL_SOURCE_HF_REPO,
|
||||
|
||||
LLM_KV_CONTEXT_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH,
|
||||
LLM_KV_BLOCK_COUNT,
|
||||
LLM_KV_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_USE_PARALLEL_RESIDUAL,
|
||||
LLM_KV_TENSOR_DATA_LAYOUT,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
|
||||
LLM_KV_ATTENTION_CLAMP_KQV,
|
||||
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
||||
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
LLM_KV_ROPE_SCALE_LINEAR,
|
||||
LLM_KV_ROPE_SCALING_TYPE,
|
||||
LLM_KV_ROPE_SCALING_FACTOR,
|
||||
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
||||
LLM_KV_ROPE_SCALING_FINETUNED,
|
||||
|
||||
LLM_KV_TOKENIZER_MODEL,
|
||||
LLM_KV_TOKENIZER_LIST,
|
||||
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
||||
LLM_KV_TOKENIZER_SCORES,
|
||||
LLM_KV_TOKENIZER_MERGES,
|
||||
LLM_KV_TOKENIZER_BOS_ID,
|
||||
LLM_KV_TOKENIZER_EOS_ID,
|
||||
LLM_KV_TOKENIZER_UNK_ID,
|
||||
LLM_KV_TOKENIZER_SEP_ID,
|
||||
LLM_KV_TOKENIZER_PAD_ID,
|
||||
LLM_KV_TOKENIZER_ADD_BOS,
|
||||
LLM_KV_TOKENIZER_ADD_EOS,
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
};
|
||||
|
||||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_1B,
|
||||
MODEL_3B,
|
||||
MODEL_7B,
|
||||
MODEL_8B,
|
||||
MODEL_13B,
|
||||
MODEL_15B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
};
|
||||
|
||||
enum llama_fver {
|
||||
GGUF_FILE_VERSION_V1 = 1,
|
||||
GGUF_FILE_VERSION_V2 = 2,
|
||||
GGUF_FILE_VERSION_V3 = 3,
|
||||
};
|
||||
|
||||
struct LLM_KV {
|
||||
LLM_KV(llm_arch arch) : arch(arch) {}
|
||||
|
||||
llm_arch arch;
|
||||
|
||||
std::string operator()(llm_kv kv) const; // moved to llama.cpp file
|
||||
|
||||
};
|
||||
|
||||
enum llm_tensor {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_TOKEN_EMBD_NORM,
|
||||
LLM_TENSOR_POS_EMBD,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
LLM_TENSOR_ATTN_QKV,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_NORM_2,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
};
|
||||
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
uint32_t n_batch;
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
|
||||
uint32_t n_yarn_orig_ctx;
|
||||
// These hyperparameters are not exposed in GGUF, because all
|
||||
// existing YaRN models use the same values for them.
|
||||
float yarn_ext_factor;
|
||||
float yarn_attn_factor;
|
||||
float yarn_beta_fast;
|
||||
float yarn_beta_slow;
|
||||
|
||||
bool mul_mat_q;
|
||||
};
|
||||
|
||||
struct llama_layer {
|
||||
// normalization
|
||||
struct ggml_tensor * attn_norm;
|
||||
struct ggml_tensor * attn_norm_b;
|
||||
struct ggml_tensor * attn_norm_2;
|
||||
struct ggml_tensor * attn_norm_2_b;
|
||||
struct ggml_tensor * attn_q_norm;
|
||||
struct ggml_tensor * attn_q_norm_b;
|
||||
struct ggml_tensor * attn_k_norm;
|
||||
struct ggml_tensor * attn_k_norm_b;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * wq;
|
||||
struct ggml_tensor * wk;
|
||||
struct ggml_tensor * wv;
|
||||
struct ggml_tensor * wo;
|
||||
struct ggml_tensor * wqkv;
|
||||
|
||||
// attention bias
|
||||
struct ggml_tensor * bo;
|
||||
struct ggml_tensor * bqkv;
|
||||
|
||||
// normalization
|
||||
struct ggml_tensor * ffn_norm;
|
||||
struct ggml_tensor * ffn_norm_b;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * ffn_gate; // w1
|
||||
struct ggml_tensor * ffn_down; // w2
|
||||
struct ggml_tensor * ffn_up; // w3
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_buffer {
|
||||
void * data = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
// fallback to malloc / free
|
||||
// useful in cases where CUDA can try to allocate PINNED memory
|
||||
bool fallback = false;
|
||||
|
||||
void resize(size_t n) ;
|
||||
|
||||
|
||||
~llama_buffer();
|
||||
|
||||
};
|
||||
|
||||
// ring-buffer of cached KV data
|
||||
struct llama_kv_cache {
|
||||
bool has_shift = false;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_internal also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<llama_kv_cell> cells;
|
||||
|
||||
struct ggml_tensor * k = NULL;
|
||||
struct ggml_tensor * v = NULL;
|
||||
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
llama_buffer buf;
|
||||
|
||||
~llama_kv_cache() {
|
||||
if (ctx) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (ggml_cublas_loaded()) {
|
||||
ggml_cuda_free_data(k);
|
||||
ggml_cuda_free_data(v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
using ttype = llama_token_type;
|
||||
|
||||
struct token_data {
|
||||
token text;
|
||||
float score;
|
||||
ttype type;
|
||||
};
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_data> id_to_token;
|
||||
|
||||
std::unordered_map<token, id> special_tokens_cache;
|
||||
|
||||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||
|
||||
// default LLaMA special tokens
|
||||
id special_bos_id = 1;
|
||||
id special_eos_id = 2;
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = -1;
|
||||
id special_pad_id = -1;
|
||||
|
||||
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
|
||||
id linefeed_id = 13;
|
||||
id special_prefix_id = 32007;
|
||||
id special_middle_id = 32009;
|
||||
id special_suffix_id = 32008;
|
||||
id special_eot_id = 32010;
|
||||
|
||||
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
||||
GGML_ASSERT(token_left.find(" ") == std::string::npos);
|
||||
GGML_ASSERT(token_left.find("\n") == std::string::npos);
|
||||
GGML_ASSERT(token_right.find(" ") == std::string::npos);
|
||||
GGML_ASSERT(token_right.find("\n") == std::string::npos);
|
||||
|
||||
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
||||
if (it == bpe_ranks.end()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return it->second;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
llama_mmap(const llama_mmap &) = delete;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
|
||||
~llama_mmap();
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_head;
|
||||
uint32_t n_head_kv;
|
||||
uint32_t n_layer;
|
||||
uint32_t n_rot;
|
||||
uint32_t n_ff;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_yarn_orig_ctx;
|
||||
int8_t rope_scaling_type_train : 3;
|
||||
bool rope_finetuned : 1;
|
||||
|
||||
float f_clamp_kqv;
|
||||
float f_max_alibi_bias;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const;
|
||||
uint32_t n_gqa() const {
|
||||
return n_head/n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_head() const {
|
||||
return n_embd/n_head;
|
||||
}
|
||||
|
||||
uint32_t n_embd_gqa() const {
|
||||
return n_embd/n_gqa();
|
||||
}
|
||||
};
|
||||
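Editor's note: to make the helpers above concrete, with typical 7B LLaMA values (an assumption, not read from this diff) of n_embd = 4096, n_head = 32 and n_head_kv = 32, they evaluate as in this sketch:

// Hypothetical values for a 7B LLaMA-style model, used only to illustrate
// the helper methods of llama_hparams above.
llama_hparams hp = {};
hp.n_embd    = 4096;
hp.n_head    = 32;
hp.n_head_kv = 32; // no grouped-query attention: every head has its own KV head

// n_gqa()       = n_head / n_head_kv = 32 / 32   = 1
// n_embd_head() = n_embd / n_head    = 4096 / 32 = 128
// n_embd_gqa()  = n_embd / n_gqa()   = 4096 / 1  = 4096 (full-size KV rows)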
|
||||
struct llama_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
llama_mlock() ;
|
||||
|
||||
llama_mlock(const llama_mlock &) = delete;
|
||||
~llama_mlock();
|
||||
void init(void * ptr);
|
||||
void grow_to(size_t target_size);
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
static size_t lock_granularity();
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
bool raw_lock(const void * addr, size_t size) const ;
|
||||
#undef MLOCK_SUGGESTION
|
||||
static void raw_unlock(void * addr, size_t size);
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
static size_t lock_granularity();
|
||||
bool raw_lock(void * ptr, size_t len) const ;
|
||||
static void raw_unlock(void * ptr, size_t len);
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
static size_t lock_granularity();
|
||||
bool raw_lock(const void * addr, size_t len) const;
|
||||
static void raw_unlock(const void * addr, size_t len);
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
struct llama_model {
|
||||
e_model type = MODEL_UNKNOWN;
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||
|
||||
std::string name = "n/a";
|
||||
|
||||
llama_hparams hparams = {};
|
||||
llama_vocab vocab;
|
||||
|
||||
struct ggml_tensor * tok_embd;
|
||||
struct ggml_tensor * pos_embd;
|
||||
struct ggml_tensor * tok_norm;
|
||||
struct ggml_tensor * tok_norm_b;
|
||||
|
||||
struct ggml_tensor * output_norm;
|
||||
struct ggml_tensor * output_norm_b;
|
||||
struct ggml_tensor * output;
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
int n_gpu_layers;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
// context
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
// the model memory buffer
|
||||
llama_buffer buf;
|
||||
|
||||
// model memory mapped file
|
||||
std::unique_ptr<llama_mmap> mapping;
|
||||
|
||||
// objects representing data potentially being locked in memory
|
||||
llama_mlock mlock_buf;
|
||||
llama_mlock mlock_mmap;
|
||||
|
||||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
~llama_model() {
|
||||
if (ctx) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (ggml_cublas_loaded()) {
|
||||
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||
ggml_cuda_free_data(tensors_by_name[i].second);
|
||||
}
|
||||
ggml_cuda_free_scratch();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||
ggml_cl_free_data(tensors_by_name[i].second);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
|
||||
~llama_context();
|
||||
|
||||
llama_cparams cparams;
|
||||
|
||||
const llama_model & model;
|
||||
|
||||
// key + value cache for the self attention
|
||||
struct llama_kv_cache kv_self;
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_start_us;
|
||||
int64_t t_load_us;
|
||||
int64_t t_sample_us = 0;
|
||||
int64_t t_p_eval_us = 0;
|
||||
int64_t t_eval_us = 0;
|
||||
|
||||
int32_t n_sample = 0; // number of tokens sampled
|
||||
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
int32_t n_eval = 0; // number of eval calls
|
||||
|
||||
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||
std::vector<float> logits;
|
||||
bool logits_all = false;
|
||||
|
||||
// input embedding (1-dimensional array: [n_embd])
|
||||
std::vector<float> embedding;
|
||||
|
||||
// reusable buffer for `struct ggml_graph_plan.work_data`
|
||||
std::vector<uint8_t> work_buffer;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
llama_buffer buf_compute;
|
||||
|
||||
llama_buffer buf_alloc;
|
||||
ggml_allocr * alloc = NULL;
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
ggml_metal_context * ctx_metal = NULL;
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
struct LLM_TN {
|
||||
LLM_TN(llm_arch arch) ;
|
||||
|
||||
llm_arch arch;
|
||||
|
||||
std::string operator()(llm_tensor tensor) const;
|
||||
|
||||
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
|
||||
|
||||
std::string operator()(llm_tensor tensor, int bid) const ;
|
||||
|
||||
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) ;
|
||||
size_t tell() const;
|
||||
void seek(size_t offset, int whence) const;
|
||||
void read_raw(void * ptr, size_t len) const;
|
||||
uint32_t read_u32() const;
|
||||
void write_raw(const void * ptr, size_t len) const ;
|
||||
void write_u32(std::uint32_t val) const;
|
||||
~llama_file();
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct llama_state {
|
||||
llama_state();
|
||||
// We save the log callback globally
|
||||
ggml_log_callback log_callback;
|
||||
void * log_callback_user_data = nullptr;
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct llama_model_loader {
|
||||
int n_kv = 0;
|
||||
int n_tensors = 0;
|
||||
int n_created = 0;
|
||||
|
||||
int64_t n_elements = 0;
|
||||
size_t n_bytes = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
|
||||
llama_file file;
|
||||
llama_ftype ftype;
|
||||
llama_fver fver;
|
||||
|
||||
std::unique_ptr<llama_mmap> mapping;
|
||||
|
||||
struct gguf_context * ctx_gguf = NULL;
|
||||
struct ggml_context * ctx_meta = NULL;
|
||||
|
||||
llama_model_loader(const std::string & fname, bool use_mmap) ;
|
||||
|
||||
~llama_model_loader();
|
||||
|
||||
std::string get_arch_name() const;
|
||||
|
||||
enum llm_arch get_arch() const ;
|
||||
const char * get_tensor_name(int i) const;
|
||||
|
||||
struct ggml_tensor * get_tensor_meta(int i) const;
|
||||
|
||||
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;
|
||||
|
||||
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ;
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) ;
|
||||
|
||||
void done_getting_tensors() const;
|
||||
|
||||
size_t file_offset(const char * name) const;
|
||||
|
||||
|
||||
void load_data_for(struct ggml_tensor * cur) const ;
|
||||
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ;
|
||||
};
|
||||
|
||||
struct llama_data_context {
|
||||
virtual void write(const void * src, size_t size) = 0;
|
||||
virtual size_t get_size_written() = 0;
|
||||
virtual ~llama_data_context() = default;
|
||||
};
|
||||
|
||||
struct llama_data_buffer_context : llama_data_context {
|
||||
uint8_t * ptr;
|
||||
size_t size_written = 0;
|
||||
llama_data_buffer_context(uint8_t * p) ;
|
||||
void write(const void * src, size_t size) override ;
|
||||
size_t get_size_written() override ;
|
||||
};
|
||||
|
||||
struct llama_data_file_context : llama_data_context {
|
||||
llama_file * file;
|
||||
size_t size_written = 0;
|
||||
llama_data_file_context(llama_file * f);
|
||||
size_t get_size_written() override ;
|
||||
void write(const void * src, size_t size);
|
||||
};
|
||||
|
||||
|
||||
struct llama_beam {
|
||||
std::vector<llama_token> tokens;
|
||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
||||
// Sort beams by probability. In case of ties, prefer beams at eob.
|
||||
bool operator<(const llama_beam & rhs) const ;
|
||||
void shift_tokens(const size_t n) ;
|
||||
llama_beam_view view() const;
|
||||
};
|
||||
|
||||
// A struct for calculating logit-related info.
|
||||
struct llama_logit_info {
|
||||
const float * const logits;
|
||||
const int n_vocab;
|
||||
const float max_l;
|
||||
const float normalizer;
|
||||
struct sum_exp {
|
||||
float max_l;
|
||||
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
||||
};
|
||||
llama_logit_info(llama_context * ctx);
|
||||
llama_token_data get_token_data(const llama_token token_id) const ;
|
||||
std::vector<llama_token_data> top_k(size_t k) ;
|
||||
float probability_from_logit(float logit) const ;
|
||||
};
|
||||
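Editor's note: the max_l / normalizer pair in llama_logit_info is the usual log-sum-exp softmax trick. The body of probability_from_logit is not part of this hunk, so the sketch below is only a guess at how these members fit together:

// Assumed behaviour (not taken from this diff): probabilities are computed as
//   p_i = exp(l_i - max_l) / sum_j exp(l_j - max_l)
// with max_l subtracted first so the exponentials cannot overflow.
#include <cmath>
float probability_from_logit_sketch(const llama_logit_info & info, float logit) {
    // info.normalizer is assumed to hold 1 / sum_j exp(l_j - max_l),
    // accumulated with the sum_exp functor declared above.
    return std::exp(logit - info.max_l) * info.normalizer;
}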
|
||||
|
||||
struct llama_beam_search_data {
|
||||
llama_context * ctx;
|
||||
size_t n_beams;
|
||||
int n_past;
|
||||
int n_predict;
|
||||
std::vector<llama_beam> beams;
|
||||
std::vector<llama_beam> next_beams;
|
||||
size_t common_prefix_length;
|
||||
std::vector<llama_beam_view> beam_views;
|
||||
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);
|
||||
void collapse_beams(const size_t beam_idx) ;
|
||||
void fill_next_beams_by_top_probabilities(llama_beam & beam) ;
|
||||
size_t find_common_prefix_length() ;
|
||||
llama_beams_state get_beams_state(const bool last_call) ;
|
||||
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
|
||||
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) ;
|
||||
size_t top_beam_index();
|
||||
void update_beams_from_beam_views();
|
||||
};
|
||||
|
||||
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
||||
|
||||
enum llm_rope_type {
|
||||
LLM_ROPE,
|
||||
LLM_ROPE_NEOX,
|
||||
LLM_ROPE_GLM,
|
||||
};
|
||||
|
||||
enum llm_ffn_op_type {
|
||||
LLM_FFN_SILU,
|
||||
LLM_FFN_GELU,
|
||||
LLM_FFN_RELU,
|
||||
LLM_FFN_RELU_SQR,
|
||||
};
|
||||
|
||||
enum llm_ffn_gate_type {
|
||||
LLM_FFN_SEQ,
|
||||
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
|
||||
};
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
};
|
||||
|
||||
struct llm_build_context {
|
||||
const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
const llama_batch & batch;
|
||||
const llama_kv_cache & kv_self;
|
||||
|
||||
const int64_t n_embd;
|
||||
const int64_t n_layer;
|
||||
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
|
||||
const int64_t n_head;
|
||||
const int64_t n_head_kv;
|
||||
const int64_t n_embd_head;
|
||||
const int64_t n_embd_gqa;
|
||||
|
||||
const float freq_base;
|
||||
const float freq_scale;
|
||||
const float ext_factor;
|
||||
const float attn_factor;
|
||||
const float beta_fast;
|
||||
const float beta_slow;
|
||||
const float norm_eps;
|
||||
const float norm_rms_eps;
|
||||
|
||||
const int32_t n_tokens;
|
||||
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
|
||||
const int32_t kv_head; // index of where we store new KV data in the cache
|
||||
const int32_t n_orig_ctx;
|
||||
|
||||
const bool do_rope_shift;
|
||||
|
||||
const llm_build_cb & cb;
|
||||
|
||||
llama_buffer & buf_compute;
|
||||
|
||||
struct ggml_context * ctx0 = nullptr;
|
||||
|
||||
// TODO: consider making the entire interface noexcept
|
||||
llm_build_context(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch,
|
||||
const llm_build_cb & cb,
|
||||
bool worst_case);
|
||||
|
||||
void init() ;
|
||||
void free() ;
|
||||
struct ggml_cgraph * build_llama() ;
|
||||
struct ggml_cgraph * build_baichuan() ;
|
||||
struct ggml_cgraph * build_falcon() ;
|
||||
struct ggml_cgraph * build_starcoder() ;
|
||||
struct ggml_cgraph * build_persimmon() ;
|
||||
struct ggml_cgraph * build_refact() ;
|
||||
struct ggml_cgraph * build_bloom() ;
|
||||
struct ggml_cgraph * build_mpt() ;
|
||||
struct ggml_cgraph * build_stablelm();
|
||||
};
|
||||
|
||||
|
||||
enum llm_offload_func_e {
|
||||
OFFLOAD_FUNC_NOP,
|
||||
OFFLOAD_FUNC,
|
||||
OFFLOAD_FUNC_KQ,
|
||||
OFFLOAD_FUNC_V,
|
||||
OFFLOAD_FUNC_NR,
|
||||
OFFLOAD_FUNC_EMB,
|
||||
OFFLOAD_FUNC_OUT,
|
||||
};
|
||||
|
||||
struct llm_offload_trie {
|
||||
struct node {
|
||||
~node() ;
|
||||
node * children[256] = { nullptr };
|
||||
llm_offload_func_e func = OFFLOAD_FUNC_NOP;
|
||||
};
|
||||
node * root = nullptr;
|
||||
llm_offload_trie();
|
||||
llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) ;
|
||||
~llm_offload_trie();
|
||||
void add(const char * name, llm_offload_func_e func);
|
||||
llm_offload_func_e find(const char * name) const;
|
||||
|
||||
};
|
||||
|
||||
struct llm_symbol {
|
||||
using index = int;
|
||||
index prev;
|
||||
index next;
|
||||
const char * text;
|
||||
size_t n;
|
||||
};
|
||||
|
||||
|
||||
struct llm_bigram_spm {
|
||||
struct comparator {
|
||||
bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
|
||||
};
|
||||
using queue_storage = std::vector<llm_bigram_spm>;
|
||||
using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
|
||||
llm_symbol::index left;
|
||||
llm_symbol::index right;
|
||||
float score;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct llm_tokenizer_spm {
|
||||
llm_tokenizer_spm(const llama_vocab & vocab);
|
||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
|
||||
|
||||
|
||||
private:
|
||||
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) ;
|
||||
void try_add_bigram(int left, int right) ;
|
||||
const llama_vocab & vocab;
|
||||
|
||||
std::vector<llm_symbol> symbols;
|
||||
llm_bigram_spm::queue work_queue;
|
||||
|
||||
std::map<std::string, std::pair<int, int>> rev_merge;
|
||||
};
|
||||
|
||||
// BPE tokenizer
|
||||
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
|
||||
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
|
||||
|
||||
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
|
||||
|
||||
struct llm_bigram_bpe {
|
||||
struct comparator {
|
||||
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ;
|
||||
};
|
||||
|
||||
using queue_storage = std::vector<llm_bigram_bpe>;
|
||||
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
|
||||
llm_symbol::index left;
|
||||
llm_symbol::index right;
|
||||
std::string text;
|
||||
int rank;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct llm_tokenizer_bpe {
|
||||
llm_tokenizer_bpe(const llama_vocab & vocab);
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
|
||||
|
||||
private:
|
||||
void add_new_bigram(int left, int right) ;
|
||||
|
||||
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) ;
|
||||
|
||||
const llama_vocab & vocab;
|
||||
|
||||
std::vector<llm_symbol> symbols;
|
||||
std::vector<llm_symbol> symbols_final;
|
||||
|
||||
llm_bigram_bpe::queue work_queue;
|
||||
};
|
||||
|
||||
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
|
||||
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
|
||||
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
|
||||
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
||||
|
||||
struct fragment_buffer_variant{
|
||||
fragment_buffer_variant(llama_vocab::id _token);
|
||||
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);
|
||||
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
||||
const llama_vocab::id token;
|
||||
const std::string _dummy;
|
||||
const std::string & raw_text;
|
||||
const uint64_t offset;
|
||||
const uint64_t length;
|
||||
};
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
uint32_t value; // bit value so far (unshifted)
|
||||
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
||||
};
|
||||
|
||||
struct llama_grammar {
|
||||
const std::vector<std::vector<llama_grammar_element>> rules;
|
||||
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
||||
|
||||
// buffer for partially generated UTF-8 sequence from accepted tokens
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
struct llama_grammar_candidate {
|
||||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
struct quantize_state_internal {
|
||||
const llama_model & model;
|
||||
const llama_model_quantize_params * params;
|
||||
|
||||
int n_attention_wv = 0;
|
||||
int n_feed_forward_w2 = 0;
|
||||
int i_attention_wv = 0;
|
||||
int i_feed_forward_w2 = 0;
|
||||
|
||||
int n_k_quantized = 0;
|
||||
int n_fallback = 0;
|
||||
|
||||
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
||||
: model(model)
|
||||
, params(params)
|
||||
{}
|
||||
};
|
45 llama.h
|
@ -50,7 +50,7 @@
|
|||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
//extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
|
@ -115,12 +115,20 @@ extern "C" {
|
|||
};
|
||||
|
||||
typedef struct llama_token_data {
|
||||
llama_token_data( llama_token id, float logit, float p):
|
||||
id( id),logit(logit),p(p){ }
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
float p; // probability of the token
|
||||
} llama_token_data;
|
||||
|
||||
typedef struct llama_token_data_array {
|
||||
llama_token_data_array(llama_token_data * data,
|
||||
size_t size,
|
||||
bool sorted):
|
||||
data(data),
|
||||
size(size),
|
||||
sorted(sorted){}
|
||||
llama_token_data * data;
|
||||
size_t size;
|
||||
bool sorted;
|
||||
|
@ -139,6 +147,29 @@ extern "C" {
|
|||
// - logits : if zero, the logits for the respective token will not be output
|
||||
//
|
||||
typedef struct llama_batch {
|
||||
|
||||
llama_batch(int32_t n_tokens,
|
||||
llama_token * token,
|
||||
float * embd,
|
||||
llama_pos * pos,
|
||||
int32_t * n_seq_id,
|
||||
llama_seq_id ** seq_id,
|
||||
int8_t * logits,
|
||||
llama_pos all_pos_0,
|
||||
llama_pos all_pos_1,
|
||||
llama_seq_id all_seq_id
|
||||
) :
|
||||
n_tokens(n_tokens),
|
||||
token(token),
|
||||
embd(embd),
|
||||
pos(pos),
|
||||
n_seq_id(n_seq_id),
|
||||
seq_id(seq_id),
|
||||
logits(logits),
|
||||
all_pos_0(all_pos_0),
|
||||
all_pos_1(all_pos_1),
|
||||
all_seq_id(all_seq_id) {}
|
||||
|
||||
int32_t n_tokens;
|
||||
|
||||
llama_token * token;
|
||||
|
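Editor's note: with the constructor this diff adds to llama_batch, a caller can build a simple single-sequence prompt batch in one expression. The token values below are hypothetical, and passing nullptr for the per-token fields relies on the all_pos_0 / all_pos_1 / all_seq_id fallbacks, which this sketch assumes still behave as before:

// Hypothetical usage of the new llama_batch constructor for a plain prompt.
#include <vector>
std::vector<llama_token> toks = { 1, 15043, 3186 }; // example token ids

llama_batch batch(
    /* n_tokens   */ (int32_t) toks.size(),
    /* token      */ toks.data(),
    /* embd       */ nullptr, // tokens given, so no raw embeddings
    /* pos        */ nullptr, // derive positions from all_pos_0 / all_pos_1
    /* n_seq_id   */ nullptr,
    /* seq_id     */ nullptr,
    /* logits     */ nullptr, // default: logits only for the last token
    /* all_pos_0  */ 0,       // first position
    /* all_pos_1  */ 1,       // position stride
    /* all_seq_id */ 0);      // single sequence 0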
@ -174,7 +205,7 @@ extern "C" {
|
|||
bool use_mlock; // force system to keep model in RAM
|
||||
};
|
||||
|
||||
struct llama_context_params {
|
||||
struct llama_context_params{
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
|
@ -238,6 +269,10 @@ extern "C" {
|
|||
};
|
||||
|
||||
typedef struct llama_grammar_element {
|
||||
llama_grammar_element( enum llama_gretype type,
|
||||
uint32_t value // Unicode code point or rule ID
|
||||
):type(type), value(value){}
|
||||
llama_grammar_element( ):type(llama_gretype(0)), value(0){}
|
||||
enum llama_gretype type;
|
||||
uint32_t value; // Unicode code point or rule ID
|
||||
} llama_grammar_element;
|
||||
|
@ -827,7 +862,7 @@ extern "C" {
|
|||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
|
@ -844,4 +879,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
|||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
|
||||
|
||||
#endif // LLAMA_H
|
||||
|
||||
|
||||
|
|
553 print.hpp
Normal file
|
@ -0,0 +1,553 @@
|
|||
#include <iostream>
|
||||
#include "llama.h"
|
||||
#include "ggml-internal.hpp"
|
||||
#include "llama-internal.hpp"
|
||||
|
||||
REFL_TYPE(ggml_init_params )
|
||||
REFL_END
|
||||
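Editor's note: these REFL_TYPE / REFL_FIELD blocks register metadata with refl-cpp. The diff itself does not show the consumer, so the sketch below is only a guess at the kind of generic field printer such metadata enables; print_fields is a hypothetical name and assumes each registered field type is streamable with operator<<:

// Hypothetical consumer of the REFL_TYPE/REFL_FIELD metadata declared in this
// file: iterate the registered members of a reflected struct and print them.
#include <iostream>
#include <refl-cpp/refl.hpp>

template <typename T>
void print_fields(const T & value) {
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        if constexpr (refl::descriptor::is_readable(member)) {
            std::cout << refl::descriptor::get_display_name(member)
                      << " = " << member(value) << '\n';
        }
    });
}

// e.g. print_fields(params) for a gpt_params instance would dump the fields
// listed in the REFL_TYPE(gpt_params) block below.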
|
||||
REFL_TYPE(ggml_opt_params::ggml_adam)
|
||||
REFL_END
|
||||
|
||||
REFL_TYPE(ggml_opt_params::ggml_lbfgs)
|
||||
REFL_END
|
||||
|
||||
|
||||
REFL_TYPE(ggml_opt_context::ggml_grad )
|
||||
REFL_END
|
||||
|
||||
REFL_TYPE(gpt_params )
|
||||
|
||||
REFL_FIELD( seed )
|
||||
REFL_FIELD( n_threads)
|
||||
REFL_FIELD( n_threads_batch)
|
||||
REFL_FIELD( n_predict )
|
||||
REFL_FIELD( n_ctx )
|
||||
REFL_FIELD( n_batch)
|
||||
REFL_FIELD( n_keep )
|
||||
REFL_FIELD( n_draft)
|
||||
REFL_FIELD( n_chunks )
|
||||
REFL_FIELD( n_parallel)
|
||||
REFL_FIELD( n_sequences)
|
||||
REFL_FIELD( p_accept )
|
||||
REFL_FIELD( p_split )
|
||||
REFL_FIELD( n_gpu_layers)
|
||||
REFL_FIELD( n_gpu_layers_draft)
|
||||
REFL_FIELD( main_gpu )
|
||||
REFL_FIELD( tensor_split)
|
||||
REFL_FIELD( n_beams )
|
||||
REFL_FIELD(rope_freq_base)
|
||||
REFL_FIELD( rope_freq_scale )
|
||||
REFL_FIELD( yarn_ext_factor )
|
||||
REFL_FIELD( yarn_attn_factor )
|
||||
REFL_FIELD( yarn_beta_fast )
|
||||
REFL_FIELD( yarn_beta_slow )
|
||||
REFL_FIELD( yarn_orig_ctx)
|
||||
REFL_FIELD( rope_scaling_type)
|
||||
REFL_FIELD( sparams)
|
||||
REFL_FIELD(model )
|
||||
REFL_FIELD(model_draft )
|
||||
REFL_FIELD(model_alias)
|
||||
REFL_FIELD(prompt )
|
||||
REFL_FIELD(prompt_file )
|
||||
REFL_FIELD(path_prompt_cache )
|
||||
REFL_FIELD(input_prefix )
|
||||
REFL_FIELD(input_suffix )
|
||||
REFL_FIELD( antiprompt)
|
||||
REFL_FIELD(logdir )
|
||||
REFL_FIELD( lora_adapter)
|
||||
REFL_FIELD(lora_base )
|
||||
REFL_FIELD( ppl_stride )
|
||||
REFL_FIELD( ppl_output_type )
|
||||
REFL_FIELD( hellaswag )
|
||||
REFL_FIELD( hellaswag_tasks )
|
||||
REFL_FIELD( mul_mat_q )
|
||||
REFL_FIELD( memory_f16)
|
||||
REFL_FIELD( random_prompt )
|
||||
REFL_FIELD( use_color )
|
||||
REFL_FIELD( interactive )
|
||||
REFL_FIELD( chatml )
|
||||
REFL_FIELD( prompt_cache_all )
|
||||
REFL_FIELD( prompt_cache_ro )
|
||||
REFL_FIELD( embedding )
|
||||
REFL_FIELD( escape )
|
||||
REFL_FIELD( interactive_first )
|
||||
REFL_FIELD( multiline_input )
|
||||
REFL_FIELD( simple_io )
|
||||
REFL_FIELD( cont_batching )
|
||||
REFL_FIELD( input_prefix_bos )
|
||||
REFL_FIELD( ignore_eos )
|
||||
REFL_FIELD( instruct )
|
||||
REFL_FIELD( logits_all )
|
||||
REFL_FIELD( use_mmap)
|
||||
REFL_FIELD( use_mlock )
|
||||
REFL_FIELD( numa )
|
||||
REFL_FIELD( verbose_prompt )
|
||||
REFL_FIELD( infill )
|
||||
REFL_FIELD(mmproj )
|
||||
REFL_FIELD( image)
|
||||
|
||||
REFL_END
|
||||
|
||||
REFL_TYPE(llama_sampling_params)
REFL_END

REFL_TYPE(llm_arch)
REFL_END

REFL_TYPE(llama_sampling_context )
REFL_FIELD( params)
REFL_FIELD( mirostat_mu)
REFL_FIELD( grammar)
REFL_FIELD( parsed_grammar)
REFL_FIELD( prev)
REFL_FIELD( cur)
REFL_END

REFL_TYPE(llama_token_data )
REFL_END

REFL_TYPE(llama_token_data_array )
REFL_END

REFL_TYPE(llama_batch )
REFL_END

REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END

REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END

REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END

REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END

REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END

REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END

REFL_TYPE(gguf_init_params)
REFL_END

REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END

REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD( no_alloc)
REFL_FIELD( no_alloc_save)
REFL_FIELD( n_objects)
REFL_FIELD( objects_begin)
REFL_FIELD( objects_end)
REFL_FIELD( scratch)
REFL_FIELD( scratch_save)
REFL_END

REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END

REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END

REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END

REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END

REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END

REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END

REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END

REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END

REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END

REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END

REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END

REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END

REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END

REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END

REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END

REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END

REFL_TYPE(llama_grammar_element)
REFL_END

REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END

REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END

REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END

REFL_TYPE(ggml_backend)
REFL_END

REFL_TYPE(ggml_backend_buffer)
REFL_END

REFL_TYPE(ggml_allocr)
REFL_END

REFL_TYPE(ggml_tallocr)
REFL_END

REFL_TYPE(ggml_gallocr)
REFL_END

REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END

REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END

REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END

REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END

REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END

REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END

REFL_TYPE(e_model)
REFL_END

REFL_TYPE(llama_ftype)
REFL_END

REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype )
REFL_FIELD( name )
REFL_FIELD( hparams )
REFL_FIELD( vocab)
REFL_FIELD( tok_embd)
REFL_FIELD( pos_embd)
REFL_FIELD( tok_norm)
REFL_FIELD( tok_norm_b)
REFL_FIELD( output_norm)
REFL_FIELD( output_norm_b)
REFL_FIELD( output)
REFL_FIELD( layers)
REFL_FIELD( n_gpu_layers)
REFL_FIELD( gguf_kv) //unordered map
REFL_FIELD( ctx)
REFL_FIELD( buf)
REFL_FIELD( mapping) //std::unique_ptr
REFL_FIELD( mlock_buf)
REFL_FIELD( mlock_mmap)
REFL_FIELD( tensors_by_name)
REFL_FIELD( t_load_us)
REFL_FIELD( t_start_us)
REFL_END

REFL_TYPE(llama_vocab)
REFL_END

REFL_TYPE(grammar_parser::parse_state)
REFL_END
REFL_TYPE(llama_context)
REFL_FIELD( cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) //random numbers
REFL_FIELD(has_evaluated_once )
REFL_FIELD( t_start_us)
REFL_FIELD( t_load_us)
REFL_FIELD( t_sample_us )
REFL_FIELD( t_p_eval_us )
REFL_FIELD( t_eval_us)
REFL_FIELD( n_sample )
REFL_FIELD( n_p_eval )
REFL_FIELD( n_eval )
REFL_FIELD( logits)
REFL_FIELD( logits_all )
REFL_FIELD( embedding)
REFL_FIELD( work_buffer)
REFL_FIELD( buf_compute)
REFL_FIELD( buf_alloc)
REFL_FIELD( alloc )

#ifdef GGML_USE_METAL
REFL_FIELD( ctx_metal )
#endif

#ifdef GGML_USE_MPI
REFL_FIELD( ctx_mpi )
#endif
REFL_END

REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END

REFL_TYPE(llm_build_context)
// REFL_FIELD(model) cannot create pointer to reference member ‘llm_build_context::model’
// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’
REFL_END

REFL_TYPE(llm_offload_trie)
REFL_END

REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END

REFL_TYPE(llm_bigram_spm)
REFL_END

REFL_TYPE(llm_tokenizer_spm)
REFL_END

REFL_TYPE(llm_bigram_bpe)
REFL_END

REFL_TYPE(llm_tokenizer_bpe)
REFL_END

REFL_TYPE(fragment_buffer_variant)
REFL_END

REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END

REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END

REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END

REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END

REFL_TYPE(llama_logit_info)
REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END

REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END

REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD( n_attention_wv )
REFL_FIELD( n_feed_forward_w2 )
REFL_FIELD( i_attention_wv )
REFL_FIELD( i_feed_forward_w2 )
REFL_FIELD( n_k_quantized )
REFL_FIELD( n_fallback )
REFL_END

REFL_TYPE(llama_data_context)
REFL_END

REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END

REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END
template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
    return t.value_type;
}

// // A generic function to print out the fields of any object
template<typename T>
void print_fields(const T& t) {
    refl::runtime::debug(std::cout, t);
    constexpr auto type = refl::reflect<T>();
    constexpr auto membertype = refl::member_list<T>();
    constexpr auto members = get_members(type);
    std::cout << "DEBUG Type: " << type.name.c_str() << "\n";
    std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n";
    std::cout << "DEBUG Type3: " << typeid(members).name() << "\n";
    refl::util::for_each(members, [&](auto member) {
        //using member_t = decltype(member::value_type);
        //typename type3 = member::value_type;
        //typename trait::remove_qualifiers_t<member_t>::value_type>;
        //constexpr auto type2 = refl::reflect(type3);
        //std::cout << "Auto:" << foo <<"\n";
        std::cout << "Auto:" << member.name <<"\n";
        //std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n";
        //std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n";
    });
    std::cout << "\n";
}
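print.hpp registers each ggml/llama struct with refl-cpp so that print_fields() can walk and dump their members at runtime. A small self-contained sketch of the same pattern (the demo_params struct and the refl-cpp include path are illustrative assumptions, not taken from this commit):

    #include <iostream>
    #include "refl.hpp"   // refl-cpp single header; exact path is an assumption

    // Hypothetical struct standing in for gpt_params / llama_hparams and friends.
    struct demo_params {
        int   n_ctx          = 512;
        float rope_freq_base = 10000.0f;
    };

    REFL_TYPE(demo_params)
    REFL_FIELD(n_ctx)
    REFL_FIELD(rope_freq_base)
    REFL_END

    int main() {
        demo_params p;
        // iterate the reflected members, as print_fields() above does
        refl::util::for_each(refl::reflect<demo_params>().members, [&](auto member) {
            std::cout << member.name << " = " << member(p) << "\n";
        });
        return 0;
    }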
@@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
llama_build_and_test_executable(test-rope.cpp)

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
add_executable(${TEST_TARGET} test-c.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE llama)
@@ -115,11 +115,11 @@ int main(int argc, char * argv[]) {
generate_data(1.0, test_data2.size(), test_data2.data());

// Initialize GGML, ensures float conversion tables are initialized
struct ggml_init_params ggml_params = {
struct ggml_init_params ggml_params(
/* .mem_size = */ 1*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ true,
};
/* .no_alloc = */ true
);
struct ggml_context * ctx = ggml_init(ggml_params);

int num_failed = 0;
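The test now constructs ggml_init_params with parentheses instead of a braced aggregate initializer. That only compiles if this fork's ggml header gained a matching constructor; a hypothetical sketch of what such a constructor would have to look like (the real definition lives in the modified ggml header and is not part of this hunk):

    // Hypothetical constructor mirroring the three fields of ggml_init_params.
    struct ggml_init_params {
        size_t mem_size;    // bytes reserved for the context
        void * mem_buffer;  // caller-provided buffer, or NULL to allocate internally
        bool   no_alloc;    // if true, tensors carry no data buffers

        ggml_init_params(size_t size, void * buffer, bool no_alloc_flag)
            : mem_size(size), mem_buffer(buffer), no_alloc(no_alloc_flag) {}
    };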
@@ -261,11 +261,11 @@ int main(int argc, char * argv[]) {

// Initialize GGML, ensures float conversion tables are initialized
struct ggml_init_params ggml_params = {
struct ggml_init_params ggml_params(
/* .mem_size = */ 1*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ true,
};
/* .no_alloc = */ true
);
struct ggml_context * ctx = ggml_init(ggml_params);

for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -124,11 +124,11 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
}

int main(int /*argc*/, const char ** /*argv*/) {
struct ggml_init_params params = {
struct ggml_init_params params(
/* .mem_size = */ 128*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
/* .no_alloc = */ false
);

std::vector<uint8_t> work_buffer;