still not working

ready to rebase

working
Author: mike dupont, 2023-11-24 19:09:19 -05:00
Parent: 04814e718e
Commit: 3faef69427
18 changed files with 2818 additions and 520 deletions

.gitignore (vendored): 3 lines changed

@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
/#llama.cpp#
#*
\\#*

CMakeLists.txt

@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
# Compile flags
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@ -230,7 +230,12 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
# from https://github.com/NVIDIA/cutlass
make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
# add_compile_definitions(GGML_USE_OPENBLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
@ -312,7 +317,7 @@ if (LLAMA_MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC)
@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "")
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
endif()
#
set(cuda_flags --verbose -G ${cuda_flags})
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
if (WIN32)
@ -485,8 +493,10 @@ if (NOT MSVC)
add_link_options(-static-libgcc -static-libstdc++)
endif()
endif()
add_link_options("-Wl,-Map=${TARGET}.map")
if (LLAMA_GPROF)
add_compile_options(-pg)
add_compile_options(-pg)
endif()
endif()
@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM)
endif()
add_library(ggml OBJECT
ggml.c
ggml.cpp
ggml.h
ggml-alloc.c
print.hpp
ggml-internal.hpp
llama-internal.hpp
ggml-alloc.cpp
ggml-alloc.h
ggml-backend.c
ggml-backend.cpp
ggml-backend.h
ggml-quants.c
ggml-quants.cpp
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@ -683,7 +696,7 @@ add_library(llama
)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_compile_features(llama PUBLIC cxx_std_20) # don't bump
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}

Makefile

@ -116,7 +116,7 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
@ -537,17 +537,17 @@ $(info )
# Build library
#
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
ggml.o: ggml.cpp ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
$(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-c.o: tests/test-c.cpp llama.h
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
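
Both build files bump the former C sources to C++ (CMAKE_CXX_STANDARD 20, -std=c++20 -fpermissive). One relevant difference, sketched below with a made-up struct: ordered designated initializers, used throughout these sources, are standard in C99 and C++20 but not part of C++11/14/17.

struct sketch_traits {
    const char * type_name;
    int          blck_size;
    bool         is_quantized;
};

int main() {
    // standard in C99 and C++20; not part of C++11/14/17
    sketch_traits t = { .type_name = "f32", .blck_size = 1, .is_quantized = false };
    return t.blck_size == 1 ? 0 : 1;
}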

binding.py (new file, 334 lines)

@ -0,0 +1,334 @@
import os
import json
import re
import clang.cindex
# configurable part
CLANG_VERSION='13.0.1'
# homebrew installs for llvm (brew info llvm gives details):
# x64: /usr/local/opt/llvm/lib
# arm64: /opt/homebrew/opt/llvm/lib
llvmLibPath = "/usr/lib/llvm-15/lib/"
cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"
fileList = [
"ggml.cpp",
"llama.cpp"
]
typeList = [
]
# end of configurable part
clang.cindex.Config.set_library_path(llvmLibPath)
def list_headers_in_dir(path):
# enumerates a folder but keeps the full pathing for the files returned
# and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)
# list all the files in the folder
files = os.listdir(path)
# only include .hxx files
files = list(filter(lambda x: x.endswith('.hxx'), files))
# add the folder path back on
files = list(map(lambda x: path + x, files))
return files
# parse through the list of files specified and expand wildcards
fullFileList = []
for filePath in fileList:
if "*" in filePath:
# wildcard path
basePath = filePath[:-1]
if "*" in basePath:
# if there is still a wildcard, we have an issue...
raise NotImplementedError(
"wildcard only supported at end of file path")
files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
fullFileList = fullFileList + files
else:
# normal path
ff = os.path.join(cxxClientRoot, filePath)
fullFileList.append(ff)
print("DBUG",ff)
# exclude _json.hxx files
fullFileList = list(
filter(lambda x: not x.endswith('_json.hxx'), fullFileList))
# exclude _fmt.hxx files
fullFileList = list(
filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))
# generate a list of regexps from the type list (for handling wildcards)
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))
def is_included_type(name, with_durability=False):
# TODO(brett19): This should be generalized somehow...
if "is_compound_operation" in name:
return False
if "replica_context" in name:
return False
if with_durability is True and '_with_legacy_durability' not in name:
return False
for x in typeListRe:
if re.fullmatch(x, name):
return True
return False
opTypes = []
opEnums = []
def parse_type(type):
typeStr = type.get_canonical().spelling
return parse_type_str(typeStr)
std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]
def parse_type_str(typeStr):
if typeStr == "std::mutex":
return {"name": "std::mutex"}
if typeStr == "std::string":
return {"name": "std::string"}
if typeStr == "std::chrono::duration<long long>":
return {"name": "std::chrono::seconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
return {"name": "std::chrono::milliseconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
return {"name": "std::chrono::microseconds"}
if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
return {"name": "std::chrono::nanoseconds"}
if typeStr == "std::error_code":
return {"name": "std::error_code"}
if typeStr == "std::monostate":
return {"name": "std::monostate"}
if typeStr == "std::byte":
return {"name": "std::byte"}
if typeStr == "unsigned long":
return {"name": "std::size_t"}
if typeStr == "char":
return {"name": "std::int8_t"}
if typeStr == "unsigned char":
return {"name": "std::uint8_t"}
if typeStr == "short":
return {"name": "std::int16_t"}
if typeStr == "unsigned short":
return {"name": "std::uint16_t"}
if typeStr == "int":
return {"name": "std::int32_t"}
if typeStr == "unsigned int":
return {"name": "std::uint32_t"}
if typeStr == "long long":
return {"name": "std::int64_t"}
if typeStr == "unsigned long long":
return {"name": "std::uint64_t"}
if typeStr == "bool":
return {"name": "std::bool"}
if typeStr == "float":
return {"name": "std::float"}
if typeStr == "double":
return {"name": "std::double"}
if typeStr == "std::nullptr_t":
return {"name": "std::nullptr_t"}
if typeStr in std_comparators:
return {"name": typeStr}
tplParts = typeStr.split("<", 1)
if len(tplParts) > 1:
tplClassName = tplParts[0]
tplParams = tplParts[1][:-1]
if tplClassName == "std::function":
return {
"name": "std::function"
}
if tplClassName == "std::optional":
return {
"name": "std::optional",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::vector":
return {
"name": "std::vector",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::set":
return {
"name": "std::set",
"of": parse_type_str(tplParams)
}
if tplClassName == "std::variant":
variantParts = tplParams.split(", ")
variantTypes = []
for variantPart in variantParts:
variantTypes.append(parse_type_str(variantPart))
return {
"name": "std::variant",
"of": variantTypes
}
if tplClassName == "std::array":
variantParts = tplParams.split(", ")
if len(variantParts) != 2:
print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
return {"name": "unknown", "str": typeStr}
return {
"name": "std::array",
"of": parse_type_str(variantParts[0]),
"size": int(variantParts[1])
}
if tplClassName == "std::map":
variantParts = tplParams.split(", ")
if len(variantParts) < 2 or len(variantParts) > 3:
print("FAILED TO PARSE MAP TYPES: " + typeStr)
return {"name": "unknown", "str": typeStr}
if len(variantParts) == 2:
return {
"name": "std::map",
"of": parse_type_str(variantParts[0]),
"to": parse_type_str(variantParts[1])
}
else:
return {
"name": "std::map",
"of": parse_type_str(variantParts[0]),
"to": parse_type_str(variantParts[1]),
"comparator": parse_type_str(variantParts[2])
}
if tplClassName == "std::shared_ptr":
return {
"name": "std::shared_ptr",
"of": parse_type_str(tplParams)
}
#return {"name": "unknown", "str": typeStr}
if 'unnamed struct' in typeStr:
print("WARNING: Found unnamed struct: " + typeStr)
return {"name": typeStr}
internal_structs = []
UNNAMED_STRUCT_DELIM = '::(unnamed struct'
def traverse(node, namespace, main_file):
# only scan the elements of the file we parsed
#print("FILE", node.location.file )
if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
print("REFL_TYPE(" + fullStructName + ")")
structFields = []
for child in node.get_children():
if child.kind == clang.cindex.CursorKind.FIELD_DECL:
struct_type = parse_type(child.type)
type_str = child.type.get_canonical().spelling
print(" REFL_FIELD(" + child.displayname + ")")
if 'unnamed' in type_str:
name_tokens = type_str.split('::')
name_override = '::'.join(name_tokens[:-1] + [child.displayname])
struct_type['name'] = name_override
internal_structs.append(name_override)
structFields.append({
"name": child.displayname,
"type": struct_type,
})
# replica read changes introduced duplicate get requests
if any(map(lambda op: op['name'] == fullStructName, opTypes)):
return
opTypes.append({
"name": fullStructName,
"fields": structFields,
})
print("REFL_END")
if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
if is_included_type(fullStructName, with_durability=True):
type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
if type_ref:
base_request_name = type_ref.displayname.replace('struct', '').strip()
base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
if base_request:
new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
new_fields.extend([
{"name":"persist_to", "type":{"name":"couchbase::persist_to"}},
{"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}}
])
opTypes.append({
"name": fullStructName,
"fields": new_fields
})
if node.kind == clang.cindex.CursorKind.ENUM_DECL:
fullEnumName = "::".join([*namespace, node.displayname])
if is_included_type(fullEnumName):
enumValues = []
for child in node.get_children():
if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
enumValues.append({
"name": child.displayname,
"value": child.enum_value,
})
opEnums.append({
"name": fullEnumName,
"type": parse_type(node.enum_type),
"values": enumValues,
})
if node.kind == clang.cindex.CursorKind.NAMESPACE:
namespace = [*namespace, node.displayname]
if node.kind == clang.cindex.CursorKind.CLASS_DECL:
namespace = [*namespace, node.displayname]
if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
namespace = [*namespace, node.displayname]
for child in node.get_children():
traverse(child, namespace, main_file)
for headerPath in fullFileList:
print("processing " + headerPath)
index = clang.cindex.Index.create()
args = [
'-std=c++17',
]
try:
translation_unit = index.parse(headerPath, args=args)
except Exception as e:
print(e)
import pdb
pdb.set_trace()
raise e
# output clang compiler diagnostics information (for debugging)
for diagnostic in translation_unit.diagnostics:
diagnosticMsg = diagnostic.format()
print(diagnostic)
traverse(translation_unit.cursor, [], headerPath)
jsonData = json.dumps({
'op_structs': opTypes,
'op_enums': opEnums
})
f = open("bindings.json", "w")
f.write(jsonData)
f.close()
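
binding.py parses ggml.cpp and llama.cpp with libclang, prints a REFL_TYPE / REFL_FIELD / REFL_END listing for every struct it visits, and writes the same information to bindings.json. The real macros are defined elsewhere; purely to illustrate the shape of that listing, a toy definition with made-up names could turn it into a dump function:

#include <cstdio>

// Toy stand-ins, not the real reflection macros.
#define REFL_TYPE(name)  static void refl_dump_##name() { std::printf("type  %s\n", #name);
#define REFL_FIELD(f)        std::printf("field %s\n", #f);
#define REFL_END         }

struct demo_params { int n_threads; float temperature; };   // hypothetical struct

REFL_TYPE(demo_params)
REFL_FIELD(n_threads)
REFL_FIELD(temperature)
REFL_END

int main() { refl_dump_demo_params(); return 0; }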

ggml-alloc.cpp

@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
free(galloc->parse_seq);
galloc->parse_seq = malloc(sizeof(int) * n);
galloc->parse_seq = (int*)malloc(sizeof(int) * n);
for (int i = 0; i < n; i++) {
galloc->parse_seq[i] = list[i];
@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
if (galloc->hash_values != NULL) {
free(galloc->hash_values);
}
galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size);
galloc->hash_set.size = hash_size;
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
}
// reset hash table
@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
// alloc hash_values if needed
if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
galloc->hash_values_size = hash_size;
}
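
The casts added above follow from compiling this file as C++: C lets the void * returned by malloc convert implicitly to any object pointer type, C++ does not. A minimal sketch with a simplified hash_node:

#include <cstdlib>

struct hash_node { int n_children; int n_views; };   // simplified stand-in

int main() {
    // hash_node * bad = std::malloc(4 * sizeof(hash_node));            // valid C, rejected by C++
    hash_node * ok = (hash_node *)std::malloc(4 * sizeof(hash_node));   // explicit cast, as in the diff
    int rc = (ok != nullptr) ? 0 : 1;
    std::free(ok);
    return rc;
}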

ggml-backend.cpp

@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer));
GGML_ASSERT(iface.get_base != NULL);
@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
// TODO: allow backends to support copy to/from same backend
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst);
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst);
} else {
// shouldn't be hit when copying from/to CPU
#ifndef NDEBUG
@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu {
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cgraph = *cgraph;
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size);
}
return cpu_plan;
@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = cpu_ctx->work_data;
cplan.work_data = (uint8_t*)cpu_ctx->work_data;
ggml_graph_compute(cgraph, &cplan);
}
@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = {
};
ggml_backend_t ggml_backend_cpu_init(void) {
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context));
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_i,
@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched));
memset(sched, 0, sizeof(struct ggml_backend_sched));
fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
// initialize hash tables
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
sched->hash_set.size = hash_size;
sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size);
sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size);
sched_split_graph(sched, measure_graph);
sched_alloc_splits(sched);
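
Most of these casts are the same implicit-void*-conversion fix, but the node_copies one deserves a note: each table entry is an array of four tensor pointers, so malloc's result is cast to a pointer-to-array type. A minimal sketch with an opaque stand-in for ggml_tensor:

#include <cstdlib>

struct tensor;   // opaque stand-in

int main() {
    const int hash_size = 8;
    // "pointer to array of 4 tensor pointers", mirroring (ggml_tensor *(*)[4]) above
    tensor *(*node_copies)[4] =
        (tensor *(*)[4])std::malloc(sizeof(node_copies[0]) * hash_size);
    node_copies[3][1] = nullptr;   // copy slot 1 of node 3
    std::free(node_copies);
    return 0;
}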

ggml-impl.h

@ -22,7 +22,7 @@ extern "C" {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
//#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
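
The fallback #define is disabled here, presumably because the headers are now consumed from C++, where static_assert is a keyword; left in place, the noop-trick macro would silently discard every assertion. A minimal sketch:

#include <cstdint>

// No macro needed: static_assert is built into C++ (and into C11 via <assert.h>).
static_assert(sizeof(std::int16_t) == 2, "int16_t must be 2 bytes");

int main() { return 0; }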

ggml-internal.hpp (new file, 258 lines)

@ -0,0 +1,258 @@
struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
ggml_context():
mem_size(0),
mem_buffer(0),
mem_buffer_owned(0),
no_alloc(0),
no_alloc_save(0),
n_objects(0),
objects_begin(0),
objects_end(0),
scratch(),
scratch_save()
{
}
};
struct ggml_context_container {
bool used;
struct ggml_context context;
ggml_context_container(): used(0),context(){
}
};
typedef double ggml_float;
typedef void * thread_ret_t;
#define MAX_FREE_BLOCKS 256
struct free_block {
void * addr;
size_t size;
};
struct ggml_tallocr {
struct ggml_backend_buffer * buffer;
bool buffer_owned;
void * base;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
bool measure;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
#endif
};
struct hash_node {
int n_children;
int n_views;
};
typedef struct ggml_tallocr * ggml_tallocr_t;
typedef struct ggml_gallocr * ggml_gallocr_t;
struct ggml_gallocr {
ggml_tallocr_t talloc;
struct ggml_hash_set hash_set;
struct hash_node * hash_values;
size_t hash_values_size;
ggml_tallocr_t * hash_allocs;
int * parse_seq;
int parse_seq_len;
};
struct ggml_allocr {
ggml_tallocr_t talloc;
ggml_gallocr_t galloc;
};
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
ggml_state():contexts(), numa()
{
}
};
struct gguf_str {
uint64_t n; // GGUFv2
char * data;
};
struct ggml_map_custom1_op_params {
ggml_custom1_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun;
int n_tasks;
void * userdata;
};
struct hash_map {
struct ggml_hash_set set;
struct ggml_tensor ** vals;
};
#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include<atomic>
using namespace std;
#endif
struct ggml_compute_state_shared {
const struct ggml_cgraph * cgraph;
const struct ggml_cplan * cplan;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
const int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
};
typedef pthread_t ggml_thread_t;
struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
struct ggml_compute_state_shared * shared;
};
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct gguf_array_T {
enum gguf_type type;
uint64_t n; // GGUFv2
void * data;
} arr;
};
struct ggml_lbfgs_iteration_data {
float alpha;
float ys;
float * s;
float * y;
};
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
};
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void * data;
size_t size;
};
struct gguf_context {
struct gguf_header header;
struct gguf_kv * kv;
struct gguf_tensor_info * infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
//uint8_t * padding;
void * data;
};
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
#include "ggml-backend-impl.h"
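
ggml-internal.hpp collects structs that upstream keeps file-local inside ggml.c and ggml-alloc.c, and gives the stateful ones default constructors so C++ code gets zero-initialized members without C-style "= {0}" aggregate initialization. A minimal sketch of the pattern, with a simplified stand-in:

#include <cstddef>

struct sketch_context {                 // simplified stand-in, not the real ggml_context
    std::size_t mem_size;
    void *      mem_buffer;
    bool        no_alloc;
    sketch_context() : mem_size(0), mem_buffer(nullptr), no_alloc(false) {}
};

int main() {
    sketch_context ctx;                 // members start zeroed via the constructor
    return ctx.mem_size == 0 ? 0 : 1;
}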

File diff suppressed because it is too large.

ggml-quants.h

@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
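
restrict is a C99 keyword with no counterpart in standard C++, so these prototypes switch to the GCC/Clang spelling __restrict__ (MSVC uses __restrict). What the qualifier buys, in a minimal sketch: the compiler may assume the pointers do not alias and optimize accordingly.

static float sketch_dot(int n, const float * __restrict__ x, const float * __restrict__ y) {
    float s = 0.0f;
    for (int i = 0; i < n; ++i) {
        s += x[i] * y[i];   // no-alias assumption helps vectorization
    }
    return s;
}

int main() {
    float a[4] = {1, 2, 3, 4};
    float b[4] = {4, 3, 2, 1};
    return sketch_dot(4, a, b) == 20.0f ? 0 : 1;
}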

ggml.cpp

@ -38,6 +38,14 @@
#pragma warning(disable: 4996)
#endif
// initializers for static data called in the ggml_init function
static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
void type_traits_init();
void GGUF_TYPE_SIZE_init();
void GGUF_TYPE_NAME_init();
#if defined(_WIN32)
#include <windows.h>
@ -86,7 +94,9 @@ static int sched_yield (void) {
}
#else
#include <pthread.h>
#include <stdatomic.h>
//#include <stdatomic.h>
#include <atomic>
using namespace std;
typedef void * thread_ret_t;
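
stdatomic.h is the C11 atomics header and is not usable from C++ before C++23, so the port swaps in <atomic> and adds a using-directive so the unqualified atomic_int / atomic_bool names keep compiling. A minimal sketch:

#include <atomic>
using namespace std;   // mirrors the change above

int main() {
    atomic_int n_active(2);            // std::atomic<int>
    atomic_fetch_sub(&n_active, 1);
    return atomic_load(&n_active) == 1 ? 0 : 1;
}
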
@ -96,6 +106,8 @@ typedef void * thread_ret_t;
#endif
#include <atomic>
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
void type_traits_init(){
type_traits[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
.type_size = sizeof(int8_t),
.is_quantized = false,
},
[GGML_TYPE_I16] = {
};
type_traits[GGML_TYPE_I16] = {
.type_name = "i16",
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
},
[GGML_TYPE_I32] = {
};
type_traits[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_F32] = {
};
type_traits[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
},
[GGML_TYPE_F16] = {
};
type_traits[GGML_TYPE_F16] = {
.type_name = "f16",
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16,
},
[GGML_TYPE_Q4_0] = {
};
type_traits[GGML_TYPE_Q4_0] = {
.type_name = "q4_0",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q4_1] = {
};
type_traits[GGML_TYPE_Q4_1] = {
.type_name = "q4_1",
.blck_size = QK4_1,
.type_size = sizeof(block_q4_1),
@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
.vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[4] = { // GGML_TYPE_Q4_2
};
type_traits[4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
@ -482,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
},
[5] = { // GGML_TYPE_Q4_3
};
type_traits[5] = { // GGML_TYPE_Q4_3
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
},
[GGML_TYPE_Q5_0] = {
};
type_traits[GGML_TYPE_Q5_0] = {
.type_name = "q5_0",
.blck_size = QK5_0,
.type_size = sizeof(block_q5_0),
@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
.vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_1] = {
};
type_traits[GGML_TYPE_Q5_1] = {
.type_name = "q5_1",
.blck_size = QK5_1,
.type_size = sizeof(block_q5_1),
@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
.vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q8_0] = {
};
type_traits[GGML_TYPE_Q8_0] = {
.type_name = "q8_0",
.blck_size = QK8_0,
.type_size = sizeof(block_q8_0),
@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q8_1] = {
};
type_traits[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.blck_size = QK8_1,
.type_size = sizeof(block_q8_1),
@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q2_K] = {
};
type_traits[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
.type_size = sizeof(block_q2_K),
@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q3_K] = {
};
type_traits[GGML_TYPE_Q3_K] = {
.type_name = "q3_K",
.blck_size = QK_K,
.type_size = sizeof(block_q3_K),
@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
.vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q4_K] = {
};
type_traits[GGML_TYPE_Q4_K] = {
.type_name = "q4_K",
.blck_size = QK_K,
.type_size = sizeof(block_q4_K),
@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
.vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q5_K] = {
};
type_traits[GGML_TYPE_Q5_K] = {
.type_name = "q5_K",
.blck_size = QK_K,
.type_size = sizeof(block_q5_K),
@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
.vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q6_K] = {
};
type_traits[GGML_TYPE_Q6_K] = {
.type_name = "q6_K",
.blck_size = QK_K,
.type_size = sizeof(block_q6_K),
@ -590,15 +604,15 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
.vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q8_K] = {
};
type_traits[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K,
}
};
};
}
// For internal test use
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
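
The rewrite above exists because C99 array-index designators such as [GGML_TYPE_I8] = {...} are not valid in any C++ standard, so the table becomes a mutable array filled in by type_traits_init(), which the ggml_init hunk further down calls before anything reads it. A minimal sketch of the pattern (compile as C++20 for the member designators):

enum sketch_type { SKETCH_TYPE_I8 = 0, SKETCH_TYPE_F32 = 1, SKETCH_TYPE_COUNT = 2 };
struct sketch_traits { const char * type_name; int blck_size; };

// Valid C, rejected by C++:
//   static const sketch_traits table[SKETCH_TYPE_COUNT] = {
//       [SKETCH_TYPE_I8]  = { .type_name = "i8",  .blck_size = 1 },
//       [SKETCH_TYPE_F32] = { .type_name = "f32", .blck_size = 1 },
//   };

// C++ workaround used above: fill the table at runtime.
static sketch_traits table[SKETCH_TYPE_COUNT];

static void table_init() {
    table[SKETCH_TYPE_I8]  = { .type_name = "i8",  .blck_size = 1 };
    table[SKETCH_TYPE_F32] = { .type_name = "f32", .blck_size = 1 };
}

int main() {
    table_init();                      // mirrors the call added to ggml_init
    return table[SKETCH_TYPE_F32].blck_size == 1 ? 0 : 1;
}
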
@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
#ifdef GGML_SIMD
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
*s = sumf;
}
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) {
ggml_float sumf = 0.0;
#if defined(GGML_SIMD)
@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest
// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
}
}
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
}
// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
const float * restrict x[GGML_VEC_MAD_UNROLL];
const float * restrict v[GGML_VEC_MAD_UNROLL];
const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
x[i] = (const float *) ((const char *) xv + i*xs);
@ -2175,18 +2189,26 @@ static inline int ggml_up(int n, int m) {
////////////////////////////////////////////////////////////////////////////////
struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
static bool is_first_call = true;
// initialize the data in the arrays
type_traits_init();
GGUF_TYPE_SIZE_init();
GGUF_TYPE_NAME_init();
if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();
struct ggml_context * ctx = NULL;
static bool is_first_call = true;
// make this function thread safe
ggml_critical_section_start();
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
ggml_fp16_t ii;
@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
// find non-used context in g_state
struct ggml_context * ctx = NULL;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
// align to GGML_MEM_ALIGN
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
ctx->scratch.offs += data_size;
} else {
@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
char * const data = (char*)tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
char * const data = (char*)tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
char * const mem_buffer = (char*)ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ACC;
@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_SET;
@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d(
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_2D;
@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32(
GGML_ASSERT(nb00 == sizeof(float));
if (ggml_is_contiguous(dst)) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
} else {
for (int k3 = 0; k3 < ne3; k3++) {
for (int k2 = 0; k2 < ne2; k2++) {
@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat(
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
char * wdata = (char*)params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#endif
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
return;
}
@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
return;
}
@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d(
struct ggml_tensor * dst) {
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
enum ggml_op_pool op = (ggml_op_pool)opts[0];
const int k0 = opts[1];
const int s0 = opts[2];
const int p0 = opts[3];
@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d(
}
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
enum ggml_op_pool op = (ggml_op_pool)opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
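
Another C-versus-C++ difference behind these edits: C converts int to an enum implicitly, C++ requires an explicit cast, hence the (ggml_op_pool) casts when unpacking op_params. A minimal sketch:

enum sketch_op_pool { SKETCH_OP_POOL_MAX = 0, SKETCH_OP_POOL_AVG = 1 };

int main() {
    const int opts[4] = {1, 2, 2, 0};                 // packed parameters, as above
    sketch_op_pool op = (sketch_op_pool)opts[0];      // cast required in C++
    // sketch_op_pool bad = opts[0];                  // valid C, error in C++
    const int k0 = opts[1];
    return (op == SKETCH_OP_POOL_AVG && k0 == 2) ? 0 : 1;
}
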
@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
size = ggml_hash_size(size);
struct ggml_hash_set result;
result.size = size;
result.keys = malloc(sizeof(struct ggml_tensor *) * size);
result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size);
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
return result;
}
@ -14113,9 +14136,9 @@ struct hash_map {
};
static struct hash_map * ggml_new_hash_map(size_t size) {
struct hash_map * result = malloc(sizeof(struct hash_map));
struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map));
result->set = ggml_hash_set_new(size);
result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size);
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
return result;
}
@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads);
// create thread pool
if (n_threads > 1) {
@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
continue;
}
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");
@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam(
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
float * g = opt->adam.g->data; // gradients
float * m = opt->adam.m->data; // first moment
float * v = opt->adam.v->data; // second moment
float * g = (float*)opt->adam.g->data; // gradients
float * m = (float*)opt->adam.m->data; // first moment
float * v = (float*)opt->adam.v->data; // second moment
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
float * pf = params.past > 0 ? (float *)opt->adam.pf->data : NULL; // past function values
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking(
} else {
// Armijo condition is satisfied
if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
return count;
return (ggml_opt_result)count;
}
ggml_vec_dot_f32(nx, &dg, g, d);
@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking(
} else {
if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
// regular Wolfe conditions
return count;
return (ggml_opt_result)count;
}
if(dg > -params->lbfgs.wolfe*dginit) {
width = dec;
} else {
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
return count;
return (ggml_opt_result)count;
}
}
}
@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs(
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters
float * g = opt->lbfgs.g->data; // current gradient
float * gp = opt->lbfgs.gp->data; // previous gradient
float * d = opt->lbfgs.d->data; // search direction
float * x = (float*)opt->lbfgs.x->data; // current parameters
float * xp = (float*)opt->lbfgs.xp->data; // previous parameters
float * g = (float*)opt->lbfgs.g->data; // current gradient
float * gp = (float*)opt->lbfgs.gp->data; // previous gradient
float * d = (float*)opt->lbfgs.d->data; // search direction
float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
float * pf = params.past > 0 ? (float*)opt->lbfgs.pf->data : NULL; // past function values
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_opt_get_params(np, ps, x);
// the L-BFGS memory
float * lm_alpha = opt->lbfgs.lmal->data;
float * lm_ys = opt->lbfgs.lmys->data;
float * lm_s = opt->lbfgs.lms->data;
float * lm_y = opt->lbfgs.lmy->data;
float * lm_alpha = (float*)opt->lbfgs.lmal->data;
float * lm_ys = (float*)opt->lbfgs.lmys->data;
float * lm_s = (float*)opt->lbfgs.lms->data;
float * lm_y = (float*)opt->lbfgs.lmy->data;
bool cancel = false;
@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_cpy_f32(nx, g, gp);
return ls;
return (ggml_opt_result)ls;
}
opt->loss_after = fx;
@ -17564,7 +17587,7 @@ GGML_API void ggml_opt_init(
opt->nx = nx;
opt->just_initialized = true;
if (opt->ctx == NULL) {
struct ggml_init_params ctx_opt_params;
struct ggml_init_params ctx_opt_params;
if (opt->params.type == GGML_OPT_ADAM) {
ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
if (opt->params.past > 0) {
@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK4_0;
for (int b = 0; b < n; b += k) {
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0;
quantize_row_q4_0_reference(src + b, y, k);
@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK4_1;
for (int b = 0; b < n; b += k) {
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1;
quantize_row_q4_1_reference(src + b, y, k);
@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK5_0;
for (int b = 0; b < n; b += k) {
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0;
quantize_row_q5_0_reference(src + b, y, k);
@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK5_1;
for (int b = 0; b < n; b += k) {
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1;
quantize_row_q5_1_reference(src + b, y, k);
@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
const int nb = k / QK8_0;
for (int b = 0; b < n; b += k) {
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0;
quantize_row_q8_0_reference(src + b, y, k);
@ -17928,37 +17951,39 @@ struct gguf_str {
char * data;
};
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
[GGUF_TYPE_INT8] = sizeof(int8_t),
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
[GGUF_TYPE_INT16] = sizeof(int16_t),
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
[GGUF_TYPE_INT32] = sizeof(int32_t),
[GGUF_TYPE_FLOAT32] = sizeof(float),
[GGUF_TYPE_BOOL] = sizeof(bool),
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
[GGUF_TYPE_INT64] = sizeof(int64_t),
[GGUF_TYPE_FLOAT64] = sizeof(double),
[GGUF_TYPE_ARRAY] = 0, // undefined
void GGUF_TYPE_SIZE_init() {
GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT8] = sizeof(int8_t);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT16] = sizeof(int16_t);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT32] = sizeof(uint32_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT32] = sizeof(int32_t);
GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float);
GGUF_TYPE_SIZE[GGUF_TYPE_BOOL] = sizeof(bool);
GGUF_TYPE_SIZE[GGUF_TYPE_STRING] = sizeof(struct gguf_str);
GGUF_TYPE_SIZE[GGUF_TYPE_UINT64] = sizeof(uint64_t);
GGUF_TYPE_SIZE[GGUF_TYPE_INT64] = sizeof(int64_t);
GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double);
GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY] = 0; // undefined
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
[GGUF_TYPE_UINT64] = "u64",
[GGUF_TYPE_INT64] = "i64",
[GGUF_TYPE_FLOAT64] = "f64",
void GGUF_TYPE_NAME_init(){
GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8";
GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8";
GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16";
GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16";
GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32";
GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32";
GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32";
GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool";
GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str";
GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr";
GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64";
GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64";
GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64";
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
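The two hunks above exist because C99 designated array initializers ([GGUF_TYPE_UINT8] = ...) are not valid C++, so the commit switches the tables to runtime init functions. A hedged alternative sketch, not the commit's approach: keep the size table const by building it in a constexpr factory. It assumes the GGUF_TYPE_* enumerators and gguf_str from the surrounding code; GGUF_TYPE_SIZE_CXX and make_gguf_type_size are hypothetical names.

// Alternative sketch only: a constexpr factory avoids both designated initializers
// and a runtime *_init() call, keeping the table immutable.
#include <array>
#include <cstddef>
#include <cstdint>

static constexpr std::array<size_t, GGUF_TYPE_COUNT> make_gguf_type_size() {
    std::array<size_t, GGUF_TYPE_COUNT> s{};        // zero-filled; GGUF_TYPE_ARRAY stays 0 (undefined)
    s[GGUF_TYPE_UINT8]   = sizeof(uint8_t);
    s[GGUF_TYPE_INT8]    = sizeof(int8_t);
    s[GGUF_TYPE_UINT16]  = sizeof(uint16_t);
    s[GGUF_TYPE_INT16]   = sizeof(int16_t);
    s[GGUF_TYPE_UINT32]  = sizeof(uint32_t);
    s[GGUF_TYPE_INT32]   = sizeof(int32_t);
    s[GGUF_TYPE_FLOAT32] = sizeof(float);
    s[GGUF_TYPE_BOOL]    = sizeof(bool);
    s[GGUF_TYPE_STRING]  = sizeof(gguf_str);
    s[GGUF_TYPE_UINT64]  = sizeof(uint64_t);
    s[GGUF_TYPE_INT64]   = sizeof(int64_t);
    s[GGUF_TYPE_FLOAT64] = sizeof(double);
    return s;
}
static constexpr auto GGUF_TYPE_SIZE_CXX = make_gguf_type_size();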
@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
bool ok = true;
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1);
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
return ok;
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
bool ok = true;
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
// read the header
{
@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the kv pairs
{
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the tensor infos
{
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// create the tensors
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0],
ctx->infos[i].ne[1],
ctx->infos[i].ne[2],
ctx->infos[i].ne[3],
(int64_t)ctx->infos[i].ne[0],
(int64_t)ctx->infos[i].ne[1],
(int64_t)ctx->infos[i].ne[2],
(int64_t)ctx->infos[i].ne[3],
};
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key);
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
@ -18760,7 +18785,7 @@ void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name);
ctx->infos[idx].name.data = strdup(tensor->name);
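Most of the remaining ggml.cpp hunks in this section only add explicit casts: C implicitly converts the void * returned by malloc/calloc/realloc (and stored in tensor data fields), C++ does not. A minimal sketch of the pattern with hypothetical names (my_kv and grow_kv are illustrative, not symbols from the commit):

// Illustrative only: why the casts above are needed when the same code is built as C++.
#include <cstdlib>

struct my_kv { int dummy; };

static my_kv * grow_kv(my_kv * kv, std::size_t n_new) {
    // C:   kv = realloc(kv, n_new * sizeof(struct my_kv));   // implicit void* -> my_kv*
    // C++: the conversion must be written out explicitly
    return (my_kv *) realloc(kv, n_new * sizeof(my_kv));
}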

6
ggml.h

@ -285,8 +285,10 @@
GGML_UNUSED(prefix##3);
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
@ -2136,7 +2138,7 @@ extern "C" {
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#define GGML_RESTRICT __restrict__
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@ -2157,5 +2159,7 @@ extern "C" {
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif
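The ggml.h hunk above does two things: the CPP_ONLY guard compiles out the extern "C" wrapper when every translation unit is built as C++ (the -DCPP_ONLY Makefile build), and the GGML_RESTRICT definition gains the GCC/Clang __restrict__ spelling. A minimal sketch of the resulting guard pattern, with foo() as a placeholder declaration:

// Sketch only: how a declaration sits inside the new guards. With -DCPP_ONLY the
// extern "C" block disappears and foo() keeps C++ linkage; without it, C callers
// still see an unmangled symbol. GGML_RESTRICT expands per the definitions above.
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif

void foo(const float * GGML_RESTRICT src, float * GGML_RESTRICT dst, int n);

#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif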

896
llama-internal.hpp Normal file

@ -0,0 +1,896 @@
#include <set>
#include <queue>
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GPT2,
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_UNKNOWN,
};
enum llm_kv {
LLM_KV_GENERAL_ARCHITECTURE,
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_URL,
LLM_KV_GENERAL_DESCRIPTION,
LLM_KV_GENERAL_LICENSE,
LLM_KV_GENERAL_SOURCE_URL,
LLM_KV_GENERAL_SOURCE_HF_REPO,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_TOKENIZER_MODEL,
LLM_KV_TOKENIZER_LIST,
LLM_KV_TOKENIZER_TOKEN_TYPE,
LLM_KV_TOKENIZER_SCORES,
LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID,
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
LLM_KV_TOKENIZER_ADD_BOS,
LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
};
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
};
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
GGUF_FILE_VERSION_V3 = 3,
};
struct LLM_KV {
LLM_KV(llm_arch arch) : arch(arch) {}
llm_arch arch;
std::string operator()(llm_kv kv) const; // moved to llama.cpp file
};
enum llm_tensor {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_POS_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
uint32_t n_yarn_orig_ctx;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
bool mul_mat_q;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_norm_b;
struct ggml_tensor * attn_norm_2;
struct ggml_tensor * attn_norm_2_b;
struct ggml_tensor * attn_q_norm;
struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
};
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
};
struct llama_buffer {
void * data = NULL;
size_t size = 0;
// fallback to malloc / free
// useful in cases where CUDA can try to allocate PINNED memory
bool fallback = false;
void resize(size_t n) ;
~llama_buffer();
};
// ring-buffer of cached KV data
struct llama_kv_cache {
bool has_shift = false;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
// computed before each graph build
uint32_t n = 0;
std::vector<llama_kv_cell> cells;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
llama_buffer buf;
~llama_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
}
#endif
}
};
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
GGML_ASSERT(token_left.find(" ") == std::string::npos);
GGML_ASSERT(token_left.find("\n") == std::string::npos);
GGML_ASSERT(token_right.find(" ") == std::string::npos);
GGML_ASSERT(token_right.find("\n") == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
return -1;
}
return it->second;
}
};
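find_bpe_rank above returns the merge priority of an adjacent token pair (-1 if the pair is not in bpe_ranks). A hedged sketch of how a BPE merge loop would typically consult it; pick_best_pair is an illustrative helper, not code from the commit, and pieces are assumed to contain no spaces or newlines (the asserts above require that):

// Illustrative: choose the adjacent pair with the lowest merge rank, the core
// selection step of a BPE merge loop. Lower rank merges earlier; -1 means "no merge".
#include <climits>
#include <string>
#include <vector>

static int pick_best_pair(const llama_vocab & vocab, const std::vector<std::string> & pieces) {
    int best = -1, best_rank = INT_MAX;
    for (size_t i = 0; i + 1 < pieces.size(); ++i) {
        const int rank = vocab.find_bpe_rank(pieces[i], pieces[i + 1]);
        if (rank >= 0 && rank < best_rank) {
            best_rank = rank;
            best = (int) i;     // index of the left piece of the best pair
        }
    }
    return best;                // -1 if nothing can be merged
}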
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
~llama_mmap();
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
#else
static constexpr bool SUPPORTED = false;
#endif
};
struct llama_hparams {
bool vocab_only;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;
float f_norm_eps;
float f_norm_rms_eps;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
int8_t rope_scaling_type_train : 3;
bool rope_finetuned : 1;
float f_clamp_kqv;
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const;
uint32_t n_gqa() const {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
}
};
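The accessors at the end of llama_hparams encode the grouped-query-attention bookkeeping. A small worked example with illustrative numbers (chosen to resemble a 70B-class configuration, not read from any model file):

// Illustrative arithmetic only:
//   n_gqa()       = n_head / n_head_kv = 64 / 8    = 8    query heads per KV head
//   n_embd_head() = n_embd / n_head    = 8192 / 64 = 128  dims per head
//   n_embd_gqa()  = n_embd / n_gqa()   = 8192 / 8  = 1024 per-token K/V width
llama_hparams hp = {};
hp.n_embd    = 8192;
hp.n_head    = 64;
hp.n_head_kv = 8;
GGML_ASSERT(hp.n_gqa() == 8 && hp.n_embd_head() == 128 && hp.n_embd_gqa() == 1024);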
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() ;
llama_mlock(const llama_mlock &) = delete;
~llama_mlock();
void init(void * ptr);
void grow_to(size_t target_size);
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const ;
#undef MLOCK_SUGGESTION
static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
bool raw_lock(void * ptr, size_t len) const ;
static void raw_unlock(void * ptr, size_t len);
#else
static constexpr bool SUPPORTED = false;
static size_t lock_granularity();
bool raw_lock(const void * addr, size_t len) const;
static void raw_unlock(const void * addr, size_t len);
#endif
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embd;
struct ggml_tensor * pos_embd;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
std::vector<llama_layer> layers;
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// context
struct ggml_context * ctx = NULL;
// the model memory buffer
llama_buffer buf;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
// objects representing data potentially being locked in memory
llama_mlock mlock_buf;
llama_mlock mlock_mmap;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
~llama_model() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
ggml_cuda_free_scratch();
}
#endif
#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
#endif
}
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
~llama_context();
llama_cparams cparams;
const llama_model & model;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
std::mt19937 rng;
bool has_evaluated_once = false;
int64_t t_start_us;
int64_t t_load_us;
int64_t t_sample_us = 0;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int32_t n_sample = 0; // number of tokens sampled
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// reusable buffer for `struct ggml_graph_plan.work_data`
std::vector<uint8_t> work_buffer;
// memory buffers used to evaluate the model
llama_buffer buf_compute;
llama_buffer buf_alloc;
ggml_allocr * alloc = NULL;
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
#endif
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
};
struct LLM_TN {
LLM_TN(llm_arch arch) ;
llm_arch arch;
std::string operator()(llm_tensor tensor) const;
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
std::string operator()(llm_tensor tensor, int bid) const ;
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
};
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) ;
size_t tell() const;
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const ;
void write_u32(std::uint32_t val) const;
~llama_file();
};
struct llama_state {
llama_state();
// We save the log callback globally
ggml_log_callback log_callback;
void * log_callback_user_data = nullptr;
};
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
int n_created = 0;
int64_t n_elements = 0;
size_t n_bytes = 0;
bool use_mmap = false;
llama_file file;
llama_ftype ftype;
llama_fver fver;
std::unique_ptr<llama_mmap> mapping;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;
llama_model_loader(const std::string & fname, bool use_mmap) ;
~llama_model_loader();
std::string get_arch_name() const;
enum llm_arch get_arch() const ;
const char * get_tensor_name(int i) const;
struct ggml_tensor * get_tensor_meta(int i) const;
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ;
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) ;
void done_getting_tensors() const;
size_t file_offset(const char * name) const;
void load_data_for(struct ggml_tensor * cur) const ;
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ;
};
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t * ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) ;
void write(const void * src, size_t size) override ;
size_t get_size_written() override ;
};
struct llama_data_file_context : llama_data_context {
llama_file * file;
size_t size_written = 0;
llama_data_file_context(llama_file * f);
size_t get_size_written() override ;
void write(const void * src, size_t size);
};
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const ;
void shift_tokens(const size_t n) ;
llama_beam_view view() const;
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx);
llama_token_data get_token_data(const llama_token token_id) const ;
std::vector<llama_token_data> top_k(size_t k) ;
float probability_from_logit(float logit) const ;
};
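llama_logit_info caches the maximum logit and a normalizer, which is the usual numerically stable softmax setup. A hedged sketch of the math its members suggest (the real definitions live in llama.cpp; probability_from_logit_sketch is an illustrative free function):

// Sketch of the stable-softmax relation implied by max_l / normalizer:
//   normalizer = 1 / sum_i exp(logit_i - max_l)
//   p(token)   = normalizer * exp(logit_token - max_l)
#include <algorithm>
#include <cmath>

static float probability_from_logit_sketch(const float * logits, int n_vocab, float logit) {
    const float max_l = *std::max_element(logits, logits + n_vocab);
    float sum = 0.0f;
    for (int i = 0; i < n_vocab; ++i) {
        sum += std::exp(logits[i] - max_l);     // shift by max_l to avoid overflow
    }
    return std::exp(logit - max_l) / sum;
}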
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
size_t common_prefix_length;
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);
void collapse_beams(const size_t beam_idx) ;
void fill_next_beams_by_top_probabilities(llama_beam & beam) ;
size_t find_common_prefix_length() ;
llama_beams_state get_beams_state(const bool last_call) ;
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) ;
size_t top_beam_index();
void update_beams_from_beam_views();
};
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
enum llm_rope_type {
LLM_ROPE,
LLM_ROPE_NEOX,
LLM_ROPE_GLM,
};
enum llm_ffn_op_type {
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
};
enum llm_ffn_gate_type {
LLM_FFN_SEQ,
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
enum llm_norm_type {
LLM_NORM,
LLM_NORM_RMS,
};
struct llm_build_context {
const llama_model & model;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
const llama_kv_cache & kv_self;
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head;
const int64_t n_embd_gqa;
const float freq_base;
const float freq_scale;
const float ext_factor;
const float attn_factor;
const float beta_fast;
const float beta_slow;
const float norm_eps;
const float norm_rms_eps;
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_orig_ctx;
const bool do_rope_shift;
const llm_build_cb & cb;
llama_buffer & buf_compute;
struct ggml_context * ctx0 = nullptr;
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_batch & batch,
const llm_build_cb & cb,
bool worst_case);
void init() ;
void free() ;
struct ggml_cgraph * build_llama() ;
struct ggml_cgraph * build_baichuan() ;
struct ggml_cgraph * build_falcon() ;
struct ggml_cgraph * build_starcoder() ;
struct ggml_cgraph * build_persimmon() ;
struct ggml_cgraph * build_refact() ;
struct ggml_cgraph * build_bloom() ;
struct ggml_cgraph * build_mpt() ;
struct ggml_cgraph * build_stablelm();
};
enum llm_offload_func_e {
OFFLOAD_FUNC_NOP,
OFFLOAD_FUNC,
OFFLOAD_FUNC_KQ,
OFFLOAD_FUNC_V,
OFFLOAD_FUNC_NR,
OFFLOAD_FUNC_EMB,
OFFLOAD_FUNC_OUT,
};
struct llm_offload_trie {
struct node {
~node() ;
node * children[256] = { nullptr };
llm_offload_func_e func = OFFLOAD_FUNC_NOP;
};
node * root = nullptr;
llm_offload_trie();
llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) ;
~llm_offload_trie();
void add(const char * name, llm_offload_func_e func);
llm_offload_func_e find(const char * name) const;
};
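llm_offload_trie maps tensor names to offload decisions, one byte per edge and 256 children per node. A hedged sketch of what a lookup plausibly looks like (the actual add/find definitions are in llama.cpp; trie_find_sketch is an illustrative name):

// Illustrative byte-trie walk: each character of the name selects one child;
// the function stored at the final node is the offload decision, or NOP on a miss.
static llm_offload_func_e trie_find_sketch(const llm_offload_trie::node * root, const char * name) {
    const llm_offload_trie::node * cur = root;
    for (const unsigned char * p = (const unsigned char *) name; cur && *p; ++p) {
        cur = cur->children[*p];
    }
    return cur ? cur->func : OFFLOAD_FUNC_NOP;
}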
struct llm_symbol {
using index = int;
index prev;
index next;
const char * text;
size_t n;
};
struct llm_bigram_spm {
struct comparator {
bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
};
using queue_storage = std::vector<llm_bigram_spm>;
using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
float score;
size_t size;
};
struct llm_tokenizer_spm {
llm_tokenizer_spm(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) ;
void try_add_bigram(int left, int right) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
llm_bigram_spm::queue work_queue;
std::map<std::string, std::pair<int, int>> rev_merge;
};
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
struct llm_bigram_bpe {
struct comparator {
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ;
};
using queue_storage = std::vector<llm_bigram_bpe>;
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
std::string text;
int rank;
size_t size;
};
struct llm_tokenizer_bpe {
llm_tokenizer_bpe(const llama_vocab & vocab);
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);
private:
void add_new_bigram(int left, int right) ;
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) ;
const llama_vocab & vocab;
std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final;
llm_bigram_bpe::queue work_queue;
};
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant{
fragment_buffer_variant(llama_vocab::id _token);
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);
const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
struct quantize_state_internal {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
int n_k_quantized = 0;
int n_fallback = 0;
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};

llama.h

@ -50,7 +50,9 @@
#endif
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif
//
@ -827,8 +829,10 @@ extern "C" {
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
@ -844,4 +848,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H

756
print.hpp Normal file

@ -0,0 +1,756 @@
#include <iostream>
#include "llama.h"
#include "ggml-internal.hpp"
#include "llama-internal.hpp"
REFL_TYPE(ggml_init_params )
REFL_END
REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END
REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END
REFL_TYPE(ggml_opt_context::ggml_grad )
REFL_END
REFL_TYPE(gpt_params )
REFL_FIELD( seed )
REFL_FIELD( n_threads)
REFL_FIELD( n_threads_batch)
REFL_FIELD( n_predict )
REFL_FIELD( n_ctx )
REFL_FIELD( n_batch)
REFL_FIELD( n_keep )
REFL_FIELD( n_draft)
REFL_FIELD( n_chunks )
REFL_FIELD( n_parallel)
REFL_FIELD( n_sequences)
REFL_FIELD( p_accept )
REFL_FIELD( p_split )
REFL_FIELD( n_gpu_layers)
REFL_FIELD( n_gpu_layers_draft)
REFL_FIELD( main_gpu )
REFL_FIELD( tensor_split)
REFL_FIELD( n_beams )
REFL_FIELD(rope_freq_base)
REFL_FIELD( rope_freq_scale )
REFL_FIELD( yarn_ext_factor )
REFL_FIELD( yarn_attn_factor )
REFL_FIELD( yarn_beta_fast )
REFL_FIELD( yarn_beta_slow )
REFL_FIELD( yarn_orig_ctx)
REFL_FIELD( rope_scaling_type)
REFL_FIELD( sparams)
REFL_FIELD(model )
REFL_FIELD(model_draft )
REFL_FIELD(model_alias)
REFL_FIELD(prompt )
REFL_FIELD(prompt_file )
REFL_FIELD(path_prompt_cache )
REFL_FIELD(input_prefix )
REFL_FIELD(input_suffix )
REFL_FIELD( antiprompt)
REFL_FIELD(logdir )
REFL_FIELD( lora_adapter)
REFL_FIELD(lora_base )
REFL_FIELD( ppl_stride )
REFL_FIELD( ppl_output_type )
REFL_FIELD( hellaswag )
REFL_FIELD( hellaswag_tasks )
REFL_FIELD( mul_mat_q )
REFL_FIELD( memory_f16)
REFL_FIELD( random_prompt )
REFL_FIELD( use_color )
REFL_FIELD( interactive )
REFL_FIELD( chatml )
REFL_FIELD( prompt_cache_all )
REFL_FIELD( prompt_cache_ro )
REFL_FIELD( embedding )
REFL_FIELD( escape )
REFL_FIELD( interactive_first )
REFL_FIELD( multiline_input )
REFL_FIELD( simple_io )
REFL_FIELD( cont_batching )
REFL_FIELD( input_prefix_bos )
REFL_FIELD( ignore_eos )
REFL_FIELD( instruct )
REFL_FIELD( logits_all )
REFL_FIELD( use_mmap)
REFL_FIELD( use_mlock )
REFL_FIELD( numa )
REFL_FIELD( verbose_prompt )
REFL_FIELD( infill )
REFL_FIELD(mmproj )
REFL_FIELD( image)
REFL_END
REFL_TYPE(llama_sampling_params)
REFL_END
REFL_TYPE(llm_arch)
REFL_END
REFL_TYPE(llama_sampling_context )
REFL_FIELD( params)
REFL_FIELD( mirostat_mu)
REFL_FIELD( grammar)
REFL_FIELD( parsed_grammar)
REFL_FIELD( prev)
REFL_FIELD( cur)
REFL_END
REFL_TYPE(llama_token_data )
REFL_END
REFL_TYPE(llama_token_data_array )
REFL_END
REFL_TYPE(llama_batch )
REFL_END
REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END
REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END
REFL_TYPE(gguf_init_params)
REFL_END
REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END
REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD( no_alloc)
REFL_FIELD( no_alloc_save)
REFL_FIELD( n_objects)
REFL_FIELD( objects_begin)
REFL_FIELD( objects_end)
REFL_FIELD( scratch)
REFL_FIELD( scratch_save)
REFL_END
REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END
REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END
REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END
REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END
REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END
REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END
REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END
REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END
REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END
REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END
REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END
REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END
REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END
REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END
REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END
REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END
REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END
REFL_TYPE(llama_grammar_element)
REFL_END
REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END
REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END
REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END
REFL_TYPE(ggml_backend)
REFL_END
REFL_TYPE(ggml_backend_buffer)
REFL_END
REFL_TYPE(ggml_allocr)
REFL_END
REFL_TYPE(ggml_tallocr)
REFL_END
REFL_TYPE(ggml_gallocr)
REFL_END
REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END
REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END
REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END
REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END
REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END
REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END
REFL_TYPE(e_model)
REFL_END
REFL_TYPE(llama_ftype)
REFL_END
REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype )
REFL_FIELD( name )
REFL_FIELD( hparams )
REFL_FIELD( vocab)
REFL_FIELD( tok_embd)
REFL_FIELD( pos_embd)
REFL_FIELD( tok_norm)
REFL_FIELD( tok_norm_b)
REFL_FIELD( output_norm)
REFL_FIELD( output_norm_b)
REFL_FIELD( output)
REFL_FIELD( layers)
REFL_FIELD( n_gpu_layers)
REFL_FIELD( gguf_kv) //unordered map
REFL_FIELD( ctx)
REFL_FIELD( buf)
REFL_FIELD( mapping) //std::unique_ptr
REFL_FIELD( mlock_buf)
REFL_FIELD( mlock_mmap)
REFL_FIELD( tensors_by_name)
REFL_FIELD( t_load_us)
REFL_FIELD( t_start_us)
REFL_END
REFL_TYPE(llama_vocab)
REFL_END
REFL_TYPE(grammar_parser::parse_state)
REFL_END
REFL_TYPE(llama_context)
REFL_FIELD( cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) //random numbers
REFL_FIELD(has_evaluated_once )
REFL_FIELD( t_start_us)
REFL_FIELD( t_load_us)
REFL_FIELD( t_sample_us )
REFL_FIELD( t_p_eval_us )
REFL_FIELD( t_eval_us)
REFL_FIELD( n_sample )
REFL_FIELD( n_p_eval )
REFL_FIELD( n_eval )
REFL_FIELD( logits)
REFL_FIELD( logits_all )
REFL_FIELD( embedding)
REFL_FIELD( work_buffer)
REFL_FIELD( buf_compute)
REFL_FIELD( buf_alloc)
REFL_FIELD( alloc )
#ifdef GGML_USE_METAL
REFL_FIELD( ctx_metal )
#endif
#ifdef GGML_USE_MPI
REFL_FIELD( ctx_mpi )
#endif
REFL_END
REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END
REFL_TYPE(llm_build_context)
// REFL_FIELD(model) cannot create pointer to reference member llm_build_context::model
// REFL_FIELD(hparams) cannot create pointer to reference member llm_build_context::hparams
REFL_END
REFL_TYPE(llm_offload_trie)
REFL_END
REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END
REFL_TYPE(llm_bigram_spm)
REFL_END
REFL_TYPE(llm_tokenizer_spm)
REFL_END
REFL_TYPE(llm_bigram_bpe)
REFL_END
REFL_TYPE(llm_tokenizer_bpe)
REFL_END
REFL_TYPE(fragment_buffer_variant)
REFL_END
REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END
REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END
REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END
REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END
REFL_TYPE(llama_logit_info)
REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END
REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END
REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD( n_attention_wv )
REFL_FIELD( n_feed_forward_w2 )
REFL_FIELD( i_attention_wv )
REFL_FIELD( i_feed_forward_w2 )
REFL_FIELD( n_k_quantized )
REFL_FIELD( n_fallback )
REFL_END
REFL_TYPE(llama_data_context)
REFL_END
REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END
REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END
template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
return t.value_type;
}
namespace runtime2
{
using namespace refl;
using namespace refl::descriptor;
template <typename CharT, typename T>
void debug(std::basic_ostream<CharT>& os, const T& value, bool compact = false);
namespace detail
{
template <typename CharT, typename T, typename = decltype(std::declval<std::basic_ostream<CharT>&>() << std::declval<T>())>
std::true_type is_ostream_printable_test(int);
template <typename CharT, typename T>
std::false_type is_ostream_printable_test(...);
template <typename CharT, typename T>
constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test<CharT, T>(0))::value };
namespace
{
[[maybe_unused]] int next_depth(int depth)
{
return depth == -1 || depth > 8
? -1
: depth + 1;
}
}
template <typename CharT>
void indent(std::basic_ostream<CharT>& os, int depth)
{
for (int i = 0; i < depth; i++) {
os << " ";
}
}
template <typename CharT, typename T>
void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth);
template <typename CharT, typename T>
void debug_detailed(std::basic_ostream<CharT>& os, const T& value, int depth)
{
using type_descriptor = type_descriptor<T>;
bool compact = depth == -1;
// print type with members enclosed in braces
os << type_descriptor::name << " { ";
if (!compact) os << '\n';
constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); });
for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) {
int new_depth = next_depth(depth);
indent(os, new_depth);
os << get_display_name(member) << " = ";
if constexpr (util::contains_instance<attr::debug>(member.attributes)) {
// use the debug attribute to print
auto debug_attr = util::get_instance<attr::debug>(member.attributes);
debug_attr.write(os, value);
}
else {
debug_impl(os, member(value), new_depth);
}
if (!compact || index + 1 != readable_members.size) {
os << ", ";
}
if (!compact) {
indent(os, depth);
os << '\n';
}
});
if (compact) os << ' ';
indent(os, depth);
os << '}';
}
template <typename CharT, typename T>
void debug_reflectable(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
{
using type_descriptor = type_descriptor<T>;
if constexpr (trait::contains_instance_v<attr::debug, typename type_descriptor::attribute_types>) {
// use the debug attribute to print
auto debug_attr = util::get_instance<attr::debug>(type_descriptor::attributes);
debug_attr.write(os, value);
}
else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
// type supports printing natively, just use that
os << value;
}
else {
debug_detailed(os, value, depth);
}
}
template <typename CharT, typename T>
void debug_container(std::basic_ostream<CharT>& os, const T& value, int depth)
{
bool compact = depth == -1;
os << "[";
auto end = value.end();
for (auto it = value.begin(); it != end; ++it)
{
if (!compact) os << '\n';
int new_depth = next_depth(depth);
indent(os, new_depth);
debug_impl(os, *it, new_depth);
if (std::next(it, 1) != end) {
os << ", ";
}
else if (!compact) {
os << '\n';
}
}
indent(os, depth);
os << "]";
}
template <typename CharT, typename T>
void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
{
using no_pointer_t = std::remove_pointer_t<T>;
if constexpr (std::is_same_v<bool, T>) {
os << (value ? "true" : "false");
}
else if constexpr (std::is_pointer_v<T> && !std::is_void_v<no_pointer_t> && trait::is_reflectable_v<no_pointer_t>) {
if (value == nullptr) {
os << "nullptr";
}
else {
os << '&';
debug_impl(os, *value, -1);
}
}
else if constexpr (trait::is_reflectable_v<T>) {
debug_reflectable(os, value, depth);
}
else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
os << value;
}
else if constexpr (trait::is_container_v<T>) {
debug_container(os, value, depth);
}
else {
os << "(not printable)";
}
}
}
/**
* Writes the debug representation of value to the given std::ostream.
* Calls the function specified by the debug<F> attribute whenever possible,
* before falling back to recursively iterating the members and printing them.
* Takes an optional argument specifying whether to print a compact representation.
* The compact representation contains no newlines.
*/
template <typename CharT, typename T>
void debug(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] bool compact)
{
static_assert(trait::is_reflectable_v<T> || trait::is_container_v<T> || detail::is_ostream_printable_v<CharT, T>,
"Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!");
detail::debug_impl(os, value, compact ? -1 : 0);
}
/**
* Writes the compact debug representation of the provided values to the given std::ostream.
*/
template <typename CharT, typename... Ts>
void debug_all(std::basic_ostream<CharT>& os, const Ts&... values)
{
refl::runtime::debug(os, std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
}
/**
* Writes the debug representation of the provided value to an std::string and returns it.
* Takes an optional argument specifying whether to print a compact representation.
* The compact representation contains no newlines.
*/
template <typename CharT = char, typename T>
std::basic_string<CharT> debug_str(const T& value, bool compact = false)
{
std::basic_stringstream<CharT> ss;
debug(ss, value, compact);
return ss.str();
}
/**
* Writes the compact debug representation of the provided values to an std::string and returns it.
*/
template <typename CharT = char, typename... Ts>
std::basic_string<CharT> debug_all_str(const Ts&... values)
{
return refl::runtime::debug_str(std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
}
}
// // A generic function to print out the fields of any object
template<typename T>
void print_fields(const T& t) {
runtime2::debug(std::cout, t);
constexpr auto type = refl::reflect<T>();
constexpr auto membertype = refl::member_list<T>();
constexpr auto members = get_members(type);
std::cout << "DEBUG Type: " << type.name.c_str() << "\n";
std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n";
std::cout << "DEBUG Type3: " << typeid(members).name() << "\n";
refl::util::for_each(members, [&](auto member) {
//using member_t = decltype(member::value_type);
//typename type3 = member::value_type;
//typename trait::remove_qualifiers_t<member_t>::value_type>;
//constexpr auto type2 = refl::reflect(type3);
//std::cout << "Auto:" << foo <<"\n";
std::cout << "Auto:" << member.name <<"\n";
//std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n";
//std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n";
});
std::cout << "\n";
}
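print.hpp is consumed by registering a type with REFL_TYPE/REFL_FIELD and calling print_fields on an instance. A hedged usage sketch with a hypothetical struct (assumes print.hpp, and the refl-cpp header that provides the registration macros, are already included):

// Illustrative only: demo_params is not part of the commit.
struct demo_params {
    int   n_ctx = 512;
    float temp  = 0.8f;
};
REFL_TYPE(demo_params)
REFL_FIELD(n_ctx)
REFL_FIELD(temp)
REFL_END

int main() {
    demo_params p;
    print_fields(p);   // dumps the type name and each registered field to std::cout
    return 0;
}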

tests/CMakeLists.txt

@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
llama_build_and_test_executable(test-rope.cpp)
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
add_executable(${TEST_TARGET} test-c.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE llama)