commit 3faef69427 (parent 04814e718e)

    still not working
    ready to rebase working

18 changed files with 2818 additions and 520 deletions
.gitignore (vendored): 3 changes
@@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
+/#llama.cpp#
+#*
+\\#*
CMakeLists.txt:

@@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 # Compile flags
 #

-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
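Note on the standard bump, not part of the diff itself: the C sources being ported rely on C99 designated initializers, and C++ only accepts the `.field = value` form (in declaration order) starting with C++20, which is likely why CMAKE_CXX_STANDARD moves from 11 to 20 here. A minimal sketch with illustrative names, compiled with -std=c++20:

    // Accepted by C99 and C++20; rejected by C++11/14/17.
    struct traits_demo {
        const char * type_name;
        int          blck_size;
    };

    static traits_demo demo = { .type_name = "i8", .blck_size = 1 };

    int main() { return demo.blck_size; }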
@@ -230,7 +230,12 @@ if (LLAMA_BLAS)

         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
-        add_compile_definitions(GGML_USE_OPENBLAS)
+        # from https://github.com/NVIDIA/cutlass
+        make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
+        set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
+
+        # add_compile_definitions(GGML_USE_OPENBLAS)
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
@@ -312,7 +317,7 @@ if (LLAMA_MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
         if (NOT MSVC)
@@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "")
     set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()

+#
+set(cuda_flags --verbose -G ${cuda_flags})
+
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")

 if (WIN32)
@@ -485,6 +493,8 @@ if (NOT MSVC)
        add_link_options(-static-libgcc -static-libstdc++)
    endif()
 endif()
+add_link_options("-Wl,-Map=${TARGET}.map")
+
 if (LLAMA_GPROF)
     add_compile_options(-pg)
 endif()
@@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM)
 endif()

 add_library(ggml OBJECT
-            ggml.c
+            ggml.cpp
             ggml.h
-            ggml-alloc.c
+            print.hpp
+            ggml-internal.hpp
+            llama-internal.hpp
+            ggml-alloc.cpp
             ggml-alloc.h
-            ggml-backend.c
+            ggml-backend.cpp
             ggml-backend.h
-            ggml-quants.c
+            ggml-quants.cpp
             ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@@ -683,7 +696,7 @@ add_library(llama
             )

 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features(llama PUBLIC cxx_std_20) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
Makefile: 24 changes
@@ -116,7 +116,7 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS   = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 endif # LLAMA_METAL

 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
@@ -537,17 +537,17 @@ $(info )
 # Build library
 #

-ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml.o: ggml.cpp ggml.h ggml-cuda.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-c.o: tests/test-c.c llama.h
-	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+tests/test-c.o: tests/test-c.cpp llama.h
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
binding.py: new file, 334 lines
@@ -0,0 +1,334 @@
import os
import json
import re
import clang.cindex

# configurable part

CLANG_VERSION = '13.0.1'
# homebrew installs for llvm (brew info llvm gives details):
# x64: /usr/local/opt/llvm/lib
# arm64: /opt/homebrew/opt/llvm/lib
llvmLibPath = "/usr/lib/llvm-15/lib/"

cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"

fileList = [
    "ggml.cpp",
    "llama.cpp"
]

typeList = [
]

# end of configurable part

clang.cindex.Config.set_library_path(llvmLibPath)


def list_headers_in_dir(path):
    # enumerates a folder but keeps the full pathing for the files returned
    # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx)

    # list all the files in the folder
    files = os.listdir(path)
    # only include .hxx files
    files = list(filter(lambda x: x.endswith('.hxx'), files))
    # add the folder path back on
    files = list(map(lambda x: path + x, files))
    return files


# parse through the list of files specified and expand wildcards
fullFileList = []
for filePath in fileList:
    if "*" in filePath:
        # wildcard path
        basePath = filePath[:-1]
        if "*" in basePath:
            # if there is still a wildcard, we have an issue...
            raise NotImplementedError(
                "wildcard only supported at end of file path")
        files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath))
        fullFileList = fullFileList + files
    else:
        # normal path
        ff = os.path.join(cxxClientRoot, filePath)
        fullFileList.append(ff)
        print("DBUG", ff)

# exclude _json.hxx files
fullFileList = list(
    filter(lambda x: not x.endswith('_json.hxx'), fullFileList))
# exclude _fmt.hxx files
fullFileList = list(
    filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList))


# generate a list of regexps from the type list (for handling wildcards)
typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList))


def is_included_type(name, with_durability=False):

    # TODO(brett19): This should be generalized somehow...
    if "is_compound_operation" in name:
        return False

    if "replica_context" in name:
        return False

    if with_durability is True and '_with_legacy_durability' not in name:
        return False

    for x in typeListRe:
        if re.fullmatch(x, name):
            return True
    return False


opTypes = []
opEnums = []


def parse_type(type):
    typeStr = type.get_canonical().spelling
    return parse_type_str(typeStr)


std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"]


def parse_type_str(typeStr):
    if typeStr == "std::mutex":
        return {"name": "std::mutex"}
    if typeStr == "std::string":
        return {"name": "std::string"}
    if typeStr == "std::chrono::duration<long long>":
        return {"name": "std::chrono::seconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000>>":
        return {"name": "std::chrono::milliseconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000>>":
        return {"name": "std::chrono::microseconds"}
    if typeStr == "std::chrono::duration<long long, std::ratio<1, 1000000000>>":
        return {"name": "std::chrono::nanoseconds"}
    if typeStr == "std::error_code":
        return {"name": "std::error_code"}
    if typeStr == "std::monostate":
        return {"name": "std::monostate"}
    if typeStr == "std::byte":
        return {"name": "std::byte"}
    if typeStr == "unsigned long":
        return {"name": "std::size_t"}
    if typeStr == "char":
        return {"name": "std::int8_t"}
    if typeStr == "unsigned char":
        return {"name": "std::uint8_t"}
    if typeStr == "short":
        return {"name": "std::int16_t"}
    if typeStr == "unsigned short":
        return {"name": "std::uint16_t"}
    if typeStr == "int":
        return {"name": "std::int32_t"}
    if typeStr == "unsigned int":
        return {"name": "std::uint32_t"}
    if typeStr == "long long":
        return {"name": "std::int64_t"}
    if typeStr == "unsigned long long":
        return {"name": "std::uint64_t"}
    if typeStr == "bool":
        return {"name": "std::bool"}
    if typeStr == "float":
        return {"name": "std::float"}
    if typeStr == "double":
        return {"name": "std::double"}
    if typeStr == "std::nullptr_t":
        return {"name": "std::nullptr_t"}
    if typeStr in std_comparators:
        return {"name": typeStr}

    tplParts = typeStr.split("<", 1)
    if len(tplParts) > 1:
        tplClassName = tplParts[0]
        tplParams = tplParts[1][:-1]
        if tplClassName == "std::function":
            return {
                "name": "std::function"
            }
        if tplClassName == "std::optional":
            return {
                "name": "std::optional",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::vector":
            return {
                "name": "std::vector",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::set":
            return {
                "name": "std::set",
                "of": parse_type_str(tplParams)
            }
        if tplClassName == "std::variant":
            variantParts = tplParams.split(", ")
            variantTypes = []
            for variantPart in variantParts:
                variantTypes.append(parse_type_str(variantPart))
            return {
                "name": "std::variant",
                "of": variantTypes
            }
        if tplClassName == "std::array":
            variantParts = tplParams.split(", ")
            if len(variantParts) != 2:
                print("FAILED TO PARSE ARRAY TYPES: " + typeStr)
                return {"name": "unknown", "str": typeStr}
            return {
                "name": "std::array",
                "of": parse_type_str(variantParts[0]),
                "size": int(variantParts[1])
            }
        if tplClassName == "std::map":
            variantParts = tplParams.split(", ")
            if len(variantParts) < 2 or len(variantParts) > 3:
                print("FAILED TO PARSE MAP TYPES: " + typeStr)
                return {"name": "unknown", "str": typeStr}

            if len(variantParts) == 2:
                return {
                    "name": "std::map",
                    "of": parse_type_str(variantParts[0]),
                    "to": parse_type_str(variantParts[1])
                }
            else:
                return {
                    "name": "std::map",
                    "of": parse_type_str(variantParts[0]),
                    "to": parse_type_str(variantParts[1]),
                    "comparator": parse_type_str(variantParts[2])
                }

        if tplClassName == "std::shared_ptr":
            return {
                "name": "std::shared_ptr",
                "of": parse_type_str(tplParams)
            }

    #return {"name": "unknown", "str": typeStr}

    if 'unnamed struct' in typeStr:
        print("WARNING: Found unnamed struct: " + typeStr)

    return {"name": typeStr}


internal_structs = []
UNNAMED_STRUCT_DELIM = '::(unnamed struct'


def traverse(node, namespace, main_file):
    # only scan the elements of the file we parsed
    #print("FILE", node.location.file )

    if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
        fullStructName = "::".join([*namespace, node.displayname])
        print("REFL_TYPE(" + fullStructName + ")")

        structFields = []
        for child in node.get_children():
            if child.kind == clang.cindex.CursorKind.FIELD_DECL:
                struct_type = parse_type(child.type)
                type_str = child.type.get_canonical().spelling
                print(" REFL_FIELD(" + child.displayname + ")")
                if 'unnamed' in type_str:
                    name_tokens = type_str.split('::')
                    name_override = '::'.join(name_tokens[:-1] + [child.displayname])
                    struct_type['name'] = name_override
                    internal_structs.append(name_override)

                structFields.append({
                    "name": child.displayname,
                    "type": struct_type,
                })
        # replica read changes introduced duplicate get requests
        if any(map(lambda op: op['name'] == fullStructName, opTypes)):
            return

        opTypes.append({
            "name": fullStructName,
            "fields": structFields,
        })
        print("REFL_END")

    if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
        fullStructName = "::".join([*namespace, node.displayname])
        if is_included_type(fullStructName, with_durability=True):
            type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None)
            if type_ref:
                base_request_name = type_ref.displayname.replace('struct', '').strip()
                base_request = next((op for op in opTypes if op['name'] == base_request_name), None)
                if base_request:
                    new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level']
                    new_fields.extend([
                        {"name": "persist_to", "type": {"name": "couchbase::persist_to"}},
                        {"name": "replicate_to", "type": {"name": "couchbase::replicate_to"}}
                    ])

                    opTypes.append({
                        "name": fullStructName,
                        "fields": new_fields
                    })
    if node.kind == clang.cindex.CursorKind.ENUM_DECL:
        fullEnumName = "::".join([*namespace, node.displayname])
        if is_included_type(fullEnumName):
            enumValues = []

            for child in node.get_children():
                if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL:
                    enumValues.append({
                        "name": child.displayname,
                        "value": child.enum_value,
                    })
            opEnums.append({
                "name": fullEnumName,
                "type": parse_type(node.enum_type),
                "values": enumValues,
            })

    if node.kind == clang.cindex.CursorKind.NAMESPACE:
        namespace = [*namespace, node.displayname]
    if node.kind == clang.cindex.CursorKind.CLASS_DECL:
        namespace = [*namespace, node.displayname]
    if node.kind == clang.cindex.CursorKind.STRUCT_DECL:
        namespace = [*namespace, node.displayname]

    for child in node.get_children():
        traverse(child, namespace, main_file)


for headerPath in fullFileList:
    print("processing " + headerPath)
    index = clang.cindex.Index.create()
    args = [
        '-std=c++17',
    ]

    try:
        translation_unit = index.parse(headerPath, args=args)
    except Exception as e:
        print(e)
        import pdb
        pdb.set_trace()
        raise e

    # output clang compiler diagnostics information (for debugging)
    for diagnostic in translation_unit.diagnostics:
        diagnosticMsg = diagnostic.format()
        print(diagnostic)

    traverse(translation_unit.cursor, [], headerPath)

jsonData = json.dumps({
    'op_structs': opTypes,
    'op_enums': opEnums
})

f = open("bindings.json", "w")
f.write(jsonData)
f.close()
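For orientation (an editor's sketch, not part of the commit): binding.py walks the clang AST of ggml.cpp and llama.cpp, prints refl-cpp style registration macros for each struct it visits, and writes the collected struct and enum metadata to bindings.json. The printed text can be consumed by macros along these lines; the macro bodies below are illustrative stand-ins, not refl-cpp's real definitions:

    #include <cstdio>

    // Stand-in macros that turn generated REFL_* lines into a function
    // printing the struct layout (illustrative only):
    #define REFL_TYPE(T)  void describe_##T() { std::puts(#T);
    #define REFL_FIELD(F)     std::puts("  field: " #F);
    #define REFL_END      }

    struct my_params { int n_vocab; int n_embd; };

    REFL_TYPE(my_params)
    REFL_FIELD(n_vocab)
    REFL_FIELD(n_embd)
    REFL_END

    int main() { describe_my_params(); }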
ggml-alloc.cpp:

@@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {

 void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
     free(galloc->parse_seq);
-    galloc->parse_seq = malloc(sizeof(int) * n);
+    galloc->parse_seq = (int*)malloc(sizeof(int) * n);

     for (int i = 0; i < n; i++) {
         galloc->parse_seq[i] = list[i];
@@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
         if (galloc->hash_values != NULL) {
             free(galloc->hash_values);
         }
-        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+        galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size);
         galloc->hash_set.size = hash_size;
-        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
     }

     // reset hash table
@@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
     // alloc hash_values if needed
     if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
         free(galloc->hash_values);
-        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size);
         galloc->hash_values_size = hash_size;
     }
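Background on the casts in these hunks: C implicitly converts `void *` (the return type of `malloc`) to any object pointer type, but C++ requires an explicit conversion, so every allocation site stops compiling once the file is built as C++. A minimal sketch with a demo type:

    #include <cstdlib>

    struct hash_node_demo { int n_children; int n_views; };

    int main() {
        // In C, malloc needs no cast; compiled as C++ the explicit cast
        // below is mandatory, matching the hunks above.
        hash_node_demo * values =
            (hash_node_demo *) std::malloc(16 * sizeof(hash_node_demo));
        std::free(values);
        return 0;
    }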
ggml-backend.cpp:

@@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         struct ggml_backend_buffer_i iface,
         ggml_backend_buffer_context_t context,
         size_t size) {
-    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+    ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer));

     GGML_ASSERT(iface.get_base != NULL);
@@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     // TODO: allow backends to support copy to/from same backend

     if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
-        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
+        ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst);
     } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
-        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
+        ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst);
     } else {
         // shouldn't be hit when copying from/to CPU
 #ifndef NDEBUG
@@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu {
 static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

-    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+    struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu));

     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
     cpu_plan->cgraph = *cgraph;

     if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size);
     }

     return cpu_plan;
@@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
         cpu_ctx->work_size = cplan.work_size;
     }

-    cplan.work_data = cpu_ctx->work_data;
+    cplan.work_data = (uint8_t*)cpu_ctx->work_data;

     ggml_graph_compute(cgraph, &cplan);
 }
@@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = {
 };

 ggml_backend_t ggml_backend_cpu_init(void) {
-    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context));

     ctx->n_threads = GGML_DEFAULT_N_THREADS;
     ctx->work_data = NULL;
     ctx->work_size = 0;

-    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));

     *cpu_backend = (struct ggml_backend) {
         /* .interface = */ cpu_backend_i,
@@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
     GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);

-    struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
+    struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched));
     memset(sched, 0, sizeof(struct ggml_backend_sched));

     fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
@@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
     // initialize hash tables
     size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
     sched->hash_set.size = hash_size;
-    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
-    sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
-    sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+    sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
+    sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size);
+    sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size);

     sched_split_graph(sched, measure_graph);
     sched_alloc_splits(sched);
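The `(ggml_tensor *(*)[4])` cast above is a pointer to an array of four tensor pointers, i.e. one fixed-size group of copies per graph node (the 4 presumably matching GGML_MAX_BACKENDS). A small sketch of how such a pointer is allocated and indexed, using a demo type:

    #include <cstdlib>

    struct tensor_demo { int id; };

    int main() {
        // 8 nodes, each with space for 4 tensor-pointer copies.
        tensor_demo * (*copies)[4] =
            (tensor_demo * (*)[4]) std::malloc(8 * sizeof(*copies));
        copies[0][0] = nullptr;  // node 0, copy slot 0
        std::free(copies);
        return 0;
    }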
@@ -22,7 +22,7 @@ extern "C" {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #else
-#define static_assert(cond, msg) struct global_scope_noop_trick
+//#define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 #endif
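Context for commenting out the fallback: `static_assert` has been a keyword since C++11, so defining a macro with that name breaks C++ builds; the `_Static_assert` mapping is only meaningful when the header is compiled as C. A sketch of a guard that keeps both languages working (illustrative, not the file's actual code):

    // Only define the shim when compiling as C without the keyword.
    #if !defined(__cplusplus) && !defined(static_assert)
    #define static_assert(cond, msg) _Static_assert(cond, msg)
    #endif

    static_assert(sizeof(int) >= 2, "int is at least 2 bytes");

    int main() { return 0; }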
ggml-internal.hpp: new file, 258 lines
@@ -0,0 +1,258 @@
struct ggml_context {
    size_t mem_size;
    void * mem_buffer;
    bool   mem_buffer_owned;
    bool   no_alloc;
    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers

    int    n_objects;

    struct ggml_object * objects_begin;
    struct ggml_object * objects_end;

    struct ggml_scratch scratch;
    struct ggml_scratch scratch_save;

    ggml_context():
        mem_size(0),
        mem_buffer(0),
        mem_buffer_owned(0),
        no_alloc(0),
        no_alloc_save(0),
        n_objects(0),
        objects_begin(0),
        objects_end(0),
        scratch(),
        scratch_save()
    {
    }
};

struct ggml_context_container {
    bool used;

    struct ggml_context context;

    ggml_context_container(): used(0), context() {
    }
};

typedef double ggml_float;
typedef void * thread_ret_t;

#define MAX_FREE_BLOCKS 256

struct free_block {
    void * addr;
    size_t size;
};

struct ggml_tallocr {
    struct ggml_backend_buffer * buffer;
    bool buffer_owned;
    void * base;
    size_t alignment;

    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];

    size_t max_size;

    bool measure;

#ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];
#endif
};

struct hash_node {
    int n_children;
    int n_views;
};

typedef struct ggml_tallocr * ggml_tallocr_t;
typedef struct ggml_gallocr * ggml_gallocr_t;

struct ggml_gallocr {
    ggml_tallocr_t talloc;
    struct ggml_hash_set hash_set;
    struct hash_node * hash_values;
    size_t hash_values_size;
    ggml_tallocr_t * hash_allocs;
    int * parse_seq;
    int parse_seq_len;
};

struct ggml_allocr {
    ggml_tallocr_t talloc;
    ggml_gallocr_t galloc;
};

#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;
};

struct ggml_numa_nodes {
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
    uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
};

struct ggml_state {
    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
    struct ggml_numa_nodes numa;

    ggml_state(): contexts(), numa()
    {
    }
};

struct gguf_str {
    uint64_t n;  // GGUFv2
    char * data;
};

struct ggml_map_custom1_op_params {
    ggml_custom1_op_t fun;
    int n_tasks;
    void * userdata;
};

struct ggml_map_custom2_op_params {
    ggml_custom2_op_t fun;
    int n_tasks;
    void * userdata;
};

struct ggml_map_custom3_op_params {
    ggml_custom3_op_t fun;
    int n_tasks;
    void * userdata;
};

struct hash_map {
    struct ggml_hash_set set;
    struct ggml_tensor ** vals;
};

#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include <atomic>
using namespace std;
#endif

struct ggml_compute_state_shared {
    const struct ggml_cgraph * cgraph;
    const struct ggml_cplan * cplan;

    int64_t perf_node_start_cycles;
    int64_t perf_node_start_time_us;

    const int n_threads;

    // synchronization primitives
    atomic_int n_active; // num active threads
    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
};

typedef pthread_t ggml_thread_t;

struct ggml_compute_state {
    ggml_thread_t thrd;
    int ith;
    struct ggml_compute_state_shared * shared;
};

union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
    uint16_t uint16;
    int16_t  int16;
    uint32_t uint32;
    int32_t  int32;
    float    float32;
    uint64_t uint64;
    int64_t  int64;
    double   float64;
    bool     bool_;

    struct gguf_str str;

    struct gguf_array_T {
        enum gguf_type type;

        uint64_t n;  // GGUFv2
        void * data;
    } arr;
};

struct ggml_lbfgs_iteration_data {
    float alpha;
    float ys;
    float * s;
    float * y;
};

struct gguf_kv {
    struct gguf_str key;

    enum  gguf_type  type;
    union gguf_value value;
};

struct gguf_header {
    char magic[4];
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};

struct gguf_tensor_info {
    struct gguf_str name;

    uint32_t n_dims;
    uint64_t ne[GGML_MAX_DIMS];

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

struct gguf_context {
    struct gguf_header header;

    struct gguf_kv * kv;
    struct gguf_tensor_info * infos;

    size_t alignment;
    size_t offset; // offset of `data` from beginning of file
    size_t size;   // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};

struct gguf_buf {
    void * data;
    size_t size;
    size_t offset;
};

#include "ggml-backend-impl.h"
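A note on the constructors in this header: the C code these structs came from typically zeroed them through `calloc`/`memset`, and the added all-zero constructors make that guarantee explicit for C++ callers. Default member initializers are an equivalent, more compact form; a sketch with a demo type:

    #include <cstddef>

    struct scratch_demo {
        size_t offs = 0;        // zeroed on every construction
        size_t size = 0;
        void * data = nullptr;
    };

    int main() {
        scratch_demo s;  // members are zeroed without a hand-written ctor
        return static_cast<int>(s.offs);
    }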
(One file's diff is suppressed because it is too large.)
ggml-quants.h:

@@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_


 // Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
+void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
+void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
+void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
+void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
+void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);

-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
+void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
+void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
+void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
+void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
+void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);

-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);

-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);

 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);

-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);

 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);

-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
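`restrict` is a C99 keyword that C++ never adopted; GCC, Clang, and MSVC all accept `__restrict__`/`__restrict` extensions in both languages, which is what this hunk switches to (the hunks below use a `GGML_RESTRICT` macro instead). A portable sketch of such a macro, assuming GCC/Clang/MSVC:

    #ifdef _MSC_VER
    #define GGML_RESTRICT __restrict
    #else
    #define GGML_RESTRICT __restrict__
    #endif

    // The qualifier promises the optimizer that s, x and y never alias:
    static void vec_add(int n, float * GGML_RESTRICT s,
                        const float * GGML_RESTRICT x,
                        const float * GGML_RESTRICT y) {
        for (int i = 0; i < n; ++i) s[i] = x[i] + y[i];
    }

    int main() {
        float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, out[4];
        vec_add(4, out, a, b);
        return static_cast<int>(out[0]);
    }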
ggml.cpp:

@@ -38,6 +38,14 @@
 #pragma warning(disable: 4996)
 #endif

+// initializers for static data called in the ggml_init function
+static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
+static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
+
+void type_traits_init();
+void GGUF_TYPE_SIZE_init();
+void GGUF_TYPE_NAME_init();
+
 #if defined(_WIN32)

 #include <windows.h>
@@ -86,7 +94,9 @@ static int sched_yield (void) {
 }
 #else
 #include <pthread.h>
-#include <stdatomic.h>
+//#include <stdatomic.h>
+#include <atomic>
+using namespace std;

 typedef void * thread_ret_t;
@@ -96,6 +106,8 @@ typedef void * thread_ret_t;

 #endif

+#include <atomic>
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
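`<stdatomic.h>` is a C11 header that C++ translation units generally cannot use; the replacement pulls in `<atomic>` with `using namespace std;` so the existing `atomic_int` spellings keep compiling. A narrower alternative (a sketch, not what the diff does) is to alias just the names needed:

    #include <atomic>

    using atomic_int  = std::atomic<int>;
    using atomic_bool = std::atomic<bool>;

    int main() {
        atomic_int n_active{0};
        n_active.fetch_add(1, std::memory_order_relaxed);
        return n_active.load();
    }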
@@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {

 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
+static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);

-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
+static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
+void type_traits_init(){
+    type_traits[GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,
         .type_size = sizeof(int8_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I16] = {
+    };
+    type_traits[GGML_TYPE_I16] = {
         .type_name = "i16",
         .blck_size = 1,
         .type_size = sizeof(int16_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I32] = {
+    };
+    type_traits[GGML_TYPE_I32] = {
         .type_name = "i32",
         .blck_size = 1,
         .type_size = sizeof(int32_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_F32] = {
+    };
+    type_traits[GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
         .type_size = sizeof(float),
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
+    };
+    type_traits[GGML_TYPE_F16] = {
         .type_name = "f16",
         .blck_size = 1,
         .type_size = sizeof(ggml_fp16_t),
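Why the table becomes a function: the original file-scope initializer relied on C99 array designators (`[GGML_TYPE_I8] = { ... }`), which no C++ standard supports, so the entries are now assigned at runtime inside `type_traits_init()` (declared earlier and, per the comment in the @@ -38,6 hunk, called from the ggml_init path). A compilable sketch of the same pattern with demo types, using -std=c++20 for the field designators:

    struct traits_demo { const char * type_name; int blck_size; };

    enum demo_type { DEMO_I8 = 0, DEMO_I16 = 1, DEMO_COUNT = 2 };

    static traits_demo table[DEMO_COUNT];

    // Runtime replacement for C99 `[DEMO_I8] = {...}` designators.
    void table_init() {
        table[DEMO_I8]  = { .type_name = "i8",  .blck_size = 1 };
        table[DEMO_I16] = { .type_name = "i16", .blck_size = 1 };
    }

    int main() {
        table_init();
        return table[DEMO_I8].blck_size;
    }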
@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
.vec_dot_type = GGML_TYPE_F16,
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q4_0] = {
|
type_traits[GGML_TYPE_Q4_0] = {
|
||||||
.type_name = "q4_0",
|
.type_name = "q4_0",
|
||||||
.blck_size = QK4_0,
|
.blck_size = QK4_0,
|
||||||
.type_size = sizeof(block_q4_0),
|
.type_size = sizeof(block_q4_0),
|
||||||
|
@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q4_1] = {
|
type_traits[GGML_TYPE_Q4_1] = {
|
||||||
.type_name = "q4_1",
|
.type_name = "q4_1",
|
||||||
.blck_size = QK4_1,
|
.blck_size = QK4_1,
|
||||||
.type_size = sizeof(block_q4_1),
|
.type_size = sizeof(block_q4_1),
|
||||||
|
@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
},
|
};
|
||||||
[4] = { // GGML_TYPE_Q4_2
|
type_traits[4] = { // GGML_TYPE_Q4_2
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
.blck_size = 0,
|
.blck_size = 0,
|
||||||
.type_size = 0,
|
.type_size = 0,
|
||||||
|
@ -482,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
},
|
};
|
||||||
[5] = { // GGML_TYPE_Q4_3
|
type_traits[5] = { // GGML_TYPE_Q4_3
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
.blck_size = 0,
|
.blck_size = 0,
|
||||||
.type_size = 0,
|
.type_size = 0,
|
||||||
|
@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q5_0] = {
|
type_traits[GGML_TYPE_Q5_0] = {
|
||||||
.type_name = "q5_0",
|
.type_name = "q5_0",
|
||||||
.blck_size = QK5_0,
|
.blck_size = QK5_0,
|
||||||
.type_size = sizeof(block_q5_0),
|
.type_size = sizeof(block_q5_0),
|
||||||
|
@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
},
|
};
|
||||||
[GGML_TYPE_Q5_1] = {
|
type_traits[GGML_TYPE_Q5_1] = {
|
||||||
.type_name = "q5_1",
|
.type_name = "q5_1",
|
||||||
.blck_size = QK5_1,
|
.blck_size = QK5_1,
|
||||||
.type_size = sizeof(block_q5_1),
|
.type_size = sizeof(block_q5_1),
|
||||||
|
@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
         .vec_dot = ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
+    };
+    type_traits[GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
         .blck_size = QK8_0,
         .type_size = sizeof(block_q8_0),
@@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
         .vec_dot = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
+    };
+    type_traits[GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
         .blck_size = QK8_1,
         .type_size = sizeof(block_q8_1),
@@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q2_K] = {
+    };
+    type_traits[GGML_TYPE_Q2_K] = {
         .type_name = "q2_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q2_K),
@@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
         .vec_dot = ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q3_K] = {
+    };
+    type_traits[GGML_TYPE_Q3_K] = {
         .type_name = "q3_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q3_K),
@@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
         .vec_dot = ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q4_K] = {
+    };
+    type_traits[GGML_TYPE_Q4_K] = {
         .type_name = "q4_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q4_K),
@@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
         .vec_dot = ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q5_K] = {
+    };
+    type_traits[GGML_TYPE_Q5_K] = {
         .type_name = "q5_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q5_K),
@@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
         .vec_dot = ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q6_K] = {
+    };
+    type_traits[GGML_TYPE_Q6_K] = {
         .type_name = "q6_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q6_K),
@@ -590,15 +604,15 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q8_K] = {
+    };
+    type_traits[GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q8_K),
         .is_quantized = true,
         .from_float = quantize_row_q8_K,
-    }
-};
+    };
+}
 
 // For internal test use
 ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
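Note on the hunks above: this is the crux of the C-to-C++ port. C99 array designators (`[GGML_TYPE_Q8_0] = { ... }`) are not valid C++, so the formerly `static const` table is now a mutable array filled in by a `type_traits_init()` function at startup; C++20 still accepts the in-order member designators inside each entry. A minimal self-contained sketch of the pattern (stand-in types and values; the real table lives in ggml.cpp):

#include <cstddef>

enum ggml_type { GGML_TYPE_Q8_0, GGML_TYPE_COUNT };

struct ggml_type_traits_t {
    const char * type_name;
    int          blck_size;
};

// C99 would allow:
//   static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] =
//       { [GGML_TYPE_Q8_0] = { .type_name = "q8_0", .blck_size = 32 } };
// C++ has no array designators, so the table becomes mutable and is
// populated at runtime instead:
static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];

static void type_traits_init(void) {
    type_traits[GGML_TYPE_Q8_0] = {
        .type_name = "q8_0",
        .blck_size = 32, // QK8_0 in the real code
    };
}

The cost is that the table is no longer in read-only storage and is all-zero until the init function runs.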
@@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) {
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) {
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest
 
 // compute GGML_VEC_DOT_UNROLL dot products at once
 // xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
     ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
 
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
 
     for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
     }
 }
 
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 }
 
 // xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
 
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
 
     for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
         x[i] = (const float *) ((const char *) xv + i*xs);
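These hunks exist because `restrict` is a C99 keyword with no C++ equivalent; the GGML_RESTRICT macro carries the same no-aliasing promise across both languages so the hot vector loops stay vectorizable. One common portability shim looks like the following (a sketch, not exactly what this branch does: the ggml.h hunk further down keeps GGML_RESTRICT empty under C++ and switches the C branch to GCC's `__restrict__`):

// s, x and y are promised never to overlap, so the compiler may keep
// partial sums in registers and vectorize without reload barriers.
#ifdef __cplusplus
#define GGML_RESTRICT __restrict__   // GCC/Clang extension spelling
#else
#define GGML_RESTRICT restrict       // the C99 keyword
#endif

static void vec_dot(const int n, float * GGML_RESTRICT s,
                    const float * GGML_RESTRICT x,
                    const float * GGML_RESTRICT y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i]*y[i];
    *s = sum;
}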
@@ -2175,11 +2189,19 @@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
+    // initialize the data in the arrays
+    type_traits_init();
+    GGUF_TYPE_SIZE_init();
+    GGUF_TYPE_NAME_init();
+
+    struct ggml_context * ctx = NULL;
+    static bool is_first_call = true;
     // make this function thread safe
     ggml_critical_section_start();
 
-    static bool is_first_call = true;
-
     if (is_first_call) {
         // initialize time system (required on Windows)
@@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }
 
     // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
 
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
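Ordering note on the hunk above: as committed, the three `*_init()` calls run on every `ggml_init()` invocation and before `ggml_critical_section_start()`, so two threads entering `ggml_init()` concurrently could both be writing the tables (harmless only because the assignments are idempotent). A guarded variant would look like this (a sketch against the diff's own helpers, not the committed code):

struct ggml_context * ggml_init(struct ggml_init_params params) {
    // make this function thread safe
    ggml_critical_section_start();

    static bool tables_ready = false;
    if (!tables_ready) {
        type_traits_init();      // fills type_traits[]
        GGUF_TYPE_SIZE_init();   // fills GGUF_TYPE_SIZE[]
        GGUF_TYPE_NAME_init();   // fills GGUF_TYPE_NAME[]
        tables_ready = true;
    }
    // ... rest of ggml_init unchanged ...
}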
@@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     // align to GGML_MEM_ALIGN
     size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
             return NULL;
         }
 
-        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+        data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
 
         ctx->scratch.offs += data_size;
     } else {
@@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
 
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
 
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
 
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
 
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
 struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
 
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
 
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
 
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
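All of the `(char*)` casts above (and the many `(float*)` casts below) address one rule: C converts `void *` to any object pointer implicitly, C++ does not. A minimal demonstration:

#include <cstdlib>

int main() {
    void * mem = std::malloc(64);

    // char * p = mem;                    // OK in C, compile error in C++
    char * p1 = (char *) mem;             // the cast style this diff adds
    char * p2 = static_cast<char *>(mem); // the more idiomatic C++ spelling
    p1[0] = 0;
    (void)p2;
    std::free(mem);
    return 0;
}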
@@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ACC;
@@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl(
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SET;
@@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d(
     };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
-    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
+    int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
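The `(int32_t)` casts above fix a different C++ rule: a braced initializer list rejects narrowing conversions (here `size_t` down to `int32_t`), whereas C performs them silently. Sketch:

#include <cstddef>
#include <cstdint>

int main() {
    size_t nb1 = 64;

    // int32_t bad[] = { nb1 };          // C++: error, narrowing in a braced list
    int32_t ok[]  = { (int32_t)nb1 };    // explicit cast, as added in ggml_acc_impl/ggml_set_impl
    (void)ok;
    return 0;
}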
@@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (ggml_is_contiguous(dst)) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
     } else {
         for (int k3 = 0; k3 < ne3; k3++) {
             for (int k2 = 0; k2 < ne2; k2++) {
@@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
 
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat(
 
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
-            char * wdata = params->wdata;
+            char * wdata = (char*)params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 #endif
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
         return;
     }
 
@@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0);
         return;
     }
 
@@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d(
         struct ggml_tensor * dst) {
 
     const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
+    enum ggml_op_pool op = (ggml_op_pool)opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
     const int p0 = opts[3];
@@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d(
     }
 
     const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
+    enum ggml_op_pool op = (ggml_op_pool)opts[0];
     const int k0 = opts[1];
     const int k1 = opts[2];
     const int s0 = opts[3];
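C++ also forbids the implicit int-to-enum conversion that C allows, which is why the pool operator read back from `op_params` now needs a cast; the same pattern recurs below for `ggml_op` in ggml_graph_print and `ggml_opt_result` in the optimizer returns. Minimal demonstration:

#include <cstdint>

enum ggml_op_pool { GGML_OP_POOL_MAX, GGML_OP_POOL_AVG, GGML_OP_POOL_COUNT };

int main() {
    int32_t raw = 1; // as stored in dst->op_params

    // enum ggml_op_pool op = raw;            // OK in C, error in C++
    enum ggml_op_pool op = (ggml_op_pool)raw; // explicit cast required in C++
    return op == GGML_OP_POOL_AVG ? 0 : 1;
}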
@@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
     result.size = size;
-    result.keys = malloc(sizeof(struct ggml_tensor *) * size);
+    result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size);
     memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
     return result;
 }
@@ -14113,9 +14136,9 @@ struct hash_map {
 };
 
 static struct hash_map * ggml_new_hash_map(size_t size) {
-    struct hash_map * result = malloc(sizeof(struct hash_map));
+    struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map));
     result->set = ggml_hash_set_new(size);
-    result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
+    result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size);
     memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
     return result;
 }
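Two C++-isms in one hunk: `malloc` returns `void *`, so the result must be cast, and C++ lets the `struct` keyword be dropped, which is why the diff can write `(hash_map*)` rather than `(struct hash_map*)`. Sketch with a hypothetical stand-in type:

#include <cstdlib>

struct hash_map_demo { int n; };  // stand-in, not the ggml struct

int main() {
    // C:   struct hash_map_demo * m = malloc(sizeof(struct hash_map_demo));
    // C++: the void* must be cast, and `struct` may be omitted:
    hash_map_demo * m = (hash_map_demo *) std::malloc(sizeof(hash_map_demo));
    m->n = 0;
    std::free(m);
    return 0;
}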
@@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+    struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads);
 
     // create thread pool
     if (n_threads > 1) {
@@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
@@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam(
     const int n_accum = MAX(1, params.n_gradient_accumulation);
     const float accum_norm = 1.0f / (float) n_accum;
 
-    float * g  = opt->adam.g->data;  // gradients
-    float * m  = opt->adam.m->data;  // first moment
-    float * v  = opt->adam.v->data;  // second moment
+    float * g  = (float*)opt->adam.g->data;  // gradients
+    float * m  = (float*)opt->adam.m->data;  // first moment
+    float * v  = (float*)opt->adam.v->data;  // second moment
 
-    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
+    float * pf = params.past > 0 ? (float *)opt->adam.pf->data : NULL; // past function values
 
     struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
@@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking(
         } else {
             // Armijo condition is satisfied
             if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
-                return count;
+                return (ggml_opt_result)count;
             }
 
             ggml_vec_dot_f32(nx, &dg, g, d);
@@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking(
             } else {
                 if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
                     // regular Wolfe conditions
-                    return count;
+                    return (ggml_opt_result)count;
                 }
 
                 if(dg > -params->lbfgs.wolfe*dginit) {
                     width = dec;
                 } else {
                     // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
-                    return count;
+                    return (ggml_opt_result)count;
                 }
             }
         }
     }
@@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
-    float * x  = opt->lbfgs.x->data;  // current parameters
-    float * xp = opt->lbfgs.xp->data; // previous parameters
-    float * g  = opt->lbfgs.g->data;  // current gradient
-    float * gp = opt->lbfgs.gp->data; // previous gradient
-    float * d  = opt->lbfgs.d->data;  // search direction
+    float * x  = (float*)opt->lbfgs.x->data;  // current parameters
+    float * xp = (float*)opt->lbfgs.xp->data; // previous parameters
+    float * g  = (float*)opt->lbfgs.g->data;  // current gradient
+    float * gp = (float*)opt->lbfgs.gp->data; // previous gradient
+    float * d  = (float*)opt->lbfgs.d->data;  // search direction
 
-    float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
+    float * pf = params.past > 0 ? (float*)opt->lbfgs.pf->data : NULL; // past function values
 
     const int n_accum = MAX(1, params.n_gradient_accumulation);
     const float accum_norm = 1.0f / (float) n_accum;
@@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     ggml_opt_get_params(np, ps, x);
 
     // the L-BFGS memory
-    float * lm_alpha = opt->lbfgs.lmal->data;
-    float * lm_ys    = opt->lbfgs.lmys->data;
-    float * lm_s     = opt->lbfgs.lms->data;
-    float * lm_y     = opt->lbfgs.lmy->data;
+    float * lm_alpha = (float*)opt->lbfgs.lmal->data;
+    float * lm_ys    = (float*)opt->lbfgs.lmys->data;
+    float * lm_s     = (float*)opt->lbfgs.lms->data;
+    float * lm_y     = (float*)opt->lbfgs.lmy->data;
 
     bool cancel = false;
 
@@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             ggml_vec_cpy_f32(nx, x, xp);
             ggml_vec_cpy_f32(nx, g, gp);
 
-            return ls;
+            return (ggml_opt_result)ls;
        }
 
         opt->loss_after = fx;
@@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK4_0;
 
     for (int b = 0; b < n; b += k) {
-        block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
+        block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0;
 
         quantize_row_q4_0_reference(src + b, y, k);
 
@@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK4_1;
 
     for (int b = 0; b < n; b += k) {
-        block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
+        block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1;
 
         quantize_row_q4_1_reference(src + b, y, k);
 
@@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK5_0;
 
     for (int b = 0; b < n; b += k) {
-        block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
+        block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0;
 
         quantize_row_q5_0_reference(src + b, y, k);
 
@@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK5_1;
 
     for (int b = 0; b < n; b += k) {
-        block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
+        block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1;
 
         quantize_row_q5_1_reference(src + b, y, k);
 
@@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     const int nb = k / QK8_0;
 
     for (int b = 0; b < n; b += k) {
-        block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
+        block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0;
 
         quantize_row_q8_0_reference(src + b, y, k);
 
@@ -17928,37 +17951,39 @@ struct gguf_str {
     char * data;
 };
 
-static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
-    [GGUF_TYPE_INT8]    = sizeof(int8_t),
-    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
-    [GGUF_TYPE_INT16]   = sizeof(int16_t),
-    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
-    [GGUF_TYPE_INT32]   = sizeof(int32_t),
-    [GGUF_TYPE_FLOAT32] = sizeof(float),
-    [GGUF_TYPE_BOOL]    = sizeof(bool),
-    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
-    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
-    [GGUF_TYPE_INT64]   = sizeof(int64_t),
-    [GGUF_TYPE_FLOAT64] = sizeof(double),
-    [GGUF_TYPE_ARRAY]   = 0, // undefined
-};
+void GGUF_TYPE_SIZE_init() {
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT8]   = sizeof(uint8_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT8]    = sizeof(int8_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT16]  = sizeof(uint16_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT16]   = sizeof(int16_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT32]  = sizeof(uint32_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT32]   = sizeof(int32_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float);
+    GGUF_TYPE_SIZE[GGUF_TYPE_BOOL]    = sizeof(bool);
+    GGUF_TYPE_SIZE[GGUF_TYPE_STRING]  = sizeof(struct gguf_str);
+    GGUF_TYPE_SIZE[GGUF_TYPE_UINT64]  = sizeof(uint64_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_INT64]   = sizeof(int64_t);
+    GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double);
+    GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY]   = 0; // undefined
+};
 static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 
-static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = "u8",
-    [GGUF_TYPE_INT8]    = "i8",
-    [GGUF_TYPE_UINT16]  = "u16",
-    [GGUF_TYPE_INT16]   = "i16",
-    [GGUF_TYPE_UINT32]  = "u32",
-    [GGUF_TYPE_INT32]   = "i32",
-    [GGUF_TYPE_FLOAT32] = "f32",
-    [GGUF_TYPE_BOOL]    = "bool",
-    [GGUF_TYPE_STRING]  = "str",
-    [GGUF_TYPE_ARRAY]   = "arr",
-    [GGUF_TYPE_UINT64]  = "u64",
-    [GGUF_TYPE_INT64]   = "i64",
-    [GGUF_TYPE_FLOAT64] = "f64",
-};
+void GGUF_TYPE_NAME_init(){
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT8]   = "u8";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT8]    = "i8";
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT16]  = "u16";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT16]   = "i16";
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT32]  = "u32";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT32]   = "i32";
+    GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32";
+    GGUF_TYPE_NAME[GGUF_TYPE_BOOL]    = "bool";
+    GGUF_TYPE_NAME[GGUF_TYPE_STRING]  = "str";
+    GGUF_TYPE_NAME[GGUF_TYPE_ARRAY]   = "arr";
+    GGUF_TYPE_NAME[GGUF_TYPE_UINT64]  = "u64";
+    GGUF_TYPE_NAME[GGUF_TYPE_INT64]   = "i64";
+    GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64";
+};
 static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 
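Same C99 array-designator issue as type_traits, with one extra consequence: the GGUF tables can no longer be `static const`, so they have to be declared mutable somewhere and are all-zero until `ggml_init()` runs. The declarations these two init functions assume would look like this (a sketch; the hunk does not show them):

static size_t       GGUF_TYPE_SIZE[GGUF_TYPE_COUNT];
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT];

The two `static_assert`s survive the conversion unchanged and still catch a count mismatch at compile time, but they can no longer catch a missing entry the way a fully initialized const table effectively did.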
@@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
 
     bool ok = true;
 
-    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
+    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1);
     ok = ok && gguf_fread_el(file, p->data, p->n, offset);
 
     return ok;
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
     // read the header
     {
@@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the kv pairs
     {
-        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
+        ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
 
         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
@@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // create the tensors
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             const int64_t ne[GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
+                (int64_t)ctx->infos[i].ne[0],
+                (int64_t)ctx->infos[i].ne[1],
+                (int64_t)ctx->infos[i].ne[2],
+                (int64_t)ctx->infos[i].ne[3],
             };
 
             struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
@@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
 
     const int n_kv = gguf_get_n_kv(ctx);
 
-    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
     ctx->kv[n_kv].key.n = strlen(key);
     ctx->kv[n_kv].key.data = strdup(key);
     ctx->header.n_kv++;
@@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -18760,7 +18785,7 @@ void gguf_add_tensor(
         struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
-    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+    ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
     ctx->infos[idx].name.n = strlen(tensor->name);
     ctx->infos[idx].name.data = strdup(tensor->name);
6	ggml.h
@@ -285,8 +285,10 @@
     GGML_UNUSED(prefix##3);
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 extern "C" {
 #endif
+#endif
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
@@ -2136,7 +2138,7 @@ extern "C" {
     // restrict not standard in C++
 #define GGML_RESTRICT
 #else
-#define GGML_RESTRICT restrict
+#define GGML_RESTRICT __restrict__
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
@@ -2157,5 +2159,7 @@ extern "C" {
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 }
 #endif
+#endif
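The CPP_ONLY guard makes the extern "C" wrapper opt-out: the intent appears to be that translation units built as C++ with -DCPP_ONLY keep C++ linkage for the declarations, while consumers that don't define the macro still get C linkage as before (the diff itself doesn't show where CPP_ONLY is defined, so this is inferred). Reduced to its skeleton:

// Skeleton of the guard, with a hypothetical declaration:
#ifdef __cplusplus
#ifndef CPP_ONLY
extern "C" {
#endif
#endif

void ggml_example(void);   // hypothetical; stands in for the API declarations

#ifdef __cplusplus
#ifndef CPP_ONLY
}
#endif
#endif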
896	llama-internal.hpp  Normal file
@@ -0,0 +1,896 @@
+#include <set>
+#include <queue>
+
+enum llm_arch {
+    LLM_ARCH_LLAMA,
+    LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
+    LLM_ARCH_GPT2,
+    LLM_ARCH_GPTJ,
+    LLM_ARCH_GPTNEOX,
+    LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
+    LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
+    LLM_ARCH_UNKNOWN,
+};
+
+enum llm_kv {
+    LLM_KV_GENERAL_ARCHITECTURE,
+    LLM_KV_GENERAL_QUANTIZATION_VERSION,
+    LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_NAME,
+    LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_URL,
+    LLM_KV_GENERAL_DESCRIPTION,
+    LLM_KV_GENERAL_LICENSE,
+    LLM_KV_GENERAL_SOURCE_URL,
+    LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+    LLM_KV_CONTEXT_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_BLOCK_COUNT,
+    LLM_KV_FEED_FORWARD_LENGTH,
+    LLM_KV_USE_PARALLEL_RESIDUAL,
+    LLM_KV_TENSOR_DATA_LAYOUT,
+
+    LLM_KV_ATTENTION_HEAD_COUNT,
+    LLM_KV_ATTENTION_HEAD_COUNT_KV,
+    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+    LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_LAYERNORM_EPS,
+    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+
+    LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_FREQ_BASE,
+    LLM_KV_ROPE_SCALE_LINEAR,
+    LLM_KV_ROPE_SCALING_TYPE,
+    LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
+    LLM_KV_ROPE_SCALING_FINETUNED,
+
+    LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_LIST,
+    LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_SCORES,
+    LLM_KV_TOKENIZER_MERGES,
+    LLM_KV_TOKENIZER_BOS_ID,
+    LLM_KV_TOKENIZER_EOS_ID,
+    LLM_KV_TOKENIZER_UNK_ID,
+    LLM_KV_TOKENIZER_SEP_ID,
+    LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_HF_JSON,
+    LLM_KV_TOKENIZER_RWKV,
+};
+
+// available llama models
+enum e_model {
+    MODEL_UNKNOWN,
+    MODEL_1B,
+    MODEL_3B,
+    MODEL_7B,
+    MODEL_8B,
+    MODEL_13B,
+    MODEL_15B,
+    MODEL_30B,
+    MODEL_34B,
+    MODEL_40B,
+    MODEL_65B,
+    MODEL_70B,
+};
+
+enum llama_fver {
+    GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
+    GGUF_FILE_VERSION_V3 = 3,
+};
+
+struct LLM_KV {
+    LLM_KV(llm_arch arch) : arch(arch) {}
+
+    llm_arch arch;
+
+    std::string operator()(llm_kv kv) const; // moved to llama.cpp file
+};
+
+enum llm_tensor {
+    LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ATTN_Q,
+    LLM_TENSOR_ATTN_K,
+    LLM_TENSOR_ATTN_V,
+    LLM_TENSOR_ATTN_QKV,
+    LLM_TENSOR_ATTN_OUT,
+    LLM_TENSOR_ATTN_NORM,
+    LLM_TENSOR_ATTN_NORM_2,
+    LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE,
+    LLM_TENSOR_FFN_DOWN,
+    LLM_TENSOR_FFN_UP,
+    LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
+};
+
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads;       // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    uint32_t n_yarn_orig_ctx;
+    // These hyperparameters are not exposed in GGUF, because all
+    // existing YaRN models use the same values for them.
+    float yarn_ext_factor;
+    float yarn_attn_factor;
+    float yarn_beta_fast;
+    float yarn_beta_slow;
+
+    bool mul_mat_q;
+};
+
+struct llama_layer {
+    // normalization
+    struct ggml_tensor * attn_norm;
+    struct ggml_tensor * attn_norm_b;
+    struct ggml_tensor * attn_norm_2;
+    struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+    struct ggml_tensor * wqkv;
+
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
+
+    // ff
+    struct ggml_tensor * ffn_gate; // w1
+    struct ggml_tensor * ffn_down; // w2
+    struct ggml_tensor * ffn_up;   // w3
+
+    // ff bias
+    struct ggml_tensor * ffn_down_b; // b2
+    struct ggml_tensor * ffn_up_b;   // b3
+};
+
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+};
+
+struct llama_buffer {
+    void * data = NULL;
+    size_t size = 0;
+
+    // fallback to malloc / free
+    // useful in cases where CUDA can try to allocate PINNED memory
+    bool fallback = false;
+
+    void resize(size_t n) ;
+
+    ~llama_buffer();
+};
+
+// ring-buffer of cached KV data
+struct llama_kv_cache {
+    bool has_shift = false;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
+
+    struct ggml_context * ctx = NULL;
+
+    llama_buffer buf;
+
+    ~llama_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
+    }
+};
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::unordered_map<token, id> special_tokens_cache;
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id = 1;
+    id special_eos_id = 2;
+    id special_unk_id = 0;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
+    id linefeed_id       = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
+
+    int find_bpe_rank(std::string token_left, std::string token_right) const {
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
+
+        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
+        if (it == bpe_ranks.end()) {
+            return -1;
+        }
+
+        return it->second;
+    }
+};
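Reading note on find_bpe_rank: bpe_ranks maps a pair of adjacent symbols to its merge priority (lower rank = merged earlier by the BPE tokenizer), and -1 means the pair never merges. A toy illustration of how a merge step would consult it (standalone sketch with made-up pairs, not the loader's code):

#include <map>
#include <string>
#include <utility>

int main() {
    // rank table as the tokenizer would populate it from the GGUF "merges" list
    std::map<std::pair<std::string, std::string>, int> bpe_ranks = {
        {{"h", "e"},   0}, // lowest rank = merged first
        {{"l", "l"},   1},
        {{"he", "ll"}, 2},
    };

    auto it = bpe_ranks.find(std::make_pair(std::string("h"), std::string("e")));
    int rank = (it == bpe_ranks.end()) ? -1 : it->second; // -1: pair never merges
    return rank == 0 ? 0 : 1;
}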
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
+    ~llama_mmap();
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+#else
+    static constexpr bool SUPPORTED = false;
+#endif
+};
+
+struct llama_hparams {
+    bool     vocab_only;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_yarn_orig_ctx;
+    int8_t   rope_scaling_type_train : 3;
+    bool     rope_finetuned : 1;
+
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
+    bool operator!=(const llama_hparams & other) const;
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+};
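For a concrete reading of the three grouped-query-attention helpers, take numbers in the LLaMA-2-70B range (illustrative values, not read from this diff): n_embd = 8192, n_head = 64, n_head_kv = 8.

#include <cstdint>

int main() {
    // illustrative GQA configuration: 8192-wide embeddings,
    // 64 query heads, 8 KV heads
    uint32_t n_embd = 8192, n_head = 64, n_head_kv = 8;

    uint32_t n_gqa       = n_head / n_head_kv; // 8: query heads sharing one KV head
    uint32_t n_embd_head = n_embd / n_head;    // 128: per-head dimension
    uint32_t n_embd_gqa  = n_embd / n_gqa;     // 1024: width of one K (or V) cache row
    return (n_gqa == 8 && n_embd_head == 128 && n_embd_gqa == 1024) ? 0 : 1;
}

n_embd_gqa() is what sizes the KV cache rows, which is why GQA models need far less cache memory than their multi-head equivalents.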
|
||||||
|
struct llama_mlock {
|
||||||
|
void * addr = NULL;
|
||||||
|
size_t size = 0;
|
||||||
|
bool failed_already = false;
|
||||||
|
llama_mlock() ;
|
||||||
|
|
||||||
|
llama_mlock(const llama_mlock &) = delete;
|
||||||
|
~llama_mlock();
|
||||||
|
void init(void * ptr);
|
||||||
|
void grow_to(size_t target_size);
|
||||||
|
#ifdef _POSIX_MEMLOCK_RANGE
|
||||||
|
static constexpr bool SUPPORTED = true;
|
||||||
|
static size_t lock_granularity();
|
||||||
|
#ifdef __APPLE__
|
||||||
|
#define MLOCK_SUGGESTION \
|
||||||
|
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||||
|
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||||
|
#else
|
||||||
|
#define MLOCK_SUGGESTION \
|
||||||
|
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||||
|
#endif
|
||||||
|
bool raw_lock(const void * addr, size_t size) const ;
|
||||||
|
#undef MLOCK_SUGGESTION
|
||||||
|
static void raw_unlock(void * addr, size_t size);
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
static constexpr bool SUPPORTED = true;
|
||||||
|
static size_t lock_granularity();
|
||||||
|
bool raw_lock(void * ptr, size_t len) const ;
|
||||||
|
static void raw_unlock(void * ptr, size_t len);
|
||||||
|
#else
|
||||||
|
static constexpr bool SUPPORTED = false;
|
||||||
|
static size_t lock_granularity();
|
||||||
|
bool raw_lock(const void * addr, size_t len) const;
|
||||||
|
static void raw_unlock(const void * addr, size_t len);
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct llama_model {
|
||||||
|
e_model type = MODEL_UNKNOWN;
|
||||||
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||||
|
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||||
|
|
||||||
|
std::string name = "n/a";
|
||||||
|
|
||||||
|
llama_hparams hparams = {};
|
||||||
|
llama_vocab vocab;
|
||||||
|
|
||||||
|
struct ggml_tensor * tok_embd;
|
||||||
|
struct ggml_tensor * pos_embd;
|
||||||
|
struct ggml_tensor * tok_norm;
|
||||||
|
struct ggml_tensor * tok_norm_b;
|
||||||
|
|
||||||
|
struct ggml_tensor * output_norm;
|
||||||
|
struct ggml_tensor * output_norm_b;
|
||||||
|
struct ggml_tensor * output;
|
||||||
|
|
||||||
|
std::vector<llama_layer> layers;
|
||||||
|
|
||||||
|
int n_gpu_layers;
|
||||||
|
|
||||||
|
// gguf metadata
|
||||||
|
std::unordered_map<std::string, std::string> gguf_kv;
|
||||||
|
|
||||||
|
// context
|
||||||
|
struct ggml_context * ctx = NULL;
|
||||||
|
|
||||||
|
// the model memory buffer
|
||||||
|
llama_buffer buf;
|
||||||
|
|
||||||
|
// model memory mapped file
|
||||||
|
std::unique_ptr<llama_mmap> mapping;
|
||||||
|
|
||||||
|
// objects representing data potentially being locked in memory
|
||||||
|
llama_mlock mlock_buf;
|
||||||
|
llama_mlock mlock_mmap;
|
||||||
|
|
||||||
|
// for quantize-stats only
|
||||||
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||||
|
|
||||||
|
int64_t t_load_us = 0;
|
||||||
|
int64_t t_start_us = 0;
|
||||||
|
|
||||||
|
~llama_model() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
if (ggml_cublas_loaded()) {
|
||||||
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||||
|
ggml_cuda_free_data(tensors_by_name[i].second);
|
||||||
|
}
|
||||||
|
ggml_cuda_free_scratch();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(GGML_USE_CLBLAST)
|
||||||
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||||
|
ggml_cl_free_data(tensors_by_name[i].second);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_context {
|
||||||
|
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
|
||||||
|
~llama_context();
|
||||||
|
|
||||||
|
llama_cparams cparams;
|
||||||
|
|
||||||
|
const llama_model & model;
|
||||||
|
|
||||||
|
// key + value cache for the self attention
|
||||||
|
struct llama_kv_cache kv_self;
|
||||||
|
|
||||||
|
std::mt19937 rng;
|
||||||
|
|
||||||
|
bool has_evaluated_once = false;
|
||||||
|
|
||||||
|
int64_t t_start_us;
|
||||||
|
int64_t t_load_us;
|
||||||
|
int64_t t_sample_us = 0;
|
||||||
|
int64_t t_p_eval_us = 0;
|
||||||
|
int64_t t_eval_us = 0;
|
||||||
|
|
||||||
|
int32_t n_sample = 0; // number of tokens sampled
|
||||||
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||||
|
int32_t n_eval = 0; // number of eval calls
|
||||||
|
|
||||||
|
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||||
|
std::vector<float> logits;
|
||||||
|
bool logits_all = false;
|
||||||
|
|
||||||
|
// input embedding (1-dimensional array: [n_embd])
|
||||||
|
std::vector<float> embedding;
|
||||||
|
|
||||||
|
// reusable buffer for `struct ggml_graph_plan.work_data`
|
||||||
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
|
// memory buffers used to evaluate the model
|
||||||
|
llama_buffer buf_compute;
|
||||||
|
|
||||||
|
llama_buffer buf_alloc;
|
||||||
|
ggml_allocr * alloc = NULL;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
ggml_metal_context * ctx_metal = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
ggml_mpi_context * ctx_mpi = NULL;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct LLM_TN {
|
||||||
|
LLM_TN(llm_arch arch) ;
|
||||||
|
|
||||||
|
llm_arch arch;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor) const;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, int bid) const ;
|
||||||
|
|
||||||
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct llama_file {
|
||||||
|
// use FILE * so we don't have to re-open the file to mmap
|
||||||
|
FILE * fp;
|
||||||
|
size_t size;
|
||||||
|
|
||||||
|
llama_file(const char * fname, const char * mode) ;
|
||||||
|
size_t tell() const;
|
||||||
|
void seek(size_t offset, int whence) const;
|
||||||
|
void read_raw(void * ptr, size_t len) const;
|
||||||
|
uint32_t read_u32() const;
|
||||||
|
void write_raw(const void * ptr, size_t len) const ;
|
||||||
|
void write_u32(std::uint32_t val) const;
|
||||||
|
~llama_file();
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct llama_state {
|
||||||
|
llama_state();
|
||||||
|
// We save the log callback globally
|
||||||
|
ggml_log_callback log_callback;
|
||||||
|
void * log_callback_user_data = nullptr;
|
||||||
|
};

struct llama_model_loader {
    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    int64_t n_elements = 0;
    size_t  n_bytes    = 0;

    bool use_mmap = false;

    llama_file  file;
    llama_ftype ftype;
    llama_fver  fver;

    std::unique_ptr<llama_mmap> mapping;

    struct gguf_context * ctx_gguf = NULL;
    struct ggml_context * ctx_meta = NULL;

    llama_model_loader(const std::string & fname, bool use_mmap);
    ~llama_model_loader();

    std::string get_arch_name() const;
    enum llm_arch get_arch() const;
    const char * get_tensor_name(int i) const;
    struct ggml_tensor * get_tensor_meta(int i) const;

    void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;

    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend);
    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend);

    void done_getting_tensors() const;

    size_t file_offset(const char * name) const;

    void load_data_for(struct ggml_tensor * cur) const;
    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock);
};

struct llama_data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_context() = default;
};

struct llama_data_buffer_context : llama_data_context {
    uint8_t * ptr;
    size_t size_written = 0;

    llama_data_buffer_context(uint8_t * p);

    void write(const void * src, size_t size) override;
    size_t get_size_written() override;
};

struct llama_data_file_context : llama_data_context {
    llama_file * file;
    size_t size_written = 0;

    llama_data_file_context(llama_file * f);

    size_t get_size_written() override;
    void write(const void * src, size_t size) override;
};
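
A plausible sketch of the two writers, consistent with the declarations above: the buffer variant memcpy's into a caller-sized region, the file variant forwards to llama_file::write_raw, and both track the byte count so callers can size or verify the serialized state:

    void llama_data_buffer_context::write(const void * src, size_t size) {
        memcpy(ptr, src, size); // caller guarantees capacity
        ptr          += size;
        size_written += size;
    }

    void llama_data_file_context::write(const void * src, size_t size) {
        file->write_raw(src, size);
        size_written += size;
    }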

struct llama_beam {
    std::vector<llama_token> tokens;
    float p;  // cumulative beam probability (renormalized relative to all beams)
    bool eob; // initialized to false; the beam-search callback sets this to true

    // sort beams by probability; in case of ties, prefer beams at eob
    bool operator<(const llama_beam & rhs) const;

    void shift_tokens(const size_t n);

    llama_beam_view view() const;
};
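
The ordering described above can be expressed as a lexicographic comparison on (p, eob), e.g.:

    bool llama_beam::operator<(const llama_beam & rhs) const {
        // lower probability sorts first; for equal p, eob breaks the tie
        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
    }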

// a struct for calculating logit-related info
struct llama_logit_info {
    const float * const logits;
    const int   n_vocab;
    const float max_l;
    const float normalizer;

    struct sum_exp {
        float max_l;
        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
    };

    llama_logit_info(llama_context * ctx);

    llama_token_data get_token_data(const llama_token token_id) const;
    std::vector<llama_token_data> top_k(size_t k);
    float probability_from_logit(float logit) const;
};
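
Given max_l (the maximum logit) and normalizer (the sum of shifted exponentials accumulated with sum_exp), probability_from_logit is an ordinary numerically stable softmax; a sketch:

    // p(token) = exp(logit - max_l) / sum_i exp(logit_i - max_l)
    float llama_logit_info::probability_from_logit(float logit) const {
        return std::exp(logit - max_l) / normalizer;
    }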

struct llama_beam_search_data {
    llama_context * ctx;
    size_t n_beams;
    int n_past;
    int n_predict;
    std::vector<llama_beam> beams;
    std::vector<llama_beam> next_beams;
    size_t common_prefix_length;
    std::vector<llama_beam_view> beam_views;

    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);

    void collapse_beams(const size_t beam_idx);
    void fill_next_beams_by_top_probabilities(llama_beam & beam);
    size_t find_common_prefix_length();
    llama_beams_state get_beams_state(const bool last_call);
    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams);
    size_t top_beam_index();
    void update_beams_from_beam_views();
};
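
renormalize_beam_probabilities keeps the cumulative probabilities comparable across beams; a sketch (requires <numeric>):

    void llama_beam_search_data::renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
        const float sum = std::accumulate(beams.begin(), beams.end(), 0.0f,
            [](float s, const llama_beam & b) { return s + b.p; });
        for (llama_beam & b : beams) {
            b.p /= sum; // beam probabilities now sum to 1
        }
    }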

using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

enum llm_rope_type {
    LLM_ROPE,
    LLM_ROPE_NEOX,
    LLM_ROPE_GLM,
};

enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
};

enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};

enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
};
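
In pseudo-graph terms, the op/gate enums control how a feed-forward block is wired; mul and act below are placeholders standing in for the corresponding ggml calls, not real API:

    // LLM_FFN_PAR: gate runs parallel to up, outputs are multiplied elementwise
    //   cur = act(mul(ffn_gate, x)) * mul(ffn_up, x);
    // LLM_FFN_SEQ: gate output is fed into up
    //   cur = mul(ffn_up, act(mul(ffn_gate, x)));
    // either way, the result is projected back down:
    //   cur = mul(ffn_down, cur);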

struct llm_build_context {
    const llama_model    & model;
    const llama_hparams  & hparams;
    const llama_cparams  & cparams;
    const llama_batch    & batch;
    const llama_kv_cache & kv_self;

    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head;
    const int64_t n_embd_gqa;

    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int32_t n_tokens;
    const int32_t n_kv;    // size of KV cache to consider (n_kv <= n_ctx)
    const int32_t kv_head; // index of where we store new KV data in the cache
    const int32_t n_orig_ctx;

    const bool do_rope_shift;

    const llm_build_cb & cb;

    llama_buffer & buf_compute;

    struct ggml_context * ctx0 = nullptr;

    // TODO: consider making the entire interface noexcept
    llm_build_context(
        llama_context     & lctx,
        const llama_batch & batch,
        const llm_build_cb & cb,
        bool worst_case);

    void init();
    void free();

    struct ggml_cgraph * build_llama();
    struct ggml_cgraph * build_baichuan();
    struct ggml_cgraph * build_falcon();
    struct ggml_cgraph * build_starcoder();
    struct ggml_cgraph * build_persimmon();
    struct ggml_cgraph * build_refact();
    struct ggml_cgraph * build_bloom();
    struct ggml_cgraph * build_mpt();
    struct ggml_cgraph * build_stablelm();
};

enum llm_offload_func_e {
    OFFLOAD_FUNC_NOP,
    OFFLOAD_FUNC,
    OFFLOAD_FUNC_KQ,
    OFFLOAD_FUNC_V,
    OFFLOAD_FUNC_NR,
    OFFLOAD_FUNC_EMB,
    OFFLOAD_FUNC_OUT,
};

struct llm_offload_trie {
    struct node {
        ~node();

        node * children[256] = { nullptr };
        llm_offload_func_e func = OFFLOAD_FUNC_NOP;
    };

    node * root = nullptr;

    llm_offload_trie();
    llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map);
    ~llm_offload_trie();

    void add(const char * name, llm_offload_func_e func);
    llm_offload_func_e find(const char * name) const;
};
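
The trie maps tensor names to offload functions one byte at a time; add/find might look like this (a sketch consistent with the declaration above):

    void llm_offload_trie::add(const char * name, llm_offload_func_e func) {
        node * cur = root;
        for (const unsigned char * p = (const unsigned char *) name; *p; ++p) {
            if (cur->children[*p] == nullptr) {
                cur->children[*p] = new node;
            }
            cur = cur->children[*p];
        }
        cur->func = func;
    }

    llm_offload_func_e llm_offload_trie::find(const char * name) const {
        const node * cur = root;
        for (const unsigned char * p = (const unsigned char *) name; *p; ++p) {
            cur = cur->children[*p];
            if (cur == nullptr) {
                return OFFLOAD_FUNC_NOP; // unknown tensor name: no offload
            }
        }
        return cur->func;
    }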

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
    };

    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;

    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};

struct llm_tokenizer_spm {
    llm_tokenizer_spm(const llama_vocab & vocab);

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);

private:
    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output);
    void try_add_bigram(int left, int right);

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;

    std::map<std::string, std::pair<int, int>> rev_merge;
};
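
At its core, tokenize() is a greedy merge loop over the priority queue: seed every adjacent symbol pair, then repeatedly apply the best-scoring bigram whose halves are still intact. A condensed sketch of that loop:

    while (!work_queue.empty()) {
        llm_bigram_spm bigram = work_queue.top();
        work_queue.pop();

        llm_symbol & left  = symbols[bigram.left];
        llm_symbol & right = symbols[bigram.right];

        // skip bigrams whose symbols were already merged away
        if (left.n == 0 || right.n == 0 || left.n + right.n != bigram.size) {
            continue;
        }

        // merge right into left and unlink right from the symbol list
        left.n += right.n;
        right.n = 0;
        left.next = right.next;
        if (right.next >= 0) {
            symbols[right.next].prev = bigram.left;
        }

        // the merged symbol forms new candidate bigrams with its neighbors
        try_add_bigram(left.prev, bigram.left);
        try_add_bigram(bigram.left, left.next);
    }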

// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// the unicode handling was simplified, so it most likely does not work 100% correctly!

// TODO: there are a lot of common parts between the spm and bpe tokenizers; they should be refactored and reused

struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const;
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;

    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
    int rank;
    size_t size;
};

struct llm_tokenizer_bpe {
    llm_tokenizer_bpe(const llama_vocab & vocab);

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);

private:
    void add_new_bigram(int left, int right);

    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text);

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;

    llm_bigram_bpe::queue work_queue;
};

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

struct fragment_buffer_variant {
    fragment_buffer_variant(llama_vocab::id _token);
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_vocab::id token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
};

struct llama_partial_utf8 {
    uint32_t value;    // bit value so far (unshifted)
    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
};
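
value and n_remain let grammar matching resume mid-codepoint when a token boundary falls inside a multi-byte UTF-8 sequence. One way to picture the state transitions (utf8_step is a hypothetical helper; decode_utf8 in llama.cpp follows this shape):

    static llama_partial_utf8 utf8_step(llama_partial_utf8 st, uint8_t byte) {
        if (st.n_remain > 0) {
            if ((byte & 0xC0) != 0x80) return { 0, -1 };       // invalid continuation byte
            return { (st.value << 6) | (byte & 0x3Fu), st.n_remain - 1 };
        }
        if ((byte & 0x80) == 0x00) return { byte, 0 };         // ASCII, complete
        if ((byte & 0xE0) == 0xC0) return { byte & 0x1Fu, 1 }; // 2-byte lead
        if ((byte & 0xF0) == 0xE0) return { byte & 0x0Fu, 2 }; // 3-byte lead
        if ((byte & 0xF8) == 0xF0) return { byte & 0x07u, 3 }; // 4-byte lead
        return { 0, -1 };                                      // invalid lead byte
    }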

struct llama_grammar {
    const std::vector<std::vector<llama_grammar_element>>   rules;
    std::vector<std::vector<const llama_grammar_element *>> stacks;

    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8 partial_utf8;
};

struct llama_grammar_candidate {
    size_t index;
    const uint32_t * code_points;
    llama_partial_utf8 partial_utf8;
};

struct quantize_state_internal {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    int i_attention_wv    = 0;
    int i_feed_forward_w2 = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;

    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
        {}
};
8
llama.h
@@ -50,7 +50,9 @@
 #endif
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 extern "C" {
+#endif
 #endif
 
 //
@@ -827,8 +829,10 @@ extern "C" {
 LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
 #ifdef __cplusplus
+#ifndef CPP_ONLY
 }
 #endif
+#endif
 
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
@@ -844,4 +848,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 
 #endif // LLAMA_API_INTERNAL
+
+
+
+
 
 #endif // LLAMA_H
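
The new CPP_ONLY guard lets a C++ translation unit opt out of the extern "C" wrapper so the declarations keep C++ linkage, which the reflection metadata in print.hpp below appears to rely on. Presumably consumed as:

    // hypothetical consumer: define CPP_ONLY before the include
    #define CPP_ONLY
    #include "llama.h"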
756
print.hpp
Normal file

@@ -0,0 +1,756 @@
#include <iostream>
#include "refl.hpp" // refl-cpp, assumed source of REFL_TYPE/REFL_FIELD/REFL_END if not already included transitively
#include "llama.h"
#include "ggml-internal.hpp"
#include "llama-internal.hpp"

REFL_TYPE(ggml_init_params)
REFL_END

REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END

REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END

REFL_TYPE(ggml_opt_context::ggml_grad)
REFL_END

REFL_TYPE(gpt_params)
REFL_FIELD(seed)
REFL_FIELD(n_threads)
REFL_FIELD(n_threads_batch)
REFL_FIELD(n_predict)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_FIELD(n_keep)
REFL_FIELD(n_draft)
REFL_FIELD(n_chunks)
REFL_FIELD(n_parallel)
REFL_FIELD(n_sequences)
REFL_FIELD(p_accept)
REFL_FIELD(p_split)
REFL_FIELD(n_gpu_layers)
REFL_FIELD(n_gpu_layers_draft)
REFL_FIELD(main_gpu)
REFL_FIELD(tensor_split)
REFL_FIELD(n_beams)
REFL_FIELD(rope_freq_base)
REFL_FIELD(rope_freq_scale)
REFL_FIELD(yarn_ext_factor)
REFL_FIELD(yarn_attn_factor)
REFL_FIELD(yarn_beta_fast)
REFL_FIELD(yarn_beta_slow)
REFL_FIELD(yarn_orig_ctx)
REFL_FIELD(rope_scaling_type)
REFL_FIELD(sparams)
REFL_FIELD(model)
REFL_FIELD(model_draft)
REFL_FIELD(model_alias)
REFL_FIELD(prompt)
REFL_FIELD(prompt_file)
REFL_FIELD(path_prompt_cache)
REFL_FIELD(input_prefix)
REFL_FIELD(input_suffix)
REFL_FIELD(antiprompt)
REFL_FIELD(logdir)
REFL_FIELD(lora_adapter)
REFL_FIELD(lora_base)
REFL_FIELD(ppl_stride)
REFL_FIELD(ppl_output_type)
REFL_FIELD(hellaswag)
REFL_FIELD(hellaswag_tasks)
REFL_FIELD(mul_mat_q)
REFL_FIELD(memory_f16)
REFL_FIELD(random_prompt)
REFL_FIELD(use_color)
REFL_FIELD(interactive)
REFL_FIELD(chatml)
REFL_FIELD(prompt_cache_all)
REFL_FIELD(prompt_cache_ro)
REFL_FIELD(embedding)
REFL_FIELD(escape)
REFL_FIELD(interactive_first)
REFL_FIELD(multiline_input)
REFL_FIELD(simple_io)
REFL_FIELD(cont_batching)
REFL_FIELD(input_prefix_bos)
REFL_FIELD(ignore_eos)
REFL_FIELD(instruct)
REFL_FIELD(logits_all)
REFL_FIELD(use_mmap)
REFL_FIELD(use_mlock)
REFL_FIELD(numa)
REFL_FIELD(verbose_prompt)
REFL_FIELD(infill)
REFL_FIELD(mmproj)
REFL_FIELD(image)
REFL_END

REFL_TYPE(llama_sampling_params)
REFL_END

REFL_TYPE(llm_arch)
REFL_END

REFL_TYPE(llama_sampling_context)
REFL_FIELD(params)
REFL_FIELD(mirostat_mu)
REFL_FIELD(grammar)
REFL_FIELD(parsed_grammar)
REFL_FIELD(prev)
REFL_FIELD(cur)
REFL_END

REFL_TYPE(llama_token_data)
REFL_END

REFL_TYPE(llama_token_data_array)
REFL_END

REFL_TYPE(llama_batch)
REFL_END

REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END

REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END

REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END

REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END

REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END

REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END

REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END

REFL_TYPE(gguf_init_params)
REFL_END

REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END

REFL_TYPE(ggml_context)
REFL_FIELD(mem_size)
REFL_FIELD(mem_buffer)
REFL_FIELD(mem_buffer_owned)
REFL_FIELD(no_alloc)
REFL_FIELD(no_alloc_save)
REFL_FIELD(n_objects)
REFL_FIELD(objects_begin)
REFL_FIELD(objects_end)
REFL_FIELD(scratch)
REFL_FIELD(scratch_save)
REFL_END

REFL_TYPE(ggml_context_container)
REFL_FIELD(used)
REFL_FIELD(context)
REFL_END

REFL_TYPE(ggml_numa_node)
REFL_FIELD(cpus)
REFL_FIELD(n_cpus)
REFL_END

REFL_TYPE(ggml_numa_nodes)
REFL_FIELD(nodes)
REFL_FIELD(n_nodes)
REFL_END

REFL_TYPE(ggml_state)
REFL_FIELD(contexts)
REFL_FIELD(numa)
REFL_END

REFL_TYPE(gguf_str)
REFL_FIELD(n)
REFL_FIELD(data)
REFL_END

REFL_TYPE(ggml_map_custom1_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(ggml_map_custom2_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(ggml_map_custom3_op_params)
REFL_FIELD(fun)
REFL_FIELD(n_tasks)
REFL_END

REFL_TYPE(hash_map)
REFL_FIELD(set)
REFL_FIELD(vals)
REFL_END

REFL_TYPE(ggml_compute_state_shared)
REFL_FIELD(cgraph)
REFL_FIELD(cplan)
REFL_END

REFL_TYPE(ggml_compute_state)
REFL_FIELD(thrd)
REFL_FIELD(ith)
REFL_END

REFL_TYPE(ggml_lbfgs_iteration_data)
REFL_FIELD(alpha)
REFL_FIELD(ys)
REFL_END

REFL_TYPE(gguf_kv)
REFL_FIELD(key)
REFL_FIELD(type)
REFL_END

REFL_TYPE(gguf_header)
REFL_FIELD(magic)
REFL_FIELD(version)
REFL_END

REFL_TYPE(gguf_tensor_info)
REFL_FIELD(name)
REFL_FIELD(n_dims)
REFL_END

REFL_TYPE(gguf_context)
REFL_FIELD(header)
REFL_FIELD(kv)
REFL_END

REFL_TYPE(gguf_buf)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END

REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END

REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END

REFL_TYPE(llama_grammar_element)
REFL_END

REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END

REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END

REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END

REFL_TYPE(ggml_backend)
REFL_END

REFL_TYPE(ggml_backend_buffer)
REFL_END

REFL_TYPE(ggml_allocr)
REFL_END

REFL_TYPE(ggml_tallocr)
REFL_END

REFL_TYPE(ggml_gallocr)
REFL_END

REFL_TYPE(llama_buffer)
REFL_FIELD(data)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_file)
REFL_FIELD(fp)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_mmap)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_mlock)
REFL_FIELD(addr)
REFL_FIELD(size)
REFL_END

REFL_TYPE(llama_state)
REFL_FIELD(log_callback)
REFL_FIELD(log_callback_user_data)
REFL_END

REFL_TYPE(llama_hparams)
REFL_FIELD(vocab_only)
REFL_FIELD(n_vocab)
REFL_END

REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END

REFL_TYPE(llama_layer)
REFL_FIELD(attn_norm)
REFL_FIELD(attn_norm_b)
REFL_END

REFL_TYPE(llama_kv_cell)
REFL_FIELD(pos)
REFL_FIELD(delta)
REFL_END

REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END

REFL_TYPE(e_model)
REFL_END

REFL_TYPE(llama_ftype)
REFL_END

REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)
REFL_FIELD(ftype)

REFL_FIELD(name)

REFL_FIELD(hparams)
REFL_FIELD(vocab)

REFL_FIELD(tok_embd)
REFL_FIELD(pos_embd)
REFL_FIELD(tok_norm)
REFL_FIELD(tok_norm_b)

REFL_FIELD(output_norm)
REFL_FIELD(output_norm_b)
REFL_FIELD(output)

REFL_FIELD(layers)

REFL_FIELD(n_gpu_layers)

REFL_FIELD(gguf_kv)  // unordered map
REFL_FIELD(ctx)
REFL_FIELD(buf)
REFL_FIELD(mapping)  // std::unique_ptr
REFL_FIELD(mlock_buf)
REFL_FIELD(mlock_mmap)
REFL_FIELD(tensors_by_name)
REFL_FIELD(t_load_us)
REFL_FIELD(t_start_us)
REFL_END

REFL_TYPE(llama_vocab)
REFL_END

REFL_TYPE(grammar_parser::parse_state)
REFL_END

REFL_TYPE(llama_context)
REFL_FIELD(cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng) // random numbers
REFL_FIELD(has_evaluated_once)
REFL_FIELD(t_start_us)
REFL_FIELD(t_load_us)
REFL_FIELD(t_sample_us)
REFL_FIELD(t_p_eval_us)
REFL_FIELD(t_eval_us)
REFL_FIELD(n_sample)
REFL_FIELD(n_p_eval)
REFL_FIELD(n_eval)
REFL_FIELD(logits)
REFL_FIELD(logits_all)
REFL_FIELD(embedding)
REFL_FIELD(work_buffer)
REFL_FIELD(buf_compute)
REFL_FIELD(buf_alloc)
REFL_FIELD(alloc)

#ifdef GGML_USE_METAL
REFL_FIELD(ctx_metal)
#endif

#ifdef GGML_USE_MPI
REFL_FIELD(ctx_mpi)
#endif
REFL_END

REFL_TYPE(llama_model_loader)
REFL_FIELD(n_kv)
REFL_FIELD(n_tensors)
REFL_END

REFL_TYPE(llm_build_context)
// REFL_FIELD(model)   cannot create pointer to reference member ‘llm_build_context::model’
// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’
REFL_END

REFL_TYPE(llm_offload_trie)
REFL_END

REFL_TYPE(llm_symbol)
REFL_FIELD(prev)
REFL_END

REFL_TYPE(llm_bigram_spm)
REFL_END

REFL_TYPE(llm_tokenizer_spm)
REFL_END

REFL_TYPE(llm_bigram_bpe)
REFL_END

REFL_TYPE(llm_tokenizer_bpe)
REFL_END

REFL_TYPE(fragment_buffer_variant)
REFL_END

REFL_TYPE(llama_partial_utf8)
REFL_FIELD(value)
REFL_FIELD(n_remain)
REFL_END

REFL_TYPE(llama_grammar)
REFL_FIELD(rules)
REFL_FIELD(stacks)
REFL_END

REFL_TYPE(llama_grammar_candidate)
REFL_FIELD(index)
REFL_FIELD(code_points)
REFL_END

REFL_TYPE(llama_beam)
REFL_FIELD(tokens)
REFL_FIELD(p)
REFL_END

REFL_TYPE(llama_logit_info)
REFL_FIELD(logits)
REFL_FIELD(n_vocab)
REFL_END

REFL_TYPE(llama_beam_search_data)
REFL_FIELD(ctx)
REFL_FIELD(n_beams)
REFL_END

REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
REFL_FIELD(params)
REFL_FIELD(n_attention_wv)
REFL_FIELD(n_feed_forward_w2)
REFL_FIELD(i_attention_wv)
REFL_FIELD(i_feed_forward_w2)
REFL_FIELD(n_k_quantized)
REFL_FIELD(n_fallback)
REFL_END

REFL_TYPE(llama_data_context)
REFL_END

REFL_TYPE(llama_data_buffer_context)
REFL_FIELD(ptr)
REFL_END

REFL_TYPE(llama_data_file_context)
REFL_FIELD(file)
REFL_END

template <typename T>
constexpr auto get_value_type_name(const T t) noexcept
{
    return t.value_type;
}

namespace runtime2
{
    using namespace refl;
    using namespace refl::descriptor;

    template <typename CharT, typename T>
    void debug(std::basic_ostream<CharT>& os, const T& value, bool compact = false);

    namespace detail
    {
        template <typename CharT, typename T, typename = decltype(std::declval<std::basic_ostream<CharT>&>() << std::declval<T>())>
        std::true_type is_ostream_printable_test(int);

        template <typename CharT, typename T>
        std::false_type is_ostream_printable_test(...);

        template <typename CharT, typename T>
        constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test<CharT, T>(0))::value };

        namespace
        {
            [[maybe_unused]] int next_depth(int depth)
            {
                return depth == -1 || depth > 8
                    ? -1
                    : depth + 1;
            }
        }

        template <typename CharT>
        void indent(std::basic_ostream<CharT>& os, int depth)
        {
            for (int i = 0; i < depth; i++) {
                os << "    ";
            }
        }

        template <typename CharT, typename T>
        void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth);

        template <typename CharT, typename T>
        void debug_detailed(std::basic_ostream<CharT>& os, const T& value, int depth)
        {
            using type_descriptor = type_descriptor<T>;
            bool compact = depth == -1;
            // print type with members enclosed in braces
            os << type_descriptor::name << " { ";
            if (!compact) os << '\n';

            constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); });
            for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) {
                int new_depth = next_depth(depth);

                indent(os, new_depth);
                os << get_display_name(member) << " = ";

                if constexpr (util::contains_instance<attr::debug>(member.attributes)) {
                    // use the debug attribute to print
                    auto debug_attr = util::get_instance<attr::debug>(member.attributes);
                    debug_attr.write(os, value);
                }
                else {
                    debug_impl(os, member(value), new_depth);
                }

                if (!compact || index + 1 != readable_members.size) {
                    os << ", ";
                }
                if (!compact) {
                    indent(os, depth);
                    os << '\n';
                }
            });

            if (compact) os << ' ';
            indent(os, depth);
            os << '}';
        }

        template <typename CharT, typename T>
        void debug_reflectable(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
        {
            using type_descriptor = type_descriptor<T>;
            if constexpr (trait::contains_instance_v<attr::debug, typename type_descriptor::attribute_types>) {
                // use the debug attribute to print
                auto debug_attr = util::get_instance<attr::debug>(type_descriptor::attributes);
                debug_attr.write(os, value);
            }
            else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
                // type supports printing natively, just use that
                os << value;
            }
            else {
                debug_detailed(os, value, depth);
            }
        }

        template <typename CharT, typename T>
        void debug_container(std::basic_ostream<CharT>& os, const T& value, int depth)
        {
            bool compact = depth == -1;
            os << "[";

            auto end = value.end();
            for (auto it = value.begin(); it != end; ++it)
            {
                if (!compact) os << '\n';
                int new_depth = next_depth(depth);
                indent(os, new_depth);

                debug_impl(os, *it, new_depth);
                if (std::next(it, 1) != end) {
                    os << ", ";
                }
                else if (!compact) {
                    os << '\n';
                }
            }

            indent(os, depth);
            os << "]";
        }

        template <typename CharT, typename T>
        void debug_impl(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] int depth)
        {
            using no_pointer_t = std::remove_pointer_t<T>;

            if constexpr (std::is_same_v<bool, T>) {
                os << (value ? "true" : "false");
            }
            else if constexpr (std::is_pointer_v<T> && !std::is_void_v<no_pointer_t> && trait::is_reflectable_v<no_pointer_t>) {
                if (value == nullptr) {
                    os << "nullptr";
                }
                else {
                    os << '&';
                    debug_impl(os, *value, -1);
                }
            }
            else if constexpr (trait::is_reflectable_v<T>) {
                debug_reflectable(os, value, depth);
            }
            else if constexpr (detail::is_ostream_printable_v<CharT, T>) {
                os << value;
            }
            else if constexpr (trait::is_container_v<T>) {
                debug_container(os, value, depth);
            }
            else {
                os << "(not printable)";
            }
        }
    }

    /**
     * Writes the debug representation of value to the given std::ostream.
     * Calls the function specified by the debug<F> attribute whenever possible,
     * before falling back to recursively iterating the members and printing them.
     * Takes an optional argument specifying whether to print a compact representation.
     * The compact representation contains no newlines.
     */
    template <typename CharT, typename T>
    void debug(std::basic_ostream<CharT>& os, const T& value, [[maybe_unused]] bool compact)
    {
        static_assert(trait::is_reflectable_v<T> || trait::is_container_v<T> || detail::is_ostream_printable_v<CharT, T>,
            "Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!");

        detail::debug_impl(os, value, compact ? -1 : 0);
    }

    /**
     * Writes the compact debug representation of the provided values to the given std::ostream.
     */
    template <typename CharT, typename... Ts>
    void debug_all(std::basic_ostream<CharT>& os, const Ts&... values)
    {
        refl::runtime::debug(os, std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
    }

    /**
     * Writes the debug representation of the provided value to an std::string and returns it.
     * Takes an optional argument specifying whether to print a compact representation.
     * The compact representation contains no newlines.
     */
    template <typename CharT = char, typename T>
    std::basic_string<CharT> debug_str(const T& value, bool compact = false)
    {
        std::basic_stringstream<CharT> ss;
        debug(ss, value, compact);
        return ss.str();
    }

    /**
     * Writes the compact debug representation of the provided values to an std::string and returns it.
     */
    template <typename CharT = char, typename... Ts>
    std::basic_string<CharT> debug_all_str(const Ts&... values)
    {
        return refl::runtime::debug_str(std::forward_as_tuple(static_cast<const Ts&>(values)...), true);
    }
}

// a generic function to print out the fields of any reflected object
template<typename T>
void print_fields(const T& t) {
    runtime2::debug(std::cout, t);

    constexpr auto type       = refl::reflect<T>();
    constexpr auto membertype = refl::member_list<T>();
    constexpr auto members    = get_members(type);

    std::cout << "DEBUG Type: "  << type.name.c_str()           << "\n";
    std::cout << "DEBUG Type2: " << typeid(membertype).name()   << "\n";
    std::cout << "DEBUG Type3: " << typeid(members).name()      << "\n";

    refl::util::for_each(members, [&](auto member) {
        std::cout << "Auto:" << member.name << "\n";
    });
    std::cout << "\n";
}
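
With the metadata above registered, any listed type can be dumped generically. A minimal usage sketch (assuming the usual gpt_params defaults from common.h):

    #include "print.hpp"

    int main() {
        gpt_params params;
        print_fields(params);   // prints the type name, then each REFL_FIELD member

        llama_partial_utf8 st = { 0, 0 };
        print_fields(st);
        return 0;
    }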
@@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW
 llama_build_and_test_executable(test-rope.cpp)
 
 # dummy executable - not installed
-get_filename_component(TEST_TARGET test-c.c NAME_WE)
-add_executable(${TEST_TARGET} test-c.c)
+get_filename_component(TEST_TARGET test-c.cpp NAME_WE)
+add_executable(${TEST_TARGET} test-c.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)