remove old filever checks (+7 squashed commit)
Squashed commit: [b72627a] new format not working [e568870] old ver works [7053b77] compile errors fixed, fixing linkers [4ae8889] add new ver [ff82dfd] file format checks [25b8aa8] refactoring type names [931063b] still merging
This commit is contained in:
parent
417302b226
commit
c048bcfec4
25 changed files with 3180 additions and 506 deletions
|
@ -285,6 +285,14 @@ target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
|
|||
target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
add_library(ggml_v2 OBJECT
|
||||
otherarch/ggml_v2.c
|
||||
otherarch/ggml_v2.h)
|
||||
target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
|
||||
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
|
||||
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
add_library(common2
|
||||
examples/common.cpp
|
||||
examples/common.h)
|
||||
|
|
32
Makefile
32
Makefile
|
@ -228,7 +228,21 @@ ggml_clblast.o: ggml.c ggml.h
|
|||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml-opencl-legacy.o: ggml-opencl-legacy.c ggml-opencl-legacy.h
|
||||
|
||||
#version 2 libs
|
||||
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
|
||||
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml_v2-opencl.o: otherarch/ggml_v2-opencl.cpp otherarch/ggml_v2-opencl.h
|
||||
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-opencl-legacy.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
#extreme old version compat
|
||||
|
@ -264,19 +278,15 @@ main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
|||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
koboldcpp: ggml.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
|
||||
koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
|
||||
$(DEFAULT_BUILD)
|
||||
|
||||
koboldcpp_openblas: ggml_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
|
||||
$(OPENBLAS_BUILD)
|
||||
|
||||
koboldcpp_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
|
||||
$(OPENBLAS_BUILD)
|
||||
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
$(NOAVX2_BUILD)
|
||||
|
||||
koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
$(OPENBLAS_NOAVX2_BUILD)
|
||||
|
||||
koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml-opencl-legacy.o
|
||||
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o
|
||||
$(CLBLAST_BUILD)
|
||||
|
||||
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
|
|
|
@ -63,7 +63,7 @@ extern "C"
|
|||
putenv((char*)deviceenv.c_str());
|
||||
executable_path = inputs.executable_path;
|
||||
|
||||
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
|
||||
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5)
|
||||
{
|
||||
printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
|
||||
ModelLoadResult lr = gpttype_load_model(inputs, file_format);
|
||||
|
@ -141,7 +141,7 @@ extern "C"
|
|||
return true;
|
||||
}
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
|
||||
ModelLoadResult lr = gpttype_load_model(inputs, file_format);
|
||||
|
|
127
ggml-opencl.cpp
127
ggml-opencl.cpp
|
@ -6,6 +6,7 @@
|
|||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#include <clblast.h>
|
||||
#include <clblast_c.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
@ -16,55 +17,45 @@
|
|||
#define CL_DMMV_BLOCK_SIZE 32;
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
std::string program_source = MULTILINE_QUOTE(
|
||||
static std::string program_source = MULTILINE_QUOTE(
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
constant uint QR4_0 = 2;
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK4_1 = 32;
|
||||
constant uint QR4_1 = 2;
|
||||
struct block_q4_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK5_0 = 32;
|
||||
constant uint QR5_0 = 2;
|
||||
struct __attribute__ ((packed)) block_q5_0
|
||||
{
|
||||
half d;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_0 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK5_1 = 32;
|
||||
constant uint QR5_1 = 2;
|
||||
struct block_q5_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_1 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK8_0 = 32;
|
||||
constant uint QR8_0 = 1;
|
||||
struct block_q8_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK8_0];
|
||||
uint8_t qs[32];
|
||||
};
|
||||
|
||||
|
||||
|
@ -135,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
|
|||
*v0 = vi0*d;
|
||||
*v1 = vi1*d;
|
||||
}
|
||||
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
*v0 = vload_half(0, &x[ib + 0]);
|
||||
*v1 = vload_half(0, &x[ib + 1]);
|
||||
}
|
||||
);
|
||||
|
||||
std::string dequant_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
|
||||
|
||||
|
@ -165,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
|||
}
|
||||
);
|
||||
|
||||
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
|
@ -207,29 +198,29 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|||
}
|
||||
);
|
||||
|
||||
std::array<std::string, 5> dequant_str_keys = {
|
||||
static std::array<std::string, 5> dequant_str_keys = {
|
||||
"KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_str_values = {
|
||||
"dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
|
||||
"dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
|
||||
"dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
|
||||
"dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
|
||||
"dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
|
||||
static std::array<std::string, 30> dequant_str_values = {
|
||||
"dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
"dequantize_row_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
|
||||
"dequantize_row_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
|
||||
"convert_row_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
|
||||
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
|
||||
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
|
||||
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
|
||||
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
|
||||
static std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
|
||||
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
|
||||
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
||||
static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(from, pos)) != std::string::npos) {
|
||||
s.replace(pos, from.length(), to);
|
||||
|
@ -238,15 +229,15 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
|
|||
return s;
|
||||
}
|
||||
|
||||
std::string generate_kernels() {
|
||||
static std::string generate_kernels() {
|
||||
std::stringstream src;
|
||||
src << program_source << '\n';
|
||||
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
|
||||
std::string dequant_kernel = dequant_template;
|
||||
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
||||
for (size_t j = 0; j < dequant_str_keys.size(); j++) {
|
||||
replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
}
|
||||
src << dequant_kernel << '\n';
|
||||
src << dmmv_kernel << '\n';
|
||||
|
@ -259,6 +250,7 @@ std::string generate_kernels() {
|
|||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
||||
fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
@ -271,7 +263,7 @@ static cl_program program;
|
|||
static cl_kernel convert_row_f16_cl;
|
||||
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
|
||||
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
|
||||
static bool fp16_support;
|
||||
static bool fp16_support = false;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
cl_program p;
|
||||
|
@ -339,6 +331,8 @@ void ggml_cl_init(void) {
|
|||
}
|
||||
free(ext_buffer);
|
||||
printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support);
|
||||
fp16_support = false;
|
||||
printf("CL FP16 temporarily disabled pending further optimization.\n");
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||
CL_CHECK(err, "clCreateContext");
|
||||
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
||||
|
@ -552,17 +546,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
|
||||
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
|
@ -650,18 +645,19 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor,
|
||||
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
|
@ -757,17 +753,18 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
// compute
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
|
||||
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
void * ggml_cl_host_malloc(size_t size);
|
||||
void ggml_cl_host_free(void * ptr);
|
||||
|
||||
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
91
ggml.c
91
ggml.c
|
@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
|
|||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
|
||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
||||
#if __AVX2__ || __AVX512F__
|
||||
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
||||
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
||||
uint32_t x32;
|
||||
|
@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|||
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
||||
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#if __ARM_NEON
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
|
@ -2481,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|||
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
||||
sumf += (GGML_FP16_TO_FP32(x[i]).d*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
@ -9289,7 +9289,7 @@ static void ggml_compute_forward_rms_norm_back(
|
|||
|
||||
// ggml_compute_forward_mul_mat
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
// helper function to determine if it is better to use BLAS or not
|
||||
// for large matrices, BLAS is faster
|
||||
static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
|
@ -9330,7 +9330,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
#endif
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
|
@ -9394,9 +9394,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
|
@ -9416,21 +9423,11 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
GGML_TYPE_F32);
|
||||
#else
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
|
@ -9569,9 +9566,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
|
@ -9601,20 +9605,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
assert(id*sizeof(float) <= params->wsize);
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
GGML_TYPE_F32);
|
||||
#else
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
|
@ -9626,7 +9616,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9789,9 +9778,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
|
@ -9814,9 +9810,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
||||
#else
|
||||
{
|
||||
size_t id = 0;
|
||||
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
||||
|
@ -9828,23 +9821,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
}
|
||||
|
||||
const float * x = wdata;
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
type);
|
||||
#else
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13941,9 +13923,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
|
@ -13957,13 +13946,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
#endif
|
||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||
cur = 0;
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
}
|
||||
#endif
|
||||
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||
|
@ -15468,4 +15457,4 @@ int ggml_cpu_has_vsx(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
1
ggml.h
1
ggml.h
|
@ -249,6 +249,7 @@ extern "C" {
|
|||
enum ggml_backend {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_CUDA = 1,
|
||||
GGML_BACKEND_CL = 2,
|
||||
};
|
||||
|
||||
// model file types
|
||||
|
|
|
@ -12,30 +12,42 @@
|
|||
#include "otherarch.h"
|
||||
|
||||
//for easier compilation
|
||||
#include "llama_v2.cpp"
|
||||
|
||||
//concat source files into one file for compilation purposes
|
||||
#include "llama_v2.cpp"
|
||||
#include "llama.cpp"
|
||||
#include "utils.cpp"
|
||||
#include "gptj_v1.cpp"
|
||||
#include "gptj_v2.cpp"
|
||||
#include "gptj_v3.cpp"
|
||||
#include "gpt2_v1.cpp"
|
||||
#include "gpt2_v2.cpp"
|
||||
#include "gpt2_v3.cpp"
|
||||
#include "rwkv_v2.cpp"
|
||||
#include "neox_v2.cpp"
|
||||
#include "neox_v3.cpp"
|
||||
|
||||
|
||||
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
|
||||
static FileFormat file_format = FileFormat::BADFORMAT;
|
||||
|
||||
static gpt_vocab vocab;
|
||||
static gptj_model_v1 gptj_ctx_v1;
|
||||
static gptj_model gptj_ctx_v2;
|
||||
|
||||
static gptj_v1_model gptj_ctx_v1;
|
||||
static gptj_v2_model gptj_ctx_v2;
|
||||
static gptj_model gptj_ctx_v3;
|
||||
|
||||
static gpt2_v1_model gpt2_ctx_v1;
|
||||
static gpt2_model gpt2_ctx_v2;
|
||||
static gpt_neox_model neox_ctx;
|
||||
static gpt2_v2_model gpt2_ctx_v2;
|
||||
static gpt2_model gpt2_ctx_v3;
|
||||
|
||||
static gpt_neox_v2_model neox_ctx_v2;
|
||||
static gpt_neox_model neox_ctx_v3;
|
||||
|
||||
static rwkv_context * rwkv_ctx_v1;
|
||||
static llama_v2_context_params llama_ctx_params_v2;
|
||||
static llama_context_params llama_ctx_params;
|
||||
static llama_context * llama_ctx_v1;
|
||||
static llama_v2_context * llama_ctx_v2;
|
||||
static llama_context * llama_ctx_v3;
|
||||
|
||||
static gpt_params params;
|
||||
static int n_past = 0;
|
||||
|
@ -81,7 +93,6 @@ inline bool LogitsDuplicated(std::vector<float> & arr1, std::vector<float> & arr
|
|||
|
||||
llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
|
||||
{
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
std::vector<float> probs;
|
||||
probs.reserve(candidates->size);
|
||||
|
@ -220,7 +231,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
blasbatchsize = inputs.blasbatchsize;
|
||||
params.memory_f16 = inputs.f16_kv;
|
||||
params.n_ctx = inputs.max_context_length;
|
||||
neox_ctx.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = params.n_ctx;
|
||||
|
||||
neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
|
||||
= neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = params.n_ctx;
|
||||
|
||||
printf("System Info: %s\n", llama_print_system_info());
|
||||
SetQuantsUnshuffled(false);
|
||||
|
@ -229,33 +242,31 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
|
||||
|
||||
llama_ctx_params = llama_context_default_params();
|
||||
llama_ctx_params.n_ctx = inputs.max_context_length;
|
||||
llama_ctx_params_v2 = llama_v2_context_default_params();
|
||||
llama_ctx_params_v2.n_ctx = inputs.max_context_length;
|
||||
//llama_ctx_params.n_parts = -1;
|
||||
llama_ctx_params.seed = -1;
|
||||
llama_ctx_params.f16_kv = inputs.f16_kv;
|
||||
llama_ctx_params.logits_all = false;
|
||||
llama_ctx_params.use_mmap = inputs.use_mmap;
|
||||
llama_ctx_params.use_mlock = inputs.use_mlock;
|
||||
llama_ctx_params.n_gpu_layers = inputs.gpulayers;
|
||||
llama_ctx_params_v2.seed = -1;
|
||||
llama_ctx_params_v2.f16_kv = inputs.f16_kv;
|
||||
llama_ctx_params_v2.logits_all = false;
|
||||
llama_ctx_params_v2.use_mmap = inputs.use_mmap;
|
||||
llama_ctx_params_v2.use_mlock = inputs.use_mlock;
|
||||
llama_ctx_params_v2.n_gpu_layers = inputs.gpulayers;
|
||||
|
||||
llama_ctx_v1 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
|
||||
llama_ctx_v2 = llama_v2_init_from_file(modelname.c_str(), llama_ctx_params_v2);
|
||||
|
||||
if (llama_ctx_v1 == NULL)
|
||||
if (llama_ctx_v2 == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
if (file_format < FileFormat::GGJT_2)
|
||||
{
|
||||
printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
|
||||
}
|
||||
|
||||
|
||||
printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
|
||||
|
||||
if (lora_filename != "")
|
||||
{
|
||||
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
|
||||
|
||||
int err = llama_apply_lora_from_file(llama_ctx_v1,
|
||||
int err = llama_v2_apply_lora_from_file(llama_ctx_v2,
|
||||
lora_filename.c_str(),
|
||||
NULL,
|
||||
n_threads);
|
||||
|
@ -268,9 +279,47 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
//determine mem per token
|
||||
const std::vector<int> tmp = {1, 2, 3, 4};
|
||||
llama_eval(llama_ctx_v1, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
llama_ctx_params = llama_context_default_params();
|
||||
llama_ctx_params.n_ctx = inputs.max_context_length;
|
||||
//llama_ctx_paran_parts = -1;
|
||||
llama_ctx_params.seed = -1;
|
||||
llama_ctx_params.f16_kv = inputs.f16_kv;
|
||||
llama_ctx_params.logits_all = false;
|
||||
llama_ctx_params.use_mmap = inputs.use_mmap;
|
||||
llama_ctx_params.use_mlock = inputs.use_mlock;
|
||||
llama_ctx_params.n_gpu_layers = inputs.gpulayers;
|
||||
|
||||
llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
|
||||
|
||||
if (llama_ctx_v3 == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
if (lora_filename != "")
|
||||
{
|
||||
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
|
||||
|
||||
int err = llama_apply_lora_from_file(llama_ctx_v3,
|
||||
lora_filename.c_str(),
|
||||
NULL,
|
||||
n_threads);
|
||||
if (err != 0)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
//determine mem per token
|
||||
const std::vector<int> tmp = {1, 2, 3, 4};
|
||||
llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if (file_format == FileFormat::RWKV_1)
|
||||
{
|
||||
|
@ -329,25 +378,45 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
|
||||
else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3 || file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
|
||||
|
||||
ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
if(file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
|
||||
return res;
|
||||
ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v3, vocab, file_format, inputs.gpulayers);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
|
||||
return res;
|
||||
}
|
||||
// determine the required inference memory per token:
|
||||
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
|
||||
|
||||
ModelLoadResult res = gpt2_v2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
|
||||
return res;
|
||||
}
|
||||
// determine the required inference memory per token:
|
||||
gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
// determine the required inference memory per token:
|
||||
gpt2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
|
||||
{
|
||||
|
@ -375,82 +444,146 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
else if(file_format == FileFormat::GPTJ_3 || file_format == FileFormat::GPTJ_4 || file_format == FileFormat::GPTJ_5)
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
|
||||
|
||||
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx, vocab, file_format);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
if(file_format == FileFormat::GPTJ_5)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
|
||||
return res;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gpt_neox_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
|
||||
{
|
||||
//run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
|
||||
std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
|
||||
auto orig_par_res = neox_ctx.hparams.par_res;
|
||||
neox_ctx.hparams.par_res = 0; //test with residual false
|
||||
gpt_neox_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token);
|
||||
neox_ctx.hparams.par_res = orig_par_res;
|
||||
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
|
||||
std::string predicted = vocab.id_to_token[topid].c_str();
|
||||
auto findresult = predicted.find("8");
|
||||
if(findresult != std::string::npos && findresult<2)
|
||||
ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v3, vocab, inputs.gpulayers);
|
||||
if (loadresult == ModelLoadResult::FAIL)
|
||||
{
|
||||
printf("\n---\nOld RedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
|
||||
ggml_free(neox_ctx.ctx);
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return loadresult;
|
||||
}
|
||||
else if (loadresult == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
|
||||
return loadresult;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
//if the logits are NAN or duplicated, it means the model is incompatible
|
||||
std::vector<float> oldlogits(logits);
|
||||
|
||||
//this is another hack because they change the library - we run the eval through the model
|
||||
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
||||
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_free(gptj_ctx_v3.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
|
||||
|
||||
ModelLoadResult loadresult = gptj_v2_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
|
||||
if (loadresult == ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return loadresult;
|
||||
}
|
||||
else if (loadresult == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
|
||||
return loadresult;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
//if the logits are NAN or duplicated, it means the model is incompatible
|
||||
std::vector<float> oldlogits(logits);
|
||||
|
||||
//this is another hack because they change the library - we run the eval through the model
|
||||
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
||||
gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_v2_free(gptj_ctx_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
|
||||
return res;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
|
||||
|
||||
ModelLoadResult res = gpt_neox_v2_model_load(params.model, neox_ctx_v2, vocab, file_format);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
|
||||
return res;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
|
||||
{
|
||||
//run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
|
||||
std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
|
||||
auto orig_par_res = neox_ctx_v2.hparams.par_res;
|
||||
neox_ctx_v2.hparams.par_res = 0; //test with residual false
|
||||
gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, test_embd, logits, mem_per_token);
|
||||
neox_ctx_v2.hparams.par_res = orig_par_res;
|
||||
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
|
||||
std::string predicted = vocab.id_to_token[topid].c_str();
|
||||
auto findresult = predicted.find("8");
|
||||
if(findresult != std::string::npos && findresult<2)
|
||||
{
|
||||
printf("\n---\nOld RedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
|
||||
ggml_v2_free(neox_ctx_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
|
||||
|
||||
ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
|
||||
if (loadresult == ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return loadresult;
|
||||
}
|
||||
else if (loadresult == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
|
||||
return loadresult;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
//if the logits are NAN or duplicated, it means the model is incompatible
|
||||
std::vector<float> oldlogits(logits);
|
||||
|
||||
//this is another hack because they change the library - we run the eval through the model
|
||||
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
||||
gptj_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_free(gptj_ctx_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
printf("\nUnknown Model, cannot load.\n");
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -501,16 +634,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
// tokenize the prompt
|
||||
std::vector<int> embd_inp;
|
||||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
if (file_format == FileFormat::GGML)
|
||||
if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
|
||||
{
|
||||
embd_inp = ::legacy_llama_tokenize(llama_ctx_v1, params.prompt, true);
|
||||
embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
|
||||
}
|
||||
else if (file_format == FileFormat::GGML)
|
||||
{
|
||||
embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
embd_inp = ::llama_tokenize(llama_ctx_v1, params.prompt, true);
|
||||
embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -560,7 +697,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
{
|
||||
//for non llama, limit to 256
|
||||
int bbs = blasbatchsize;
|
||||
if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2)
|
||||
if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3)
|
||||
{
|
||||
bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
|
||||
}
|
||||
|
@ -592,7 +729,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
{
|
||||
n_vocab = llama_n_vocab(llama_ctx_v1);
|
||||
n_vocab = llama_v2_n_vocab(llama_ctx_v2);
|
||||
}
|
||||
else if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
n_vocab = llama_n_vocab(llama_ctx_v3);
|
||||
}
|
||||
else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
|
||||
{
|
||||
|
@ -602,6 +743,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
{
|
||||
n_vocab = gptj_ctx_v2.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_5)
|
||||
{
|
||||
n_vocab = gptj_ctx_v3.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format == FileFormat::GPT2_1)
|
||||
{
|
||||
n_vocab = gpt2_ctx_v1.hparams.n_vocab;
|
||||
|
@ -610,9 +755,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
{
|
||||
n_vocab = gpt2_ctx_v2.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
n_vocab = gpt2_ctx_v3.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
{
|
||||
n_vocab = neox_ctx.hparams.n_vocab;
|
||||
n_vocab = neox_ctx_v2.hparams.n_vocab;
|
||||
}
|
||||
else if( file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
n_vocab = neox_ctx_v3.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format == FileFormat::RWKV_1)
|
||||
{
|
||||
|
@ -641,11 +794,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
if(debugmode)
|
||||
{
|
||||
printf("\n[Debug: Dump Input Tokens]\n");
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
for (auto id : embd_inp)
|
||||
{
|
||||
printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v1, id),id);
|
||||
printf("'%s (%d)', ",llama_v2_token_to_str(llama_ctx_v2, id),id);
|
||||
}
|
||||
}
|
||||
else if (file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
for (auto id : embd_inp)
|
||||
{
|
||||
printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v3, id),id);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -680,7 +840,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
{
|
||||
evalres = (llama_eval(llama_ctx_v1, embd.data(), embdsize, n_past, params.n_threads)==0);
|
||||
evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
|
||||
}
|
||||
else if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
|
||||
}
|
||||
else if(file_format==FileFormat::RWKV_1)
|
||||
{
|
||||
|
@ -694,20 +858,37 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
}
|
||||
else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
|
||||
{
|
||||
evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
evalres = gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
}
|
||||
else if(file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
{
|
||||
evalres = gpt_neox_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
evalres = gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
|
||||
{
|
||||
evalres = legacy_gptj_eval(gptj_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
|
||||
{
|
||||
evalres = gptj_v2_eval(gptj_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_5)
|
||||
{
|
||||
evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else
|
||||
{
|
||||
evalres = gptj_eval(gptj_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
printf("\nCannot find eval function\n");
|
||||
}
|
||||
|
||||
if (!evalres)
|
||||
{
|
||||
fprintf(stderr, "Failed to predict\n");
|
||||
|
@ -739,9 +920,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
printf("\n");
|
||||
}
|
||||
|
||||
if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
auto logits = llama_get_logits(llama_ctx_v1);
|
||||
float * logits;
|
||||
if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
logits = llama_get_logits(llama_ctx_v3);
|
||||
}
|
||||
else
|
||||
{
|
||||
logits = llama_v2_get_logits(llama_ctx_v2);
|
||||
}
|
||||
|
||||
if (!unbanTokens)
|
||||
{
|
||||
|
@ -765,10 +954,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
if ((file_format == FileFormat::GPT2_1 ||
|
||||
file_format == FileFormat::GPT2_2 ||
|
||||
file_format == FileFormat::GPT2_3 ||
|
||||
file_format == FileFormat::GPT2_4 ||
|
||||
file_format == FileFormat::GPTJ_1 ||
|
||||
file_format == FileFormat::GPTJ_2 ||
|
||||
file_format == FileFormat::GPTJ_3 ||
|
||||
file_format == FileFormat::GPTJ_4) &&
|
||||
file_format == FileFormat::GPTJ_4 ||
|
||||
file_format == FileFormat::GPTJ_5) &&
|
||||
logits.size() > 50256)
|
||||
{
|
||||
logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
|
||||
|
@ -793,7 +984,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
{
|
||||
concat_output += llama_token_to_str(llama_ctx_v1, id);
|
||||
if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
concat_output += llama_token_to_str(llama_ctx_v3, id);
|
||||
}
|
||||
else
|
||||
{
|
||||
concat_output += llama_v2_token_to_str(llama_ctx_v2, id);
|
||||
}
|
||||
|
||||
if(unbanTokens && id==llama_token_eos())
|
||||
{
|
||||
printf("\n(EOS token triggered!)");
|
||||
|
|
|
@ -200,7 +200,7 @@ maxctx = 2048
|
|||
maxlen = 128
|
||||
modelbusy = False
|
||||
defaultport = 5001
|
||||
KcppVersion = "1.23.1"
|
||||
KcppVersion = "1.24"
|
||||
|
||||
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
sys_version = ""
|
||||
|
|
36
llama.cpp
36
llama.cpp
|
@ -11,6 +11,8 @@
|
|||
#include "ggml.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
|
@ -91,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
|||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_7B, 800ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
|
@ -933,7 +935,7 @@ static void llama_model_load_internal(
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
}
}

@ -941,7 +943,7 @@ static void llama_model_load_internal(
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
}
}
@ -1039,7 +1041,7 @@ static void llama_model_load_internal(
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

model.mapping = std::move(ml->mapping);
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS)
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@ -1065,6 +1067,32 @@ static void llama_model_load_internal(

fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#elif defined(GGML_USE_CLBLAST)
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);

size_t vram_total = 0;

for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];

ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
}
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__);
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
}

fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#else
(void) n_gpu_layers;
#endif
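For reference, the offload pattern added above can be condensed as follows. This is only a sketch assuming the llama_model / llama_layer types used in the diff (the helper name offload_layers_to_opencl is not part of the patch); ggml_cl_transform_tensor() uploads a tensor to the device and the byte count is accumulated for the VRAM log line.

#include <algorithm>
#include <cstddef>

// Sketch: move the weight tensors of the first n_gpu layers to the OpenCL device.
static size_t offload_layers_to_opencl(llama_model & model, int n_gpu_layers)
{
    const int n_gpu = std::min(n_gpu_layers, int(model.hparams.n_layer));
    size_t vram_total = 0;
    for (int i = 0; i < n_gpu; ++i) {
        auto & layer = model.layers[i];
        for (ggml_tensor * t : { layer.wq, layer.wk, layer.wv, layer.wo,
                                 layer.w1, layer.w2, layer.w3 }) {
            ggml_cl_transform_tensor(t);   // upload to device memory
            vram_total += ggml_nbytes(t);  // track VRAM use for logging
        }
    }
    return vram_total; // caller reports this in MB
}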
@ -112,7 +112,14 @@ void print_tok_vec(std::vector<float> &embd)
temp %= 1000;
if (qntvr != 0)
{
fileformat = FileFormat::GPTJ_4;
if (qntvr == 1)
{
fileformat = FileFormat::GPTJ_4;
}
else
{
fileformat = FileFormat::GPTJ_5;
}
}
else if (temp != 0 && temp != 1)
{
@ -131,8 +138,15 @@ void print_tok_vec(std::vector<float> &embd)
const int32_t qntvr = temp / 1000;
temp %= 1000;
if (qntvr != 0)
{
fileformat = FileFormat::GPT2_3;
{
if (qntvr == 1)
{
fileformat = FileFormat::GPT2_3;
}
else
{
fileformat = FileFormat::GPT2_4;
}
}
else if (temp != 0 && temp != 1)
{
@ -142,7 +156,7 @@ void print_tok_vec(std::vector<float> &embd)
else if(vocabsiz < 31998 || vocabsiz > 33000)
{
//anything outside the llama v1 range is assumed to be NeoX
fileformat = FileFormat::NEOX_4;
fileformat = FileFormat::NEOX_6;
uint32_t temp,temp2;
fin.read((char *)&temp, sizeof(temp)); //ctx
fin.read((char *)&temp, sizeof(temp)); //n_embd
@ -169,17 +183,21 @@ void print_tok_vec(std::vector<float> &embd)
if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0
{
//its a f16/f32 model in the new format
fileformat = FileFormat::NEOX_4;
fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6;
}
}
else
{
const int32_t qntvr = temp2 / 1000; //for future use
//then temp was par_res
if(temp==0) //use_parallel_residual is false in RedPajama
//then temp was par_res, use_parallel_residual is false in RedPajama
if(qntvr==1)
{
fileformat = FileFormat::NEOX_5;
fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4);
}
else
{
fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6);
}
}
}
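The NeoX branch above folds two signals into one format choice: the quantization version and par_res (temp == 0 means use_parallel_residual is false, i.e. RedPajama). A compact sketch of that mapping, using an illustrative helper name that is not in the patch:

// Sketch of the NeoX format selection implemented above.
//   qntvr == 1 -> unshuffled formats        (NEOX_5 redpajama / NEOX_4)
//   otherwise  -> 16-bit-scalar formats     (NEOX_7 redpajama / NEOX_6)
static FileFormat pick_neox_format(uint32_t par_res, int32_t qntvr)
{
    if (qntvr == 1) {
        return par_res == 0 ? FileFormat::NEOX_5 : FileFormat::NEOX_4;
    }
    return par_res == 0 ? FileFormat::NEOX_7 : FileFormat::NEOX_6;
}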
@ -197,7 +215,7 @@ void print_tok_vec(std::vector<float> &embd)
}
else if(magic == 0x67676a74) //v3 format ggjt
{
fileformat = FileFormat::GGJT_2; //ggjt by default
fileformat = FileFormat::GGJT_3; //ggjt by default
uint32_t ver, temp, ftype;
fin.read((char *)&ver, sizeof(ver)); //file version
fin.read((char *)&temp, sizeof(temp));//vocab
@ -212,6 +230,10 @@ void print_tok_vec(std::vector<float> &embd)
{
fileformat = FileFormat::GGJT;
}
else if(ver==2)
{
fileformat = FileFormat::GGJT_2;
}
}
fin.close();
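In short, the 0x67676a74 magic selects the ggjt family and the embedded file version then narrows the format, with version 3 as the new default. A minimal sketch of that mapping (the helper name classify_ggjt is assumed, not part of the patch):

static FileFormat classify_ggjt(uint32_t ver)
{
    if (ver == 1) return FileFormat::GGJT;
    if (ver == 2) return FileFormat::GGJT_2;
    return FileFormat::GGJT_3; // version 3, and the default chosen above
}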
@ -20,15 +20,18 @@ enum FileFormat
GGHF=2, // 2=(llama ggmf)
GGJT=3, // 3=(llama ggjt)
GGJT_2=4, //newer llama format unshuffled
GGJT_3=5, //using 16bit scalar

GPTJ_1=100, //the very first super old GPTJ format
GPTJ_2=101, //pygmalion, uses old ggml lib
GPTJ_3=102, //uses new ggml lib
GPTJ_4=103, //unshuffled
GPTJ_5=104, //using 16bit scalar

GPT2_1=200,
GPT2_2=201,
GPT2_3=202, //unshuffled
GPT2_4=203, //using 16bit scalar

RWKV_1=300,

@ -37,6 +40,8 @@ enum FileFormat
NEOX_3=402, //redpajama
NEOX_4=403, //unshuffled
NEOX_5=404, //unshuffled redpajama
NEOX_6=405, //using 16bit scalar
NEOX_7=406, //using 16bit scalar redpajama
};

enum ModelLoadResult
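With GGJT_3 added to the enum, the llama-family check repeated in gpttype_generate grows by one case. A hypothetical convenience predicate (not in the patch) that mirrors that repeated condition:

// GGJT_3 routes to the current llama.cpp context; the older values route to the legacy v2 path.
static bool is_llama_family(FileFormat f)
{
    return f == FileFormat::GGML || f == FileFormat::GGHF ||
           f == FileFormat::GGJT || f == FileFormat::GGJT_2 ||
           f == FileFormat::GGJT_3;
}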
@ -12,12 +12,12 @@
|
|||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml_v2.h"
|
||||
|
||||
#define CL_DMMV_BLOCK_SIZE 32;
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
std::string program_source = MULTILINE_QUOTE(
|
||||
static std::string program_source = MULTILINE_QUOTE(
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
|
@ -126,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
|
|||
*v0 = vi0*d;
|
||||
*v1 = vi1*d;
|
||||
}
|
||||
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
*v0 = vload_half(0, &x[ib + 0]);
|
||||
*v1 = vload_half(0, &x[ib + 1]);
|
||||
}
|
||||
);
|
||||
|
||||
std::string dequant_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
|
||||
|
||||
|
@ -156,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
|||
}
|
||||
);
|
||||
|
||||
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
|
@ -198,11 +198,11 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|||
}
|
||||
);
|
||||
|
||||
std::array<std::string, 5> dequant_str_keys = {
|
||||
static std::array<std::string, 5> dequant_str_keys = {
|
||||
"KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_str_values = {
|
||||
static std::array<std::string, 30> dequant_str_values = {
|
||||
"dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
|
@ -211,7 +211,7 @@ std::array<std::string, 30> dequant_str_values = {
|
|||
"convert_row_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
static std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
|
@ -220,7 +220,7 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
|||
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
|
||||
static std::string& sreplace2(std::string& s, const std::string& from, const std::string& to) {
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(from, pos)) != std::string::npos) {
|
||||
s.replace(pos, from.length(), to);
|
||||
|
@ -236,8 +236,8 @@ static std::string generate_kernels() {
|
|||
std::string dequant_kernel = dequant_template;
|
||||
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
||||
for (size_t j = 0; j < dequant_str_keys.size(); j++) {
|
||||
sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
sreplace2(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
sreplace2(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
}
|
||||
src << dequant_kernel << '\n';
|
||||
src << dmmv_kernel << '\n';
|
||||
|
@ -296,13 +296,13 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
return p;
}

void ggml_cl_init(void) {
void ggml_v2_cl_init(void) {
cl_int err = 0;
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
printf("\nInitializing CLBlast (First Run)...");
char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM));
int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE));
printf("\nInitializing LEGACY v2 CLBlast (First Run)...");
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
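Note that the renamed legacy init still reads the same GGML_CLBLAST_PLATFORM / GGML_CLBLAST_DEVICE environment variables, so device selection works identically for both ggml versions. A small usage sketch, assuming a POSIX environment (setenv; on Windows an equivalent such as _putenv_s would be needed), with a helper name that is purely illustrative:

#include <cstdlib>
#include <string>

// Pick a specific OpenCL platform/device before the first CLBlast run.
static void select_clblast_device(int platform, int device)
{
    setenv("GGML_CLBLAST_PLATFORM", std::to_string(platform).c_str(), 1);
    setenv("GGML_CLBLAST_DEVICE",   std::to_string(device).c_str(),   1);
}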
@ -376,7 +376,7 @@ void ggml_cl_init(void) {
|
|||
CL_CHECK(err, "clCreateKernel");
|
||||
}
|
||||
|
||||
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
static void ggml_v2_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
if (req_size <= *cur_size) {
|
||||
return;
|
||||
}
|
||||
|
@ -391,38 +391,38 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
|
|||
CL_CHECK(err, "clCreateBuffer");
|
||||
}
|
||||
|
||||
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
|
||||
static cl_kernel* ggml_v2_get_to_fp32_cl(ggml_v2_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_V2_TYPE_Q4_0:
|
||||
return &dequantize_row_q4_0_cl;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V2_TYPE_Q4_1:
|
||||
return &dequantize_row_q4_1_cl;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V2_TYPE_Q5_0:
|
||||
return &dequantize_row_q5_0_cl;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V2_TYPE_Q5_1:
|
||||
return &dequantize_row_q5_1_cl;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V2_TYPE_Q8_0:
|
||||
return &dequantize_row_q8_0_cl;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_V2_TYPE_F16:
|
||||
return &convert_row_f16_cl;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
|
||||
static cl_kernel* ggml_v2_get_dequantize_mul_mat_vec_cl(ggml_v2_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_V2_TYPE_Q4_0:
|
||||
return &dequantize_mul_mat_vec_q4_0_cl;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V2_TYPE_Q4_1:
|
||||
return &dequantize_mul_mat_vec_q4_1_cl;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V2_TYPE_Q5_0:
|
||||
return &dequantize_mul_mat_vec_q5_0_cl;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V2_TYPE_Q5_1:
|
||||
return &dequantize_mul_mat_vec_q5_1_cl;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V2_TYPE_Q8_0:
|
||||
return &dequantize_mul_mat_vec_q8_0_cl;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_V2_TYPE_F16:
|
||||
return &convert_mul_mat_vec_f16_cl;
|
||||
default:
|
||||
return nullptr;
|
||||
|
@ -454,7 +454,7 @@ struct cl_buffer {
|
|||
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
|
||||
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
|
||||
|
||||
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
|
||||
static cl_mem ggml_v2_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
|
||||
scoped_spin_lock lock(g_cl_pool_lock);
|
||||
cl_int err;
|
||||
|
||||
|
@ -473,7 +473,7 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
|
|||
return mem;
|
||||
}
|
||||
|
||||
static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
||||
static void ggml_v2_cl_pool_free(cl_mem mem, size_t size) {
|
||||
scoped_spin_lock lock(g_cl_pool_lock);
|
||||
|
||||
for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
|
||||
|
@ -488,7 +488,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
|||
clReleaseMemObject(mem);
|
||||
}
|
||||
|
||||
static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
|
||||
static cl_int ggml_v2_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_v2_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
|
||||
cl_int err;
|
||||
const uint64_t ne0 = src->ne[0];
|
||||
const uint64_t ne1 = src->ne[1];
|
||||
|
@ -496,9 +496,9 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
|||
const uint64_t nb1 = src->nb[1];
|
||||
const uint64_t nb2 = src->nb[2];
|
||||
const uint64_t nb3 = src->nb[3];
|
||||
const enum ggml_type type = src->type;
|
||||
const size_t ts = ggml_type_size(type);
|
||||
const size_t bs = ggml_blck_size(type);
|
||||
const enum ggml_v2_type type = src->type;
|
||||
const size_t ts = ggml_v2_type_size(type);
|
||||
const size_t bs = ggml_v2_blck_size(type);
|
||||
|
||||
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
|
||||
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
||||
|
@ -525,7 +525,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
|||
return err;
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
static void ggml_v2_cl_mul_mat_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
@ -544,18 +544,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size;
|
||||
cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
|
||||
cl_int err;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy data to device
|
||||
err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
err |= ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
|
||||
err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
err |= ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
|
@ -574,7 +574,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
|
@ -584,13 +584,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
}
|
||||
}
|
||||
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
ggml_v2_cl_pool_free(d_X, x_size);
|
||||
ggml_v2_cl_pool_free(d_Y, y_size);
|
||||
ggml_v2_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
|
||||
GGML_ASSERT(fp16_support);
|
||||
static void ggml_v2_cl_mul_mat_f16(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst, void * wdata, size_t /* wsize */) {
|
||||
GGML_V2_ASSERT(fp16_support);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
|
@ -608,16 +608,16 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
|
||||
const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
|
||||
const ggml_v2_fp16_t alpha = ggml_v2_fp32_to_fp16(1.0f);
|
||||
const ggml_v2_fp16_t beta = ggml_v2_fp32_to_fp16(0.0f);
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size;
|
||||
cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
|
||||
cl_int err;
|
||||
|
||||
|
@ -627,20 +627,20 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device
|
||||
err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
|
||||
err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
ggml_v2_fp16_t * const tmp = (ggml_v2_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
ggml_v2_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
}
|
||||
else {
|
||||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
|
||||
ggml_v2_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -648,14 +648,14 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne10; i00++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
|
||||
tmp[i01*ne10 + i00] = ggml_v2_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy src1 to device
|
||||
err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL);
|
||||
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
|
||||
err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_v2_fp16_t) * y_ne, tmp, 0, NULL, NULL);
|
||||
CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
|
@ -673,24 +673,24 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL);
|
||||
err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_v2_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
ggml_v2_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
ggml_v2_cl_pool_free(d_X, x_size);
|
||||
ggml_v2_cl_pool_free(d_Y, y_size);
|
||||
ggml_v2_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
static void ggml_v2_cl_mul_mat_q_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
@ -701,7 +701,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
const ggml_type type = src0->type;
|
||||
const ggml_v2_type type = src0->type;
|
||||
const bool mul_mat_vec = ne11 == 1;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
|
@ -709,39 +709,39 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
||||
const size_t q_sz = ggml_v2_type_size(type) * x_ne / ggml_v2_blck_size(type);
|
||||
|
||||
size_t x_size, y_size, d_size, q_size;
|
||||
cl_mem d_X;
|
||||
if (!mul_mat_vec) {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
|
||||
d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
|
||||
}
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_Q;
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
if (src0->backend == GGML_V2_BACKEND_CPU) {
|
||||
d_Q = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
}
|
||||
|
||||
cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
|
||||
cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
|
||||
GGML_ASSERT(to_fp32_cl != nullptr);
|
||||
cl_kernel* to_fp32_cl = ggml_v2_get_to_fp32_cl(type);
|
||||
cl_kernel* dmmv = ggml_v2_get_dequantize_mul_mat_vec_cl(type);
|
||||
GGML_V2_ASSERT(to_fp32_cl != nullptr);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
cl_event ev_sgemm;
|
||||
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
} else if (src0->backend == GGML_BACKEND_CL) {
|
||||
if (src0->backend == GGML_V2_BACKEND_CPU) {
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
} else if (src0->backend == GGML_V2_BACKEND_CL) {
|
||||
d_Q = *(cl_mem*) src0->data;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
||||
|
@ -763,7 +763,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel");
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
// wait for conversion
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
@ -781,7 +781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -793,146 +793,146 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
}
|
||||
|
||||
if (!mul_mat_vec) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_v2_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
ggml_cl_pool_free(d_Q, q_size);
|
||||
ggml_v2_cl_pool_free(d_Y, y_size);
|
||||
ggml_v2_cl_pool_free(d_D, d_size);
|
||||
if (src0->backend == GGML_V2_BACKEND_CPU) {
|
||||
ggml_v2_cl_pool_free(d_Q, q_size);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
bool ggml_v2_cl_can_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) {
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
|
||||
// TODO: find the optimal values for these
|
||||
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
dst->type == GGML_TYPE_F32 &&
|
||||
((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
|
||||
if ((src0->type == GGML_V2_TYPE_F32 || src0->type == GGML_V2_TYPE_F16 || ggml_v2_is_quantized(src0->type)) &&
|
||||
src1->type == GGML_V2_TYPE_F32 &&
|
||||
dst->type == GGML_V2_TYPE_F32 &&
|
||||
((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_V2_BACKEND_CL)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
bool ggml_v2_cl_mul_mat_use_f16(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * /* dst */) {
// If device doesn't support FP16
if (!fp16_support) {
return false;
}

size_t src0_sz = ggml_nbytes(src0);
size_t src1_sz = ggml_nbytes(src1);
size_t src0_sz = ggml_v2_nbytes(src0);
size_t src1_sz = ggml_v2_nbytes(src1);

// mul_mat_q: src0 is converted to fp32 on device
size_t mul_mat_q_transfer = src0_sz + src1_sz;

// mul_mat_f16: src1 is converted to fp16 on cpu
size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_v2_fp16_t) * ggml_v2_nelements(src1);

// choose the smaller one to transfer to the device
// TODO: this is not always the best choice due to the overhead of converting to fp16
return mul_mat_f16_transfer < mul_mat_q_transfer;
}
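The heuristic above compares host-to-device transfer sizes only. A worked example with assumed shapes (not taken from the patch) shows why the f16 path usually wins for quantized weights:

#include <cstddef>

// Illustrative numbers: a 4096x4096 q4_0 weight (~9 MB at 18 bytes per
// 32-element block in the 16-bit-scalar format) against a 4096x512 f32
// activation (8 MB as f32, 4 MB once converted to f16 on the CPU).
static bool prefer_f16_path_example()
{
    const size_t src0_sz   = (4096ull * 4096 / 32) * 18;      // quantized weight bytes
    const size_t src1_f32  = 4096ull * 512 * sizeof(float);   // activation as f32
    const size_t src1_f16  = 4096ull * 512 * 2;               // activation as f16
    const size_t mul_mat_q_transfer   = src0_sz + src1_f32;   // ~17 MB
    const size_t mul_mat_f16_transfer = src0_sz + src1_f16;   // ~13 MB
    return mul_mat_f16_transfer < mul_mat_q_transfer;         // true -> take the f16 path
}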
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
|
||||
GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
|
||||
void ggml_v2_cl_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize) {
|
||||
GGML_V2_ASSERT(ggml_v2_cl_can_mul_mat(src0, src1, dst));
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_f32(src0, src1, dst);
|
||||
if (src0->type == GGML_V2_TYPE_F32) {
|
||||
ggml_v2_cl_mul_mat_f32(src0, src1, dst);
|
||||
}
|
||||
else if (src0->type == GGML_TYPE_F16) {
|
||||
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
|
||||
else if (src0->type == GGML_V2_TYPE_F16) {
|
||||
if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
ggml_v2_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
|
||||
}
|
||||
else {
|
||||
ggml_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
ggml_v2_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
}
|
||||
}
|
||||
else if (ggml_is_quantized(src0->type)) {
|
||||
ggml_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
else if (ggml_v2_is_quantized(src0->type)) {
|
||||
ggml_v2_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
return ggml_nelements(src1) * sizeof(ggml_fp16_t);
|
||||
size_t ggml_v2_cl_mul_mat_get_wsize(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) {
|
||||
if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
return ggml_v2_nelements(src1) * sizeof(ggml_v2_fp16_t);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ggml_cl_transform_tensor(ggml_tensor * tensor) {
|
||||
void ggml_v2_cl_transform_tensor(ggml_v2_tensor * tensor) {
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
const int64_t ne1 = tensor->ne[1];
|
||||
const int64_t ne2 = tensor->ne[2];
|
||||
const int64_t ne3 = tensor->ne[3];
|
||||
|
||||
const ggml_type type = tensor->type;
|
||||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
||||
const ggml_v2_type type = tensor->type;
|
||||
const size_t q_sz = ggml_v2_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_v2_blck_size(type);
|
||||
|
||||
size_t q_size;
|
||||
cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
|
||||
*dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
*dst = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
|
||||
// copy tensor to device
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
int i = i3*ne2 + i2;
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
}
|
||||
}
|
||||
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
tensor->data = dst;
|
||||
tensor->backend = GGML_BACKEND_CL;
|
||||
tensor->backend = GGML_V2_BACKEND_CL;
|
||||
}
|
||||
|
||||
void ggml_cl_sgemm_wrapper(
|
||||
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
|
||||
void ggml_v2_cl_sgemm_wrapper(
|
||||
const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b,
|
||||
const int m, const int n, const int k,
|
||||
const float alpha, const void *host_a, const int lda,
|
||||
const float *host_b, const int ldb, const float beta,
|
||||
float *host_c, const int ldc, const int btype) {
|
||||
cl_int err = 0;
|
||||
|
||||
cl_kernel * kernel = ggml_get_to_fp32_cl((ggml_type)btype);
|
||||
cl_kernel * kernel = ggml_v2_get_to_fp32_cl((ggml_v2_type)btype);
|
||||
size_t global = n * k, local, size_qb;
|
||||
bool dequant;
|
||||
|
||||
switch (btype) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_V2_TYPE_F32:
|
||||
dequant = false;
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_V2_TYPE_Q4_0:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V2_TYPE_Q4_1:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V2_TYPE_Q5_0:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V2_TYPE_Q5_1:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_v2_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V2_TYPE_Q8_0:
|
||||
dequant = true;
|
||||
local = 32;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
|
@ -947,12 +947,12 @@ void ggml_cl_sgemm_wrapper(
|
|||
const size_t size_c = m * n * sizeof(float);
|
||||
|
||||
// Prepare buffers
|
||||
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
ggml_v2_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
if (dequant) {
|
||||
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
ggml_v2_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
}
|
||||
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
ggml_v2_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_v2_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
|
||||
cl_event ev_a, ev_qb, ev_b;
|
||||
|
||||
|
|
|
@ -1463,7 +1463,7 @@ static void ggml_v2_vec_dot_q5_0_q8_0(const int n, float * restrict s, const voi
|
|||
static void ggml_v2_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
static void ggml_v2_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = {
|
||||
static const quantize_fns_t2 quantize_fns[GGML_V2_TYPE_COUNT] = {
|
||||
[GGML_V2_TYPE_Q4_0] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0,
|
||||
.quantize_row_q = quantize_row_q4_0,
|
||||
|
@ -1515,14 +1515,14 @@ static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = {
|
|||
};
|
||||
|
||||
// For internal test use
quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i) {
quantize_fns_t2 ggml_v2_internal_get_quantize_fn(size_t i) {
GGML_V2_ASSERT(i < GGML_V2_TYPE_COUNT);
return quantize_fns[i];
}

bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled
static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl
static inline quantize_fns_t get_quantize_fn(size_t i)
static const quantize_fns_t2 quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl
static inline quantize_fns_t2 get_quantize_fn(size_t i)
{
return(quants_unshuffled?quantize_fns[i]:quantize_fns_v2[i]);
}
@ -17186,7 +17186,7 @@ inline bool GetQuantsUnshuffled()
|
|||
}
|
||||
|
||||
//TODO: integrate backwards compat
|
||||
static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT] = {
|
||||
static const quantize_fns_t2 quantize_fns_v2[GGML_V2_TYPE_COUNT] = {
|
||||
[GGML_V2_TYPE_Q4_0] = {
|
||||
.dequantize_row_q = dequantize_row_q4_0_v2,
|
||||
.quantize_row_q = quantize_row_q4_0_v2,
|
||||
|
|
|
@ -1134,9 +1134,9 @@ extern "C" {
|
|||
quantize_row_q_t quantize_row_q_dot;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
enum ggml_v2_type vec_dot_type;
|
||||
} quantize_fns_t;
|
||||
} quantize_fns_t2;
|
||||
|
||||
quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i);
|
||||
quantize_fns_t2 ggml_v2_internal_get_quantize_fn(size_t i);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -15,12 +15,9 @@
|
|||
|
||||
#include "model_adapter.h"
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
#include "ggml_v2-opencl.h"
|
||||
#endif
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
|
@ -379,8 +376,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt2_eval(
|
||||
const gpt2_model & model,
|
||||
bool gpt2_v2_eval(
|
||||
const gpt2_v2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
|
|
696 otherarch/gpt2_v3.cpp Normal file
@ -0,0 +1,696 @@
|
|||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "model_adapter.h"
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int32_t n_vocab = 0;
|
||||
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||
|
||||
if (n_vocab != model.hparams.n_vocab) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
auto memory_type = GGML_TYPE_F16;
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
|
||||
ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
|
||||
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
||||
|
||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
||||
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_k
|
||||
ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_v
|
||||
|
||||
ctx_size += (6 + 12*n_layer)*512; // object overhead
|
||||
|
||||
printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
||||
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
||||
model.tensors["model/ln_f/b"] = model.ln_f_b;
|
||||
|
||||
model.tensors["model/wte"] = model.wte;
|
||||
model.tensors["model/wpe"] = model.wpe;
|
||||
model.tensors["model/lm_head"] = model.lm_head;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
size_t total_size = 0;
|
||||
|
||||
bool has_lm_head = false;
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ttype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
// GPT-2 models share the WTE tensor as the LM head
|
||||
if (name == "model/wte" && has_lm_head == false) {
|
||||
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
if (name == "model/lm_head") {
|
||||
has_lm_head = true;
|
||||
}
|
||||
|
||||
total_size += ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
|
||||
// //gpu offload for gpt2
|
||||
// #if defined(GGML_USE_CLBLAST)
|
||||
// if(gpulayers>0)
|
||||
// {
|
||||
// const auto & hparams = model.hparams;
|
||||
// const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
||||
// if(GetQuantsUnshuffled())
|
||||
// {
|
||||
// SetGPULayers(n_gpu);
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
|
||||
// size_t vram_total = 0;
|
||||
|
||||
// for (int i = 0; i < n_gpu; ++i) {
|
||||
// const auto & layer = model.layers[i];
|
||||
|
||||
// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
|
||||
// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
|
||||
// ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g);
|
||||
// ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
|
||||
// }
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// if(n_gpu>0)
|
||||
// {
|
||||
// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// #endif
|
||||
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt2_eval(
|
||||
const gpt2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token,
|
||||
FileFormat file_format) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
if (buf_size_new > buf_size)
|
||||
{
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
((int32_t *) position->data)[i] = n_past + i;
|
||||
}
|
||||
|
||||
// wte + wpe
|
||||
struct ggml_tensor * inpL =
|
||||
ggml_add(ctx0,
|
||||
ggml_get_rows(ctx0, model.wte, embd),
|
||||
ggml_get_rows(ctx0, model.wpe, position));
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
// cur = ln_1_g*cur + ln_1_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
// attn
|
||||
// [2304, 768] - model.layers[il].c_attn_attn_w
|
||||
// [2304, 1] - model.layers[il].c_attn_attn_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [2304, N] - cur (out)
|
||||
//
|
||||
// cur = attn_w*cur + attn_b
|
||||
// [2304, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_attn_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
||||
|
||||
// store key and value to memory
|
||||
if (N >= 1) {
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
// [64, n_past + N, 12]
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// GG: flash attention
|
||||
//struct ggml_tensor * V =
|
||||
// ggml_cpy(ctx0,
|
||||
// ggml_permute(ctx0,
|
||||
// ggml_reshape_3d(ctx0,
|
||||
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
// n_embd/n_head, n_head, n_past + N),
|
||||
// 1, 2, 0, 3),
|
||||
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
|
||||
|
||||
// K * Q
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
// [n_past + N, 64, 12]
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_cpy(ctx0,
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3),
|
||||
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
// [64, 12, N]
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
// [768, N]
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
}
|
||||
|
||||
// projection
|
||||
// [ 768, 768] - model.layers[il].c_attn_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_attn_proj_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// add the input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
// norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpFF);
|
||||
|
||||
// cur = ln_2_g*cur + ln_2_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
}
|
||||
|
||||
// fully connected
|
||||
// [3072, 768] - model.layers[il].c_mlp_fc_w
|
||||
// [3072, 1] - model.layers[il].c_mlp_fc_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [3072, N] - cur (out)
|
||||
//
|
||||
// cur = fc_w*cur + fc_b
|
||||
// [3072, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_fc_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
// [3072, N]
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_mlp_proj_b
|
||||
// [3072, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
// [ 768, N]
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// inpL = WTE * inpL
|
||||
// [ 768, 50257] - model.lm_head
|
||||
// [ 768, N] - inpL
|
||||
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result just for the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
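A minimal usage sketch for gpt2_eval, assuming a model, prompt tokens and batching parameters prepared elsewhere (all names below are hypothetical): one warm-up call populates mem_per_token so the scratch buffer can be sized, then the prompt is fed in batches.

std::vector<float> logits;
size_t mem_per_token = 0;

// warm-up pass on a few dummy tokens to measure per-token memory use
gpt2_eval(model, n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token, file_format);

int n_past = 0;
for (size_t i = 0; i < prompt.size(); i += n_batch) {
    std::vector<gpt_vocab::id> batch(prompt.begin() + i,
                                     prompt.begin() + std::min(prompt.size(), i + n_batch));
    if (!gpt2_eval(model, n_threads, n_past, batch, logits, mem_per_token, file_format)) {
        fprintf(stderr, "gpt2_eval failed\n");
        break;
    }
    n_past += (int) batch.size();  // logits now hold n_vocab values for the last token in the batch
}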
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1 & model, gpt_vocab & vocab, FileFormat file_format) {
|
||||
ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_v1_model & model, gpt_vocab & vocab, FileFormat file_format) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
bool super_old_format = (file_format==FileFormat::GPTJ_1);
|
||||
|
@ -363,7 +363,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
|
|||
// The GPT-J model requires about 16MB of memory per input token.
|
||||
//
|
||||
bool legacy_gptj_eval(
|
||||
const gptj_model_v1 & model,
|
||||
const gptj_v1_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
|
||||
ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & model, gpt_vocab & vocab, int gpulayers) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
|
@ -382,8 +382,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|||
//
|
||||
// The GPT-J model requires about 16MB of memory per input token.
|
||||
//
|
||||
bool gptj_eval(
|
||||
const gptj_model & model,
|
||||
bool gptj_v2_eval(
|
||||
const gptj_v2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
|
|
613 otherarch/gptj_v3.cpp Normal file
|
@ -0,0 +1,613 @@
|
|||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "model_adapter.h"
|
||||
|
||||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
||||
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int32_t n_vocab = 0;
|
||||
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||
|
||||
if (n_vocab != model.hparams.n_vocab) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
auto memory_type = GGML_TYPE_F16;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
|
||||
ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
||||
|
||||
ctx_size += (5 + 10*n_layer)*512; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
|
||||
|
||||
// map by name
|
||||
model.tensors["transformer.wte.weight"] = model.wte;
|
||||
|
||||
model.tensors["transformer.ln_f.weight"] = model.ln_f_g;
|
||||
model.tensors["transformer.ln_f.bias"] = model.ln_f_b;
|
||||
|
||||
model.tensors["lm_head.weight"] = model.lmh_g;
|
||||
model.tensors["lm_head.bias"] = model.lmh_b;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
int n_tensors = 0;
|
||||
size_t total_size = 0;
|
||||
|
||||
printf("%s: ", __func__);
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ttype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
|
||||
//test for transposition and retry older loader
|
||||
if(tensor->ne[0]==ne[1] && tensor->ne[1]==ne[0] && should_transpose_layer(name))
|
||||
{
|
||||
printf("\nFound a transposed tensor. This could be an older or newer model. Retrying load...");
|
||||
ggml_free(ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
total_size += ggml_nbytes(tensor);
|
||||
if (++n_tensors % 8 == 0) {
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
printf(" done\n");
|
||||
|
||||
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
// //gpu offload for gptj
|
||||
// #if defined(GGML_USE_CLBLAST)
|
||||
// if(gpulayers>0)
|
||||
// {
|
||||
// const auto & hparams = model.hparams;
|
||||
// const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
||||
// if(GetQuantsUnshuffled())
|
||||
// {
|
||||
// SetGPULayers(n_gpu);
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
|
||||
// size_t vram_total = 0;
|
||||
|
||||
// for (int i = 0; i < n_gpu; ++i) {
|
||||
// const auto & layer = model.layers[i];
|
||||
|
||||
// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
|
||||
// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
|
||||
// }
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// if(n_gpu>0)
|
||||
// {
|
||||
// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// #endif
|
||||
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
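A sketch of how a caller is expected to react to RETRY_LOAD from the loader above; the real dispatch lives in the adapter code, so the variable names here are only illustrative:

gpt_vocab vocab;
gptj_model    model_v3;  // current-format weights
gptj_v2_model model_v2;  // previous-format weights (transposed tensor layout)

ModelLoadResult res = gptj_model_load(fname, model_v3, vocab, gpulayers);
if (res == ModelLoadResult::RETRY_LOAD) {
    // a transposed tensor was detected, so fall back to the older loader
    res = gptj_v2_model_load(fname, model_v2, vocab, gpulayers);
}
if (res != ModelLoadResult::SUCCESS) {
    fprintf(stderr, "failed to load model from %s\n", fname.c_str());
}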
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
// The GPT-J model requires about 16MB of memory per input token.
|
||||
//
|
||||
bool gptj_eval(
|
||||
const gptj_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_rot = hparams.n_rot;
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
if (buf_size_new > buf_size)
|
||||
{
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
// wte
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
// cur = ln_1_g*cur + ln_1_b
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
struct ggml_tensor * inpSA = cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
|
||||
( n_ctx)*ggml_element_size(model.memory_v),
|
||||
(il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
Qcur,
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, model.memory_v,
|
||||
n_past + N, n_embd/n_head, n_head,
|
||||
n_ctx*ggml_element_size(model.memory_v),
|
||||
n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
|
||||
il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
|
||||
// projection (no bias)
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_proj_w,
|
||||
cur);
|
||||
}
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// feed-forward network
|
||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||
{
|
||||
// note here we pass inpSA instead of cur
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_fc_w,
|
||||
inpSA);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// cur = proj_w*cur + proj_b
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// self-attention + FF
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpL);
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// lm_head
|
||||
{
|
||||
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
||||
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.lmh_b, inpL),
|
||||
inpL);
|
||||
}
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result for just the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
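Because only the last token's logits are returned, greedy sampling reduces to an argmax over embd_w; a sketch (real sampling elsewhere typically also applies temperature and top-k, omitted here):

#include <algorithm>

// pick the highest-scoring token id from the logits returned by gptj_eval
static gpt_vocab::id sample_greedy(const std::vector<float> & logits) {
    return (gpt_vocab::id) std::distance(logits.begin(),
                                         std::max_element(logits.begin(), logits.end()));
}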
|
464 otherarch/llama_v2-util.h Normal file
|
@ -0,0 +1,464 @@
|
|||
// Internal header to be included only by llama_v2.cpp.
|
||||
// Contains wrappers around OS interfaces.
|
||||
#pragma once
|
||||
#ifndef LLAMA_V2_UTIL_H
|
||||
#define LLAMA_V2_UTIL_H
|
||||
|
||||
#include "llama-util.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h> // for _fseeki64
|
||||
#endif
|
||||
|
||||
#define LLAMA_V2_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "LLAMA_V2_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
struct llama_v2_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_v2_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
LLAMA_V2_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
LLAMA_V2_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_v2_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
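A small usage sketch for llama_v2_file (the constructor throws on failure; the path and the follow-up are hypothetical):

llama_v2_file file("models/llama-7b-q4_0.bin", "rb");  // hypothetical path
const uint32_t magic = file.read_u32();                // first 4 bytes identify the file format
printf("opened %zu-byte file, magic = %08x\n", file.size, magic);
file.seek(0, SEEK_SET);                                // rewind before handing off to the loader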
|
||||
|
||||
#if defined(_WIN32)
|
||||
static std::string llama_v2_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
if (!size) {
|
||||
return "FormatMessageA failed";
|
||||
}
|
||||
std::string ret(buf, size);
|
||||
LocalFree(buf);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct llama_v2_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
llama_v2_mmap(const llama_v2_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
#ifdef __linux__
|
||||
flags |= MAP_POPULATE;
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~llama_v2_mmap() {
|
||||
munmap(addr, size);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_v2_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_v2_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
}
|
||||
|
||||
~llama_v2_mmap() {
|
||||
if (!UnmapViewOfFile(addr)) {
|
||||
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
llama_v2_mmap(struct llama_v2_file *, bool prefetch = true) {
|
||||
(void)prefetch;
|
||||
throw std::runtime_error(std::string("mmap not supported"));
|
||||
}
|
||||
#endif
|
||||
};
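Usage mirrors upstream llama.cpp: check SUPPORTED, map the already-opened file, and read tensor bytes straight from addr (file_off below is a hypothetical tensor offset):

if (llama_v2_mmap::SUPPORTED) {
    llama_v2_mmap mapping(&file, /*prefetch=*/true);
    const uint8_t * tensor_data = (const uint8_t *) mapping.addr + file_off;
    // ... consume tensor_data; the mapping is released when `mapping` goes out of scope
}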
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_v2_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
|
||||
llama_v2_mlock() {}
|
||||
llama_v2_mlock(const llama_v2_mlock &) = delete;
|
||||
|
||||
~llama_v2_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * addr) {
|
||||
LLAMA_V2_ASSERT(this->addr == NULL && this->size == 0);
|
||||
this->addr = addr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
LLAMA_V2_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
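// rounds target_size up to the next multiple of granularity (valid because page
// sizes, and the 64 KiB fallback below, are powers of two)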
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
} else {
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
||||
suggest = false;
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
||||
suggest = false;
|
||||
|
||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * addr, size_t size) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(addr, size)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
size, this->size, llama_v2_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = size + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (!VirtualUnlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t size) {}
|
||||
#endif
|
||||
};
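In the loaders this is typically paired with a buffer: init once with the base address, then grow_to as more of the buffer is filled (sketch using the llama_v2_buffer defined just below):

llama_v2_buffer buf;
buf.resize(64 * 1024 * 1024);

llama_v2_mlock lock;
lock.init(buf.addr);     // remember the base address once
lock.grow_to(buf.size);  // lock the populated region; a failure only prints a warning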
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct llama_v2_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
llama_v2_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] addr;
|
||||
addr = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~llama_v2_buffer() {
|
||||
delete[] addr;
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_v2_buffer(const llama_v2_buffer&) = delete;
|
||||
llama_v2_buffer(llama_v2_buffer&&) = delete;
|
||||
llama_v2_buffer& operator=(const llama_v2_buffer&) = delete;
|
||||
llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
struct llama_v2_ctx_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
bool is_cuda;
|
||||
size_t size = 0;
|
||||
|
||||
llama_v2_ctx_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
free();
|
||||
|
||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||
if (addr) {
|
||||
is_cuda = true;
|
||||
}
|
||||
else {
|
||||
// fall back to pageable memory
|
||||
addr = new uint8_t[size];
|
||||
is_cuda = false;
|
||||
}
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (addr) {
|
||||
if (is_cuda) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
else {
|
||||
delete[] addr;
|
||||
}
|
||||
}
|
||||
addr = NULL;
|
||||
}
|
||||
|
||||
~llama_v2_ctx_buffer() {
|
||||
free();
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_v2_ctx_buffer(const llama_v2_ctx_buffer&) = delete;
|
||||
llama_v2_ctx_buffer(llama_v2_ctx_buffer&&) = delete;
|
||||
llama_v2_ctx_buffer& operator=(const llama_v2_ctx_buffer&) = delete;
|
||||
llama_v2_ctx_buffer& operator=(llama_v2_ctx_buffer&&) = delete;
|
||||
};
|
||||
#else
|
||||
typedef llama_v2_buffer llama_v2_ctx_buffer;
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -5,7 +5,7 @@
|
|||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include "llama-util.h"
|
||||
#include "llama_v2-util.h"
|
||||
#include "llama_v2.h"
|
||||
|
||||
#include "ggml_v2.h"
|
||||
|
@ -39,67 +39,67 @@
|
|||
#define LLAMA_V2_MAX_SCRATCH_BUFFERS 16
|
||||
|
||||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_7B,
|
||||
MODEL_13B,
|
||||
MODEL_30B,
|
||||
MODEL_65B,
|
||||
enum e_model2 {
|
||||
MODEL_UNKNOWN_2,
|
||||
MODEL_7B_2,
|
||||
MODEL_13B_2,
|
||||
MODEL_30B_2,
|
||||
MODEL_65B_2,
|
||||
};
|
||||
|
||||
static const size_t MB = 1024*1024;
|
||||
static const size_t MB_2 = 1024*1024;
|
||||
|
||||
// computed for n_ctx == 2048
|
||||
// TODO: dynamically determine these sizes
|
||||
// needs modifications in ggml
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH0_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 512ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
||||
{ MODEL_7B_2, 512ull * MB_2 },
|
||||
{ MODEL_13B_2, 512ull * MB_2 },
|
||||
{ MODEL_30B_2, 512ull * MB_2 },
|
||||
{ MODEL_65B_2, 1024ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH1_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 512ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
||||
{ MODEL_7B_2, 512ull * MB_2 },
|
||||
{ MODEL_13B_2, 512ull * MB_2 },
|
||||
{ MODEL_30B_2, 512ull * MB_2 },
|
||||
{ MODEL_65B_2, 1024ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
||||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_KV_SELF_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 1026ull * MB },
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
{ MODEL_65B, 5120ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 1026ull * MB_2 },
|
||||
{ MODEL_7B_2, 1026ull * MB_2 },
|
||||
{ MODEL_13B_2, 1608ull * MB_2 },
|
||||
{ MODEL_30B_2, 3124ull * MB_2 },
|
||||
{ MODEL_65B_2, 5120ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_EVAL_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 800ull * MB },
|
||||
{ MODEL_7B, 800ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 800ull * MB_2 },
|
||||
{ MODEL_7B_2, 800ull * MB_2 },
|
||||
{ MODEL_13B_2, 1024ull * MB_2 },
|
||||
{ MODEL_30B_2, 1280ull * MB_2 },
|
||||
{ MODEL_65B_2, 1536ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
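Further down in this file the renamed tables are combined into the overall memory estimate; roughly as follows (ctx_size, mmapped_size and scale are computed by the loader and hypothetical here):

const e_model2 type = MODEL_7B_2;
const size_t mem_required =
    ctx_size + mmapped_size +
    MEM_REQ_SCRATCH0_2().at(type) +
    MEM_REQ_SCRATCH1_2().at(type) +
    MEM_REQ_EVAL_2().at(type);
const size_t mem_required_state = scale * MEM_REQ_KV_SELF_2().at(type);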
|
||||
|
@ -157,7 +157,7 @@ struct llama_v2_kv_cache {
|
|||
};
|
||||
|
||||
struct llama_v2_model {
|
||||
e_model type = MODEL_UNKNOWN;
|
||||
e_model2 type = MODEL_UNKNOWN_2;
|
||||
|
||||
llama_v2_hparams hparams;
|
||||
|
||||
|
@ -276,7 +276,7 @@ struct llama_v2_context {
|
|||
};
|
||||
|
||||
template <typename T>
|
||||
static T checked_mul(T a, T b) {
|
||||
static T checked_mul2(T a, T b) {
|
||||
T ret = a * b;
|
||||
if (a != 0 && ret / a != b) {
|
||||
throw format("overflow multiplying %llu * %llu",
|
||||
|
@ -285,7 +285,7 @@ static T checked_mul(T a, T b) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
static size_t checked_div(size_t a, size_t b) {
|
||||
static size_t checked_div2(size_t a, size_t b) {
|
||||
if (b == 0 || a % b != 0) {
|
||||
throw format("error dividing %zu / %zu", a, b);
|
||||
}
|
||||
|
@ -304,7 +304,7 @@ static std::string llama_v2_format_tensor_shape(const std::vector<uint32_t> & ne
|
|||
static size_t llama_v2_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_v2_type type) {
|
||||
size_t size = ggml_v2_type_size(type);
|
||||
for (uint32_t dim : ne) {
|
||||
size = checked_mul<size_t>(size, dim);
|
||||
size = checked_mul2<size_t>(size, dim);
|
||||
}
|
||||
return size / ggml_v2_blck_size(type);
|
||||
}
|
||||
|
@ -322,9 +322,9 @@ struct llama_v2_load_tensor_shard {
|
|||
};
|
||||
|
||||
enum llama_v2_split_type {
|
||||
SPLIT_NONE,
|
||||
SPLIT_BY_COLUMNS,
|
||||
SPLIT_BY_ROWS
|
||||
SPLIT_NONE_2,
|
||||
SPLIT_BY_COLUMNS_2,
|
||||
SPLIT_BY_ROWS_2
|
||||
};
|
||||
|
||||
struct llama_v2_load_tensor {
|
||||
|
@ -332,7 +332,7 @@ struct llama_v2_load_tensor {
|
|||
|
||||
std::string name;
|
||||
enum ggml_v2_type type = GGML_V2_TYPE_F32;
|
||||
llama_v2_split_type split_type = SPLIT_NONE;
|
||||
llama_v2_split_type split_type = SPLIT_NONE_2;
|
||||
std::vector<uint32_t> ne;
|
||||
size_t size;
|
||||
struct ggml_v2_tensor * ggml_v2_tensor = NULL;
|
||||
|
@ -360,13 +360,13 @@ struct llama_v2_load_tensor {
|
|||
void calc_split_type() {
|
||||
if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
|
||||
shards.size() == 1) { // only one file?
|
||||
split_type = SPLIT_NONE;
|
||||
split_type = SPLIT_NONE_2;
|
||||
} else if (name.find("tok_embeddings.") == 0 ||
|
||||
name.find(".attention.wo.weight") != std::string::npos ||
|
||||
name.find(".feed_forward.w2.weight") != std::string::npos) {
|
||||
split_type = SPLIT_BY_COLUMNS;
|
||||
split_type = SPLIT_BY_COLUMNS_2;
|
||||
} else {
|
||||
split_type = SPLIT_BY_ROWS;
|
||||
split_type = SPLIT_BY_ROWS_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -382,16 +382,16 @@ struct llama_v2_load_tensor {
|
|||
LLAMA_V2_ASSERT(shards.size() <= UINT32_MAX);
|
||||
uint32_t n_shards = (uint32_t) shards.size();
|
||||
switch (split_type) {
|
||||
case SPLIT_NONE:
|
||||
case SPLIT_NONE_2:
|
||||
ne = first_shard.ne;
|
||||
break;
|
||||
case SPLIT_BY_COLUMNS:
|
||||
ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
|
||||
case SPLIT_BY_COLUMNS_2:
|
||||
ne = {checked_mul2<uint32_t>(first_shard.ne[0], n_shards),
|
||||
first_shard.ne[1]};
|
||||
break;
|
||||
case SPLIT_BY_ROWS:
|
||||
case SPLIT_BY_ROWS_2:
|
||||
ne = {first_shard.ne[0],
|
||||
checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
|
||||
checked_mul2<uint32_t>(first_shard.ne[1], n_shards)};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -737,11 +737,11 @@ struct llama_v2_model_loader {
|
|||
if (use_mmap) {
|
||||
LLAMA_V2_ASSERT(lt.shards.size() == 1);
|
||||
lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
|
||||
} else if (lt.split_type == SPLIT_NONE) {
|
||||
} else if (lt.split_type == SPLIT_NONE_2) {
|
||||
llama_v2_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
|
||||
file.seek(lt.shards.at(0).file_off, SEEK_SET);
|
||||
file.read_raw(lt.data, lt.size);
|
||||
} else if (lt.split_type == SPLIT_BY_ROWS) {
|
||||
} else if (lt.split_type == SPLIT_BY_ROWS_2) {
|
||||
size_t offset = 0;
|
||||
for (llama_v2_load_tensor_shard & shard : lt.shards) {
|
||||
llama_v2_file & file = file_loaders.at(shard.file_idx)->file;
|
||||
|
@ -750,7 +750,7 @@ struct llama_v2_model_loader {
|
|||
offset += shard.size;
|
||||
}
|
||||
LLAMA_V2_ASSERT(offset == lt.size);
|
||||
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
||||
} else if (lt.split_type == SPLIT_BY_COLUMNS_2) {
|
||||
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
||||
std::vector<llama_v2_buffer> tmp_bufs(lt.shards.size());
|
||||
for (size_t i = 0; i < lt.shards.size(); i++) {
|
||||
|
@ -807,7 +807,7 @@ static bool kv_cache_init(
|
|||
const int64_t n_mem = n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB);
|
||||
cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB_2);
|
||||
|
||||
struct ggml_v2_init_params params;
|
||||
params.mem_size = cache.buf.size;
|
||||
|
@ -888,12 +888,12 @@ static const char *llama_v2_ftype_name(enum llama_v2_ftype ftype) {
|
|||
}
|
||||
}
|
||||
|
||||
static const char *llama_v2_model_type_name(e_model type) {
|
||||
static const char *llama_v2_model_type_name(e_model2 type) {
|
||||
switch (type) {
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_7B_2: return "7B";
|
||||
case MODEL_13B_2: return "13B";
|
||||
case MODEL_30B_2: return "30B";
|
||||
case MODEL_65B_2: return "65B";
|
||||
default:
|
||||
printf("\nWARNING: NON-STANDARD LLAMA FILE DETECTED. DEFAULT TO 7B SIZE.\n");
|
||||
return "UNKNOWN";
|
||||
|
@ -925,11 +925,11 @@ static void llama_v2_model_load_internal(

{
switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default: model.type = e_model::MODEL_UNKNOWN; break;
case 32: model.type = e_model2::MODEL_7B_2; break;
case 40: model.type = e_model2::MODEL_13B_2; break;
case 60: model.type = e_model2::MODEL_30B_2; break;
case 80: model.type = e_model2::MODEL_65B_2; break;
default: model.type = e_model2::MODEL_UNKNOWN_2; break;
}

hparams.n_ctx = n_ctx;

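Reviewer note: the switch above infers the model preset purely from the layer count; e_model2 and the _2 enumerators are the renamed v2 types. A compact sketch of the same heuristic with an illustrative enum (not the project's):

#include <cstdio>

// The loader picks a size preset purely from the layer count; anything else
// falls through to UNKNOWN (and, per the hunk above, is later reported as 7B-sized).
enum class ModelSize { MODEL_7B, MODEL_13B, MODEL_30B, MODEL_65B, MODEL_UNKNOWN };

static ModelSize size_from_layers(int n_layer) {
    switch (n_layer) {
        case 32: return ModelSize::MODEL_7B;
        case 40: return ModelSize::MODEL_13B;
        case 60: return ModelSize::MODEL_30B;
        case 80: return ModelSize::MODEL_65B;
        default: return ModelSize::MODEL_UNKNOWN;
    }
}

int main() {
    printf("%d\n", (int) size_from_layers(40)); // 1 -> MODEL_13B
    return 0;
}
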
@ -985,13 +985,13 @@ static void llama_v2_model_load_internal(
const size_t mem_required =
ctx_size +
mmapped_size +
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);
MEM_REQ_SCRATCH0_2().at(model.type) +
MEM_REQ_SCRATCH1_2().at(model.type) +
MEM_REQ_EVAL_2().at(model.type);

// this is the memory required by one llama_v2_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type);
scale*MEM_REQ_KV_SELF_2().at(model.type);

fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

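Reviewer note: the estimate above is a plain sum of the ggml context size, the mmapped weights, and the per-model scratch/eval buffers; the _2 lookups are the renamed tables keyed by the v2 model type. A sketch of that structure with placeholder figures (the real MEM_REQ_* values live in llama_v2.cpp and differ per model size):

#include <cstdio>
#include <map>
#include <string>

int main() {
    // Illustrative only: placeholder buffer sizes keyed by a model-size label,
    // standing in for the MEM_REQ_SCRATCH0_2/SCRATCH1_2/EVAL_2 tables.
    std::map<std::string, size_t> scratch0 = { {"7B", 512u*1024*1024} };
    std::map<std::string, size_t> scratch1 = { {"7B", 512u*1024*1024} };
    std::map<std::string, size_t> eval_buf = { {"7B", 768u*1024*1024} };

    size_t ctx_size = 64u*1024*1024;         // placeholder metadata size
    size_t mmapped  = 3900ull*1024*1024;     // placeholder weight size
    size_t mem_required = ctx_size + mmapped
                        + scratch0["7B"] + scratch1["7B"] + eval_buf["7B"];
    printf("mem required = %.2f MB\n", mem_required / 1024.0 / 1024.0);
    return 0;
}
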
@ -1462,7 +1462,7 @@ static bool llama_v2_eval_internal(
// tokenizer
//

static size_t utf8_len(char src) {
static size_t utf8_len2(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];

@ -1503,7 +1503,7 @@ struct llama_v2_tokenizer {
size_t offs = 0;
while (offs < text.size()) {
llama_v2_sp_symbol sym;
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
size_t char_len = std::min(text.size() - offs, utf8_len2(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;

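Reviewer note: utf8_len2 (the renamed utf8_len) reads the high nibble of the lead byte to get the UTF-8 sequence length, and the tokenizer loop above clamps it to the remaining input. A standalone sketch of the same table plus the scan loop it feeds:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Same idea as utf8_len2 above: the high nibble of a UTF-8 lead byte determines
// how many bytes the code point occupies (0xxxxxxx = 1, 110x = 2, 1110 = 3,
// 11110 = 4; continuation bytes 10xx also map to 1 so the scan always advances).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    std::string text = "héllo"; // 'é' is 2 bytes in UTF-8
    for (size_t offs = 0; offs < text.size();) {
        size_t len = std::min(text.size() - offs, utf8_len(text[offs]));
        printf("symbol of %zu byte(s)\n", len);
        offs += len;
    }
    return 0;
}
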
@ -2251,10 +2251,10 @@ struct llama_v2_context * llama_v2_init_from_file(
ctx->embedding.resize(hparams.n_embd);
}

ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
ctx->buf_compute.resize(MEM_REQ_EVAL_2().at(ctx->model.type));

ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0_2().at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1_2().at(ctx->model.type));
}

return ctx;

@ -3077,5 +3077,15 @@ std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * c
int n = legacy_llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
res.resize(n);

return res;
}

std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt number of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int) add_bos);
const int n = llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);

return res;
}

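Reviewer note: the new llama_v2_tokenize(std::string) overload follows the usual pattern of over-allocating one token per prompt character (plus BOS) and shrinking afterwards. A self-contained sketch of that pattern with a toy tokenizer standing in for the C-style API (the real call needs a loaded llama_v2_context):

#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for the C-style tokenizer wrapped above: writes at most n_max ids
// into out and returns how many were written. Here it just maps one character
// to one id so the example is self-contained.
static int toy_tokenize(const std::string & text, int * out, int n_max, bool add_bos) {
    int n = 0;
    if (add_bos && n < n_max) out[n++] = 1; // BOS id
    for (char c : text) { if (n >= n_max) break; out[n++] = (unsigned char) c; }
    return n;
}

// Same pattern as the new std::string wrapper: allocate the worst case
// (one token per prompt character, plus BOS), then shrink to fit.
static std::vector<int> tokenize(const std::string & text, bool add_bos) {
    std::vector<int> res(text.size() + (int) add_bos);
    const int n = toy_tokenize(text, res.data(), (int) res.size(), add_bos);
    assert(n >= 0);
    res.resize(n);
    return res;
}

int main() {
    auto ids = tokenize("hello", true);
    printf("%zu tokens\n", ids.size()); // 6
    return 0;
}
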
@ -16,7 +16,7 @@


// load the model's weights from a file
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
ModelLoadResult gpt_neox_v2_model_load(const std::string & fname, gpt_neox_v2_model & model, gpt_vocab & vocab, FileFormat file_format) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);

@ -344,7 +344,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

// feed-forward network
ggml_v2_tensor * gpt_neox_ff(
const gpt_neox_layer &layer,
const gpt_neox_layer_v2 &layer,
ggml_v2_context * ctx0,
ggml_v2_tensor * inp) {
ggml_v2_tensor * cur = ggml_v2_norm(ctx0, inp);

@ -386,8 +386,8 @@ ggml_v2_tensor * gpt_neox_ff(
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool gpt_neox_eval(
const gpt_neox_model & model,
bool gpt_neox_v2_eval(
const gpt_neox_v2_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,

613
otherarch/neox_v3.cpp
Normal file
|
@ -0,0 +1,613 @@
|
|||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
hparams.par_res = 1; //true
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
||||
if(file_format!=FileFormat::NEOX_1 && file_format!=FileFormat::NEOX_2 && file_format!=FileFormat::NEOX_3)
|
||||
{
|
||||
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
|
||||
}
|
||||
if(file_format==FileFormat::NEOX_3)
|
||||
{
|
||||
hparams.par_res = 0;
|
||||
}
|
||||
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||
printf("%s: par_res = %d\n", __func__, hparams.par_res);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
const int32_t n_vocab = model.hparams.n_vocab;
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
|
||||
//ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
||||
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
||||
|
||||
ctx_size += (6 + 16*n_layer)*512; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
//model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
|
||||
|
||||
// map by name
|
||||
model.tensors["gpt_neox.embed_in.weight"] = model.wte;
|
||||
|
||||
model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
|
||||
model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b;
|
||||
|
||||
model.tensors["embed_out.weight"] = model.lmh_g;
|
||||
//model.tensors["lm_head.bias"] = model.lmh_b;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int64_t n_mem = n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
int n_tensors = 0;
|
||||
size_t total_size = 0;
|
||||
|
||||
printf("%s: ", __func__);
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ttype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
|
||||
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if(file_format==FileFormat::NEOX_1)
|
||||
{
|
||||
switch (ttype) {
|
||||
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
|
||||
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
|
||||
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
|
||||
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
ggml_free(ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
total_size += ggml_nbytes(tensor);
|
||||
if (++n_tensors % 8 == 0) {
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
printf(" done\n");
|
||||
|
||||
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// feed-forward network
|
||||
ggml_tensor * gpt_neox_ff(
|
||||
const gpt_neox_layer &layer,
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * inp) {
|
||||
ggml_tensor * cur = ggml_norm(ctx0, inp);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, layer.ln_2_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, layer.ln_2_b, cur));
|
||||
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
layer.c_mlp_fc_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// cur = proj_w*cur + proj_b
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
layer.c_mlp_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
|
||||
cur);
|
||||
return cur;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt_neox_eval(
|
||||
const gpt_neox_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_rot = hparams.n_rot;
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 360u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr) {
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
// wte
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
// compute QKV
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_attn_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
|
||||
|
||||
// using mode = 2 for GPT-NeoX mode
|
||||
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2);
|
||||
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
|
||||
( n_ctx)*ggml_element_size(model.memory_v),
|
||||
(il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
Qcur,
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, model.memory_v,
|
||||
n_past + N, n_embd/n_head, n_head,
|
||||
n_ctx*ggml_element_size(model.memory_v),
|
||||
n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
|
||||
il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
|
||||
// projection
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
|
||||
}
|
||||
}
|
||||
|
||||
if (hparams.par_res == 0) {
|
||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
cur = gpt_neox_ff(model.layers[il], ctx0, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
} else {
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||
// note here we pass inpL instead of cur
|
||||
cur = gpt_neox_ff(model.layers[il], ctx0, inpL);
|
||||
|
||||
// layer input + FF
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpL);
|
||||
}
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// lm_head
|
||||
{
|
||||
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
||||
|
||||
//inpL = ggml_add(ctx0,
|
||||
// ggml_repeat(ctx0, model.lmh_b, inpL),
|
||||
// inpL);
|
||||
}
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result for just the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
|
|
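Reviewer note on the new otherarch/neox_v3.cpp above: the hparams block keys its parsing off the detected FileFormat; NEOX_1/2/3 headers carry no par_res field, and NEOX_3 additionally forces the non-parallel residual path used later in gpt_neox_eval. A compact sketch of that gating (the enum values and Header struct below are illustrative, not the project's FileFormat):

#include <cstdio>

// Illustrative file-format gating, mirroring the hparams block in
// gpt_neox_model_load: older NEOX formats have no par_res field in the
// header, and NEOX_3 means "non-parallel residual" regardless.
enum class NeoXFormat { NEOX_1, NEOX_2, NEOX_3, NEOX_NEWER };

struct Header { int par_res_field; bool has_par_res_field; };

static int resolve_par_res(NeoXFormat fmt, const Header & h) {
    int par_res = 1; // default: parallel residual (true)
    if (fmt != NeoXFormat::NEOX_1 && fmt != NeoXFormat::NEOX_2 && fmt != NeoXFormat::NEOX_3) {
        if (h.has_par_res_field) par_res = h.par_res_field; // newer formats store it
    }
    if (fmt == NeoXFormat::NEOX_3) par_res = 0;
    return par_res;
}

int main() {
    Header h{0, true};
    printf("NEOX_3 -> par_res=%d\n", resolve_par_res(NeoXFormat::NEOX_3, h));     // 0 (forced)
    printf("newer  -> par_res=%d\n", resolve_par_res(NeoXFormat::NEOX_NEWER, h)); // 0 (from header)
    return 0;
}
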
@ -87,7 +87,7 @@ struct gptj_layer_v1 {
struct ggml_v1_tensor * c_mlp_proj_b;
};

struct gptj_model_v1 {
struct gptj_v1_model {
gptj_hparams hparams;

// normalization

@ -110,7 +110,7 @@ struct gptj_model_v1 {
std::map<std::string, struct ggml_v1_tensor *> tensors;
};

struct gptj_model_v2 {
struct gptj_v2_model {
gptj_hparams hparams;

// normalization

@ -122,7 +122,7 @@ struct gptj_model_v2 {
struct ggml_v2_tensor * lmh_g; // language model head
struct ggml_v2_tensor * lmh_b; // language model bias

std::vector<gptj_layer> layers;
std::vector<gptj_layer_v2> layers;

// key + value memory
struct ggml_v2_tensor * memory_k;