just testing cublas

Concedo 2023-05-15 20:01:22 +08:00
parent fce2e7e518
commit 6504150fac
9 changed files with 53 additions and 59 deletions

View file

@@ -17,7 +17,7 @@ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(LLAMA_STANDALONE ON)
set(BUILD_SHARED_LIBS_DEFAULT ON)
set(LLAMA_STATIC ON)
set(LLAMA_STATIC OFF)
set(LLAMA_NATIVE OFF)
set(LLAMA_LTO OFF)
set(LLAMA_ALL_WARNINGS OFF)
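Turning LLAMA_STATIC off is consistent with a cuBLAS test build: the CUDA runtime and cuBLAS ship as shared libraries, so forcing fully static linking tends to fail or add nothing for a GPU-enabled binary. For reference, upstream llama.cpp exposes the GPU path through an option such as LLAMA_CUBLAS (e.g. `cmake -DLLAMA_CUBLAS=ON ..`); whether this fork uses that exact flag is an assumption here.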

View file

@@ -10053,10 +10053,10 @@ enum ggml_v1_opt_result ggml_v1_opt(
struct ggml_v1_tensor * f) {
bool free_ctx = false;
if (ctx == NULL) {
struct ggml_v1_init_params params_ctx = {
.mem_size = 16*1024*1024,
.mem_buffer = NULL,
};
struct ggml_v1_init_params params_ctx;
params_ctx.mem_size = 16*1024*1024;
params_ctx.mem_buffer = NULL;
ctx = ggml_v1_init(params_ctx);
if (ctx == NULL) {
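The edit above, repeated across the model files below, trades a C99 designated initializer for member-by-member assignment. Both produce the same initialized struct, but the assignment form also compiles as pre-C++20 C++ (e.g. under MSVC/NVCC for a cuBLAS build), which is presumably the point of this test. A minimal sketch of the two equivalent forms, assuming only the two fields shown here:

    /* C99 designated-initializer form (the removed code); unnamed members are zero-filled. */
    struct ggml_v1_init_params a = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
    };

    /* Assignment form (the added code); valid in any C or C++ dialect, but plain
       assignment does not zero-fill the rest, so every field must be set explicitly. */
    struct ggml_v1_init_params b;
    b.mem_size   = 16*1024*1024;
    b.mem_buffer = NULL;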

View file

@@ -12,7 +12,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
@@ -137,10 +136,10 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
// create the ggml context
{
struct ggml_v1_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
};
struct ggml_v1_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
model.ctx = ggml_v1_init(params);
if (!model.ctx) {
@@ -352,10 +351,10 @@ bool legacy_gpt2_eval(
}
}
struct ggml_v1_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
};
struct ggml_v1_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_v1_context * ctx0 = ggml_v1_init(params);
struct ggml_v1_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
#include "model_adapter.h"
@@ -143,11 +142,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
model.ctx = ggml_init(params);
if (!model.ctx) {
@@ -370,11 +369,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };
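The newer ggml_init_params carries a third field, no_alloc, which the rewritten blocks set explicitly. In ggml, no_alloc = true makes the context track tensor metadata without reserving data memory in its buffer, which is useful for measuring how large a buffer a graph needs. A small sketch, assuming the upstream ggml API:

    /* Sketch only: create a context that skips tensor data allocation. */
    struct ggml_init_params probe;
    probe.mem_size   = 16*1024*1024;  /* room for bookkeeping/metadata   */
    probe.mem_buffer = NULL;          /* let ggml_init allocate it       */
    probe.no_alloc   = true;          /* metadata only, no tensor data   */
    struct ggml_context * ctx_probe = ggml_init(probe);
    /* ... build tensors/graph, then query ggml_used_mem(ctx_probe) ... */
    ggml_free(ctx_probe);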

View file

@@ -12,7 +12,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
@@ -148,10 +147,10 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
// create the ggml context
{
struct ggml_v1_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
};
struct ggml_v1_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
model.ctx = ggml_v1_init(params);
if (!model.ctx) {
@@ -402,10 +401,10 @@ bool legacy_gptj_eval(
}
}
struct ggml_v1_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
};
struct ggml_v1_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_v1_context * ctx0 = ggml_v1_init(params);
struct ggml_v1_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
#include "model_adapter.h"
@@ -143,11 +142,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
model.ctx = ggml_init(params);
if (!model.ctx) {
@@ -382,11 +381,11 @@ bool gptj_eval(
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
@@ -135,11 +134,10 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
model.ctx = ggml_init(params);
if (!model.ctx) {
@@ -377,11 +375,11 @@ bool stablelm_eval(
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };

View file

@@ -10,7 +10,7 @@ import torch
import numpy as np
import re
from transformers import GPTJForCausalLM, AutoModelForCausalLM
from transformers import AutoModelForCausalLM
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():

View file

@@ -1,7 +1,6 @@
import sys
import struct
import json
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -59,6 +58,7 @@ fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
fout.write(struct.pack("i", hparams["use_parallel_residual"]))
fout.write(struct.pack("i", ftype))
# TODO: temporary hack to not deal with implementing the tokenizer
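The rotary-dimension field written above is rotary_pct times the per-head width. For illustration only, a hypothetical GPT-NeoX-style checkpoint with hidden_size = 2560, num_attention_heads = 32 and rotary_pct = 0.25 would store int(0.25 * (2560 // 32)) = 20, i.e. rotary position embeddings applied to the first 20 of each head's 80 dimensions. use_parallel_residual, a boolean in the Hugging Face config, is likewise packed as a 4-byte int.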