diff --git a/CMakeLists.txt b/CMakeLists.txt
index e4bd57d77..8aa36e4c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,7 +17,7 @@ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(LLAMA_STANDALONE ON)
 set(BUILD_SHARED_LIBS_DEFAULT ON)
-set(LLAMA_STATIC ON)
+set(LLAMA_STATIC OFF)
 set(LLAMA_NATIVE OFF)
 set(LLAMA_LTO OFF)
 set(LLAMA_ALL_WARNINGS OFF)
diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c
index 5720788d6..d6af52c82 100644
--- a/otherarch/ggml_v1.c
+++ b/otherarch/ggml_v1.c
@@ -10053,10 +10053,10 @@ enum ggml_v1_opt_result ggml_v1_opt(
         struct ggml_v1_tensor * f) {
     bool free_ctx = false;
     if (ctx == NULL) {
-        struct ggml_v1_init_params params_ctx = {
-            .mem_size = 16*1024*1024,
-            .mem_buffer = NULL,
-        };
+        struct ggml_v1_init_params params_ctx;
+        params_ctx.mem_size = 16*1024*1024;
+        params_ctx.mem_buffer = NULL;
+
         ctx = ggml_v1_init(params_ctx);
 
         if (ctx == NULL) {
diff --git a/otherarch/gpt2_v1.cpp b/otherarch/gpt2_v1.cpp
index e60084b34..b70f814e9 100644
--- a/otherarch/gpt2_v1.cpp
+++ b/otherarch/gpt2_v1.cpp
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include
@@ -137,10 +136,10 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
     // create the ggml context
     {
-        struct ggml_v1_init_params params = {
-            .mem_size = ctx_size,
-            .mem_buffer = NULL,
-        };
+        struct ggml_v1_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+
         model.ctx = ggml_v1_init(params);
 
         if (!model.ctx) {
@@ -352,10 +351,10 @@ bool legacy_gpt2_eval(
         }
     }
 
-    struct ggml_v1_init_params params = {
-        .mem_size = buf_size,
-        .mem_buffer = buf,
-    };
+    struct ggml_v1_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+
     struct ggml_v1_context * ctx0 = ggml_v1_init(params);
     struct ggml_v1_cgraph gf = { .n_threads = n_threads };
 
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index af252a0cc..199c353b9 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
 
 #include "model_adapter.h"
 
@@ -143,11 +142,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     // create the ggml context
     {
-        struct ggml_init_params params = {
-            .mem_size = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc = false,
-        };
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
+
         model.ctx = ggml_init(params);
 
         if (!model.ctx) {
@@ -370,11 +369,11 @@ bool gpt2_eval(
         }
     }
 
-    struct ggml_init_params params = {
-        .mem_size = buf_size,
-        .mem_buffer = buf,
-        .no_alloc = false,
-    };
+    struct ggml_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+    params.no_alloc = false;
+
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph gf = { .n_threads = n_threads };
 
diff --git a/otherarch/gptj_v1.cpp b/otherarch/gptj_v1.cpp
index 2f6ae9898..3ec25d2ae 100644
--- a/otherarch/gptj_v1.cpp
+++ b/otherarch/gptj_v1.cpp
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include
@@ -148,10 +147,10 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 
     // create the ggml context
     {
-        struct ggml_v1_init_params params = {
-            .mem_size = ctx_size,
-            .mem_buffer = NULL,
-        };
+        struct ggml_v1_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+
         model.ctx = ggml_v1_init(params);
 
         if (!model.ctx) {
@@ -402,10 +401,10 @@ bool legacy_gptj_eval(
         }
     }
 
-    struct ggml_v1_init_params params = {
-        .mem_size = buf_size,
-        .mem_buffer = buf,
-    };
+    struct ggml_v1_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+
     struct ggml_v1_context * ctx0 = ggml_v1_init(params);
     struct ggml_v1_cgraph gf = { .n_threads = n_threads };
 
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 8ea889025..c942cd3d7 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
 
 #include "model_adapter.h"
 
@@ -143,11 +142,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     // create the ggml context
     {
-        struct ggml_init_params params = {
-            .mem_size = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc = false,
-        };
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
+
         model.ctx = ggml_init(params);
 
         if (!model.ctx) {
@@ -382,11 +381,11 @@ bool gptj_eval(
         }
     }
 
-    struct ggml_init_params params = {
-        .mem_size = buf_size,
-        .mem_buffer = buf,
-        .no_alloc = false,
-    };
+    struct ggml_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+    params.no_alloc = false;
+
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph gf = { .n_threads = n_threads };
 
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 120ccab45..53a9d4512 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
@@ -135,12 +134,11 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
 
     // create the ggml context
     {
-        struct ggml_init_params params = {
-            .mem_size = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc = false,
-        };
-
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
+
         model.ctx = ggml_init(params);
         if (!model.ctx) {
             fprintf(stderr, "%s: ggml_init() failed\n", __func__);
@@ -377,11 +375,11 @@ bool stablelm_eval(
         }
     }
 
-    struct ggml_init_params params = {
-        .mem_size = buf_size,
-        .mem_buffer = buf,
-        .no_alloc = false,
-    };
+    struct ggml_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+    params.no_alloc = false;
+
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph gf = { .n_threads = n_threads };
 
diff --git a/otherarch/tools/convert_hf_gpt2.py b/otherarch/tools/convert_hf_gpt2.py
index 70df03f3e..b6e20983a 100644
--- a/otherarch/tools/convert_hf_gpt2.py
+++ b/otherarch/tools/convert_hf_gpt2.py
@@ -10,7 +10,7 @@ import torch
 import numpy as np
 import re
 
-from transformers import GPTJForCausalLM, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
diff --git a/otherarch/tools/convert_hf_neox.py b/otherarch/tools/convert_hf_neox.py
index 4e1f8f01b..fc327b149 100644
--- a/otherarch/tools/convert_hf_neox.py
+++ b/otherarch/tools/convert_hf_neox.py
@@ -1,7 +1,6 @@
 import sys
 import struct
 import json
-import torch
 import numpy as np
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -59,6 +58,7 @@ fout.write(struct.pack("i", hparams["hidden_size"]))
 fout.write(struct.pack("i", hparams["num_attention_heads"]))
 fout.write(struct.pack("i", hparams["num_hidden_layers"]))
 fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
+fout.write(struct.pack("i", hparams["use_parallel_residual"]))
 fout.write(struct.pack("i", ftype))
 
 # TODO: temporary hack to not deal with implementing the tokenizer