examples : remove embd-input and gptneox-wip

2023-10-20 17:08:32 +03:00 · 2023-10-20 17:08:32 +03:00 · 84ed48b473
commit 84ed48b473
parent 6e6587656f
16 changed files with 1 additions and 4075 deletions
--- a/9
+++ b/9
@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search  \
+	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search  \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 # Binaries only useful for tests
@ -608,13 +608,6 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 # Shared library for the embd-input example. Header prerequisites (%.h, %.hpp)
 # are listed for dependency tracking only and are filtered out of the compile line.
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 # Test driver for the library above. The shared library is a prerequisite but is
 # filtered out of the inputs (%$(DSO_EXT)) and linked through -L. -lembdinput instead.
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/README.md
+++ b/README.md
@ -962,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - [main](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [embd-input](./examples/embd-input/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -28,7 +28,6 @@ else()
    add_subdirectory(batched-bench)
    add_subdirectory(speculative)
    add_subdirectory(parallel)
    add_subdirectory(embd-input)
    add_subdirectory(llava)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)
--- a/examples/embd-input/.gitignore
+++ b/examples/embd-input/.gitignore
@ -1,4 +0,0 @@
 PandaGPT
 MiniGPT-4
 *.pth
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@ -1,17 +0,0 @@
 # Library target: wraps llama.cpp behind the embd-input API.
 set(TARGET embdinput)
 add_library(${TARGET}
     embd-input-lib.cpp
     embd-input.h
 )
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_link_libraries(${TARGET}
     PRIVATE
         common
         llama
         ${CMAKE_THREAD_LIBS_INIT}
 )
 # BUILD_INFO is only wired in when the enclosing project defines that target.
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
 endif()
 install(TARGETS ${TARGET} LIBRARY)

 # Executable target: small driver that exercises the library above.
 set(TARGET embd-input-test)
 add_executable(${TARGET}
     embd-input-test.cpp
 )
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_link_libraries(${TARGET}
     PRIVATE
         common
         llama
         embdinput
         ${CMAKE_THREAD_LIBS_INIT}
 )
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
 endif()
 install(TARGETS ${TARGET} RUNTIME)
--- a/examples/embd-input/README.md
+++ b/examples/embd-input/README.md
@ -1,63 +0,0 @@
 ### Examples for input embedding directly
 ## Requirement
 Build `libembdinput.so` by
 running the following command in the main directory (`../../`).
 ```
 make
 ```
 ## [LLaVA](https://github.com/haotian-liu/LLaVA/) example  (llava.py)
 1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
 2. Convert it to ggml format.
 3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
 ```
 import torch
 bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
 pth_path = "./examples/embd-input/llava_projection.pth"
 dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
 torch.save({k: dic[k] for k in used_key}, pth_path)
 ```
 4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
 ## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
 1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
 The `adapter_config.json` is
 ```
 {
  "peft_type": "LORA",
  "fan_in_fan_out": false,
  "bias": null,
  "modules_to_save": null,
  "r": 32,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
 }
 ```
 2. Prepare the `vicuna` v0 model.
 3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
 4. Clone the PandaGPT source.
 ```
 git clone https://github.com/yxuansu/PandaGPT
 ```
 5. Install the requirements of PandaGPT.
 6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
 ## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
 1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
 2. Clone the MiniGPT-4 source.
 ```
 git clone https://github.com/Vision-CAIR/MiniGPT-4/
 ```
 3. Install the requirements of MiniGPT-4.
 4. Prepare the `vicuna` v0 model.
 5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -1,221 +0,0 @@
 #include "build-info.h"
 #include "common.h"
 #include "embd-input.h"
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
 static llama_context ** g_ctx;
 extern "C" {
 /**
  * Build a MyModel from llama.cpp-style command-line arguments.
  *
  * Parses argc/argv into gpt_params, initialises the llama backend, loads the
  * model/context through llama_init_from_gpt_params and wraps the context in a
  * heap-allocated MyModel with n_past reset to 0.
  *
  * @param argc number of entries in argv
  * @param argv argument vector handed to gpt_params_parse
  * @return a new MyModel (release it with free_mymodel), or nullptr when the
  *         arguments cannot be parsed or the model cannot be loaded
  */
 struct MyModel* create_mymodel(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        return nullptr;
    }

    print_build_info();

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = uint32_t(time(NULL));
    }
    // the seed is handled as an unsigned 32-bit value above, so print it with %u
    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);

    llama_backend_init(params.numa);

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;

    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return nullptr;
    }

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    struct MyModel * ret = new MyModel();
    ret->ctx = ctx;
    ret->params = params;
    ret->n_past = 0;

    // Publish the address of the context slot owned by the returned object.
    // Pointing g_ctx at the local `ctx` would leave it dangling as soon as
    // this function returns.
    g_ctx = &ret->ctx;

    return ret;
 }
 /**
  * Print timing statistics for the model's context, free the context and
  * delete the MyModel wrapper.
  *
  * @param mymodel object returned by create_mymodel; a null pointer is ignored
  *                because create_mymodel returns nullptr on failure
  */
 void free_mymodel(struct MyModel * mymodel) {
    if (mymodel == nullptr) {
        return;
    }
    llama_context * ctx = mymodel->ctx;
    llama_print_timings(ctx);
    llama_free(ctx);
    delete mymodel;
 }
 /**
  * Feed N embedding vectors to the model.
  *
  * The code reads `input` as N consecutive vectors of llama_n_embd() floats
  * each (it indexes input + i*n_emb). On success the model's n_past is
  * advanced by N.
  *
  * @param model pointer to a MyModel (passed as void*)
  * @param input embedding data
  * @param N     number of embedding vectors in input
  * @return true on success, false if llama_decode reports a failure
  */
 bool eval_float(void * model, float * input, int N){
    MyModel * mymodel = (MyModel*)model;
    llama_context * ctx = mymodel->ctx;
    const int n_emb = llama_n_embd(llama_get_model(ctx));
    int n_past = mymodel->n_past;
    // The whole input is submitted as a single batch; the per-call copy of
    // gpt_params that used to live here was never read and has been dropped.
    const int n_batch = N; // params.n_batch;
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }
    mymodel->n_past = n_past;
    return true;
 }
 /**
  * Evaluate a token sequence in chunks of at most params.n_batch tokens.
  *
  * On success the model's n_past is advanced by tokens.size(); on failure it
  * is left unchanged.
  *
  * @param model  pointer to a MyModel (passed as void*)
  * @param tokens tokens to evaluate
  * @return true on success; false if llama_decode fails or n_batch is not positive
  */
 bool eval_tokens(void * model, std::vector<llama_token> tokens) {
    MyModel * mymodel = (MyModel* )model;
    llama_context * ctx = mymodel->ctx;
    // Only n_batch is needed, so read it in place rather than copying gpt_params.
    const int n_batch = mymodel->params.n_batch;
    if (n_batch <= 0) {
        // a non-positive step would keep the loop below from ever advancing
        fprintf(stderr, "%s : invalid n_batch\n", __func__);
        return false;
    }
    const int n_tokens = (int) tokens.size();
    int n_past = mymodel->n_past;
    for (int i = 0; i < n_tokens; i += n_batch) {
        int n_eval = n_tokens - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }
    mymodel->n_past = n_past;
    return true;
 }
 bool eval_id(struct MyModel* mymodel, int id) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(mymodel, tokens);
 }
 /**
  * Tokenize a C string (third llama_tokenize argument is `true`) and evaluate
  * the resulting tokens.
  *
  * @param mymodel model to feed
  * @param str     NUL-terminated text
  * @return the result of eval_tokens, so evaluation failures reach the caller
  *         instead of being reported as success
  */
 bool eval_string(struct MyModel * mymodel,const char* str){
    llama_context * ctx = mymodel->ctx;
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
    return eval_tokens(mymodel, embd_inp);
 }
 /**
  * Sample the next token from the logits of the model's context.
  *
  * The configured logit biases are added to the logits in place, a candidate
  * list covering the whole vocabulary is built, and one of these is applied:
  *   - temp <= 0      : greedy selection
  *   - mirostat == 1/2: mirostat v1 / v2 sampling after temperature scaling
  *   - otherwise      : top-k, tail-free, typical, top-p, temperature, then sample
  *
  * No repetition/frequency/presence penalties are applied (see TODO below).
  * The mirostat `mu` values are function-local statics, so they persist across
  * calls and are not tracked per model.
  *
  * @param mymodel model whose context and sampling parameters are used
  * @return the sampled token id
  */
 llama_token sampling_id(struct MyModel* mymodel) {
    llama_context* ctx = mymodel->ctx;
    // Read the sampling parameters in place; copying the whole gpt_params for
    // every sampled token is unnecessary.
    const llama_sampling_params & sparams = mymodel->params.sparams;

    // out of user input, sample next token
    const float   temp            = sparams.temp;
    const int32_t top_k           = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
    const float   top_p           = sparams.top_p;
    const float   tfs_z           = sparams.tfs_z;
    const float   typical_p       = sparams.typical_p;
    const int     mirostat        = sparams.mirostat;
    const float   mirostat_tau    = sparams.mirostat_tau;
    const float   mirostat_eta    = sparams.mirostat_eta;

    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx));

        // Apply params.logit_bias map
        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties (repetition, frequency/presence, newline handling)

        if (temp <= 0) {
            // Greedy sampling
            id = llama_sample_token_greedy(ctx, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const int mirostat_m = 100;
                llama_sample_temp(ctx, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temp(ctx, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                // Temperature sampling
                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
                llama_sample_temp(ctx, &candidates_p, temp);
                id = llama_sample_token(ctx, &candidates_p);
            }
        }
    }

    return id;
 }
 /**
  * Sample one token, feed it back into the model and return its text.
  *
  * The end-of-sequence token is reported as the literal "</s>". The returned
  * pointer refers to a function-local static string, so it is overwritten by
  * the next call.
  */
 const char * sampling(struct MyModel * mymodel) {
    static std::string ret;

    llama_context * ctx = mymodel->ctx;
    const int id = sampling_id(mymodel);

    const bool is_eos = (id == llama_token_eos(ctx));
    ret = is_eos ? std::string("</s>") : llama_token_to_piece(ctx, id);

    eval_id(mymodel, id);
    return ret.c_str();
 }
 }
--- a/examples/embd-input/embd-input-test.cpp
+++ b/examples/embd-input/embd-input-test.cpp
@ -1,35 +0,0 @@
 #include "embd-input.h"
 #include <stdlib.h>
 #include <random>
 #include <string.h>
 /**
  * Smoke test for the embd-input library: evaluates a text prompt, N random
  * embedding vectors and the user-supplied prompt, then streams up to
  * max_tgt_len sampled pieces to stdout until "</s>" is produced.
  *
  * @return 0 on completion, 1 when the model cannot be created
  */
 int main(int argc, char** argv) {

    auto mymodel = create_mymodel(argc, argv);
    if (mymodel == nullptr) {
        // create_mymodel returns nullptr on bad arguments or a failed model load
        fprintf(stderr, "%s: failed to create model\n", __func__);
        return 1;
    }

    int N = 10;
    int max_tgt_len = 500;
    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));

    // add random float embd to test evaluation
    float * data = new float[N*n_embd];
    std::default_random_engine e;
    std::uniform_real_distribution<float>  u(0,1);
    for (int i=0;i<N*n_embd;i++) {
        data[i] = u(e);
    }

    eval_string(mymodel, "user: what is the color of the flag of UN?");
    eval_float(mymodel, data, N);
    eval_string(mymodel, "assistant:");
    eval_string(mymodel, mymodel->params.prompt.c_str());

    // the embeddings have been evaluated; release the buffer instead of leaking it
    delete[] data;

    const char* tmp;
    for (int i=0; i<max_tgt_len; i++) {
        tmp = sampling(mymodel);
        if (strcmp(tmp, "</s>")==0) break;
        printf("%s", tmp);
        fflush(stdout);
    }
    printf("\n");
    free_mymodel(mymodel);
    return 0;
 }
--- a/examples/embd-input/embd-input.h
+++ b/examples/embd-input/embd-input.h
@ -1,27 +0,0 @@
 // Public interface of the embd-input example library.
 //
 // The include guard avoids a leading underscore followed by an upper-case
 // letter, because such identifiers are reserved for the implementation.
 #ifndef EMBD_INPUT_H
 #define EMBD_INPUT_H 1

 #include "common.h"
 #include "llama.h"

 extern "C" {

 // State shared by all API calls for one loaded model.
 typedef struct MyModel {
    llama_context* ctx;   // context the evaluation and sampling calls operate on
    gpt_params params;    // parameters parsed from the command line
    int n_past = 0;       // running count advanced by the eval_* functions
 } MyModel;

 // Parse arguments and load a model; returns nullptr on failure.
 struct MyModel* create_mymodel(int argc, char ** argv);

 // Evaluate N embedding vectors read from `input`; `model` is a MyModel*.
 bool eval_float(void* model, float* input, int N);

 // Evaluate a token sequence; `model` is a MyModel*.
 // NOTE(review): takes a std::vector by value inside extern "C", so this entry
 // point is presumably only usable from C++ callers -- confirm.
 bool eval_tokens(void* model, std::vector<llama_token> tokens);

 // Evaluate a single token id.
 bool eval_id(struct MyModel* mymodel, int id);

 // Tokenize and evaluate a NUL-terminated string.
 bool eval_string(struct MyModel* mymodel, const char* str);

 // Sample one token, feed it back to the model and return its text.
 const char * sampling(struct MyModel* mymodel);

 // Sample one token id without feeding it back to the model.
 llama_token sampling_id(struct MyModel* mymodel);

 // Release a model created by create_mymodel.
 void free_mymodel(struct MyModel* mymodel);

 }

 #endif
--- a/examples/embd-input/embd_input.py
+++ b/examples/embd-input/embd_input.py
@ -1,72 +0,0 @@
 #!/usr/bin/env python3
 import ctypes
 from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
 import numpy as np
 import os
 # Load the embd-input shared library; the path is relative to the current
 # working directory.
 libc = cdll.LoadLibrary("./libembdinput.so")

 # Return types that differ from ctypes' default of a C int.
 libc.create_mymodel.restype = c_void_p
 libc.sampling.restype = c_char_p

 # Argument signatures of the calls that take pointers.
 libc.eval_float.argtypes = [c_void_p, POINTER(c_float), c_int]
 libc.eval_string.argtypes = [c_void_p, c_char_p]
 libc.sampling.argtypes = [c_void_p]
 class MyModel:
    """Thin ctypes wrapper around the model handle exposed by libembdinput.

    Attributes:
        model: `c_void_p` handle returned by `create_mymodel`, or None once
            released or if creation failed.
        max_tgt_len: maximum number of sampling steps per generation (512).
        print_string_eval: when True, `eval_string` echoes its input.
    """

    def __init__(self, args):
        """Create the native model.

        Args:
            args: list of command-line style strings; the first entry plays
                the role of argv[0].

        Raises:
            RuntimeError: if the native `create_mymodel` returns NULL.
        """
        # Set first so __del__ is safe even when creation fails below.
        self.model = None
        argc = len(args)
        c_str = [c_char_p(i.encode()) for i in args]
        args_c = (c_char_p * argc)(*c_str)
        handle = libc.create_mymodel(argc, args_c)
        if not handle:
            # A NULL handle would otherwise be dereferenced by the next native call.
            raise RuntimeError("create_mymodel failed: could not parse arguments or load the model")
        self.model = c_void_p(handle)
        self.max_tgt_len = 512
        self.print_string_eval = True

    def __del__(self):
        # Only release a handle that was actually created, and only once.
        model = getattr(self, "model", None)
        if model:
            libc.free_mymodel(model)
            self.model = None

    def eval_float(self, x):
        """Feed embeddings to the model.

        Args:
            x: 2-D array of shape (n_embd, N); column j is the j-th embedding.
        """
        # The native side reads N consecutive vectors of n_embd floats, i.e.
        # the columns of x laid out back to back. Forcing Fortran order makes
        # that layout independent of how the caller's array is stored.
        buf = np.asfortranarray(x, dtype=np.float32)
        libc.eval_float(self.model, buf.ctypes.data_as(POINTER(c_float)), buf.shape[1])

    def eval_string(self, x):
        """Tokenize and evaluate the text `x`, echoing it if enabled."""
        libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
        if self.print_string_eval:
            print(x)

    def eval_token(self, x):
        """Evaluate a single token id."""
        libc.eval_id(self.model, x)

    def sampling(self):
        """Sample one piece of text; returns the raw bytes from the library."""
        s = libc.sampling(self.model)
        return s

    def stream_generate(self, end="</s>"):
        """Yield sampled byte pieces until the accumulated output ends with
        `end` or `max_tgt_len` pieces have been produced."""
        ret = b""
        end = end.encode()
        for _ in range(self.max_tgt_len):
            tmp = self.sampling()
            ret += tmp
            yield tmp
            if ret.endswith(end):
                break

    def generate_with_print(self, end="</s>"):
        """Generate text, printing each piece as it arrives; returns the
        full decoded text."""
        ret = b""
        for i in self.stream_generate(end=end):
            ret += i
            print(i.decode(errors="replace"), end="", flush=True)
        print("")
        return ret.decode(errors="replace")

    def generate(self, end="</s>"):
        """Generate text silently and return it as a decoded string."""
        text = b"".join(self.stream_generate(end=end))
        return text.decode(errors="replace")
 if __name__ == "__main__":
    # Demo: evaluate a prompt, ten random embeddings and the assistant tag,
    # then stream the generated text to stdout.
    model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
    model.eval_string("""user: what is the color of the flag of UN?""")
    x = np.random.random((5120,10))# , dtype=np.float32)
    model.eval_float(x)
    model.eval_string("""assistant:""")
    # stream_generate() yields bytes pieces; generate() returns a decoded str,
    # whose characters have no .decode() method.
    for i in model.stream_generate():
        print(i.decode(errors="replace"), end="", flush=True)
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@ -1,71 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from embd_input import MyModel
 import numpy as np
 from torch import nn
 import torch
 from transformers import CLIPVisionModel,  CLIPImageProcessor
 from PIL import Image
 # model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
 vision_tower = "openai/clip-vit-large-patch14"
 select_hidden_state_layer = -2
 # (vision_config.image_size // vision_config.patch_size) ** 2
 image_token_len = (224//14)**2
 class Llava:
    """Chat front-end that pairs a CLIP vision tower and a linear projector
    with a llama.cpp model driven through `MyModel`."""

    def __init__(self, args):
        """Load the CLIP processor and vision model, create the projector and
        start the native model with `args` appended after "main"."""
        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
        self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
        self.mm_projector = nn.Linear(1024, 5120)
        self.model = MyModel(["main", *args])

    def load_projection(self, path):
        """Load the projector weight and bias from the checkpoint at `path`."""
        checkpoint = torch.load(path)
        projector_state = {
            "weight": checkpoint["model.mm_projector.weight"],
            "bias": checkpoint["model.mm_projector.bias"],
        }
        self.mm_projector.load_state_dict(projector_state)

    def _ask(self, question):
        # Shared tail of every turn: the question, the assistant tag, then generation.
        self.model.eval_string(question)
        self.model.eval_string("\nassistant: ")
        return self.model.generate_with_print()

    def chat(self, question):
        """Ask a text-only question and return the generated answer."""
        self.model.eval_string("user: ")
        return self._ask(question)

    def chat_with_image(self, image, question):
        """Ask a question about `image` and return the generated answer."""
        with torch.no_grad():
            pixels = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            vision_out = self.vision_tower(pixels.unsqueeze(0), output_hidden_states=True)
            hidden = vision_out.hidden_states[select_hidden_state_layer]
            # drop position 0 along the second axis before projecting
            projected = self.mm_projector(hidden[:, 1:])
            embd_image = projected.cpu().numpy()[0]

        im_patch = 32003-3
        im_start = 32003-2
        im_end = 32003-1

        self.model.eval_string("user: ")
        self.model.eval_token(im_start)
        self.model.eval_float(embd_image.T)
        # pad with patch tokens when fewer than image_token_len embeddings were produced
        n_missing = image_token_len-embd_image.shape[0]
        for _ in range(n_missing):
            self.model.eval_token(im_patch)
        self.model.eval_token(im_end)
        return self._ask(question)
 if __name__=="__main__":
    # Model converted from liuhaotian/LLaVA-13b-delta-v1-1.
    llava = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
    # Projection weights extracted from
    # https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
    # pytorch_model-00003-of-00003.bin can also be used directly.
    projection_path = os.path.join(os.path.dirname(__file__), "llava_projection.pth")
    llava.load_projection(projection_path)
    logo = Image.open("./media/llama1-logo.png").convert('RGB')
    llava.chat_with_image(logo, "what is the text in the picture?")
    llava.chat("what is the color of it?")
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@ -1,129 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from embd_input import MyModel
 import numpy as np
 from torch import nn
 import torch
 from PIL import Image
 minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
 sys.path.insert(0, minigpt4_path)
 from minigpt4.models.blip2 import Blip2Base
 from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
 class MiniGPT4(Blip2Base):
    """
    MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4

    Encodes an image with the vision encoder and Q-Former set up through the
    Blip2Base helpers, projects the result with `llama_proj` and feeds it to a
    llama.cpp model through `MyModel`.
    """
    def __init__(self,
        args,
        vit_model="eva_clip_g",
        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
        img_size=224,
        drop_path_rate=0,
        use_grad_checkpoint=False,
        vit_precision="fp32",
        freeze_vit=True,
        freeze_qformer=True,
        num_query_token=32,
        llama_model="",
        prompt_path="",
        prompt_template="",
        max_txt_len=32,
        end_sym='\n',
        low_resource=False,  # use 8 bit and put vit in cpu
        device_8bit=0
    ):
        """Build the vision stack, the projector and the native model.

        `args` is forwarded to `MyModel` after "main". Several keyword
        arguments (freeze_vit, freeze_qformer, llama_model, prompt_path,
        prompt_template, device_8bit) are accepted but not read here.
        """
        super().__init__()
        self.img_size = img_size
        self.low_resource = low_resource
        self.preprocessor = Blip2ImageEvalProcessor(img_size)

        print('Loading VIT')
        vision_parts = self.init_vision_encoder(
            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
        )
        self.visual_encoder, self.ln_vision = vision_parts
        print('Loading VIT Done')

        print('Loading Q-Former')
        qformer_parts = self.init_Qformer(
            num_query_token, self.visual_encoder.num_features
        )
        self.Qformer, self.query_tokens = qformer_parts
        # Clear the Q-Former sub-modules this wrapper sets to None.
        self.Qformer.cls = None
        self.Qformer.bert.embeddings.word_embeddings = None
        self.Qformer.bert.embeddings.position_embeddings = None
        for encoder_layer in self.Qformer.bert.encoder.layer:
            encoder_layer.output = None
            encoder_layer.intermediate = None
        self.load_from_pretrained(url_or_filename=q_former_model)
        print('Loading Q-Former Done')

        qformer_width = self.Qformer.config.hidden_size
        self.llama_proj = nn.Linear(
            qformer_width, 5120 # self.llama_model.config.hidden_size
        )
        self.max_txt_len = max_txt_len
        self.end_sym = end_sym
        self.model = MyModel(["main", *args])

        # system prompt
        system_prompt = ("Give the following image: <Img>ImageContent</Img>. "
           "You will be able to see the image once I provide it to you. Please answer my questions."
           "###")
        self.model.eval_string(system_prompt)

    def encode_img(self, image):
        """Preprocess `image`, run it through the vision encoder and Q-Former
        and return the output of `llama_proj`."""
        pixels = self.preprocessor(image).unsqueeze(0)
        device = pixels.device
        if self.low_resource:
            self.vit_to_cpu()
            pixels = pixels.to("cpu")

        with self.maybe_autocast():
            image_embeds = self.ln_vision(self.visual_encoder(pixels)).to(device)
            attention_shape = image_embeds.size()[:-1]
            image_atts = torch.ones(attention_shape, dtype=torch.long).to(device)

            queries = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
            qformer_out = self.Qformer.bert(
                query_embeds=queries,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            inputs_llama = self.llama_proj(qformer_out.last_hidden_state)
        return inputs_llama

    def load_projection(self, path):
        """Load the `llama_proj` weight and bias from the "model" entry of the
        checkpoint at `path`."""
        checkpoint = torch.load(path)["model"]
        projector_state = {
            "weight": checkpoint["llama_proj.weight"],
            "bias": checkpoint["llama_proj.bias"],
        }
        self.llama_proj.load_state_dict(projector_state)

    def _answer(self, question):
        # Shared tail of every turn: question, assistant tag, generation until "###".
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        return self.model.generate_with_print(end="###")

    def chat(self, question):
        """Ask a text-only question and return the generated answer."""
        self.model.eval_string("Human: ")
        return self._answer(question)

    def chat_with_image(self, image, question):
        """Ask a question about `image` and return the generated answer."""
        with torch.no_grad():
            encoded = self.encode_img(image)
        embd_image = encoded.cpu().numpy()[0]

        self.model.eval_string("Human: <Img>")
        self.model.eval_float(embd_image.T)
        self.model.eval_string("</Img> ")
        return self._answer(question)
 if __name__=="__main__":
    # Demo: load the model and projection, then ask about the llama logo.
    minigpt = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
    projection_path = os.path.join(os.path.dirname(__file__), "pretrained_minigpt4.pth")
    minigpt.load_projection(projection_path)
    logo = Image.open("./media/llama1-logo.png").convert('RGB')
    minigpt.chat_with_image(logo, "what is the text in the picture?")
    minigpt.chat("what is the color of it?")
--- a/examples/embd-input/panda_gpt.py
+++ b/examples/embd-input/panda_gpt.py
@ -1,99 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from embd_input import MyModel
 import numpy as np
 from torch import nn
 import torch
 # use PandaGPT path
 panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
 imagebind_ckpt_path = "./models/panda_gpt/"
 sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
 from ImageBind.models import imagebind_model
 from ImageBind import data
 ModalityType = imagebind_model.ModalityType
 max_tgt_len = 400
 class PandaGPT:
    """Chat front-end that encodes image/audio/video/thermal inputs with
    ImageBind, projects them with `llama_proj` and feeds them to a llama.cpp
    model through `MyModel`."""

    def __init__(self, args):
        """Load the ImageBind encoder, create the projector and start the
        native model with `args` appended after "main"."""
        encoder, _ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
        self.visual_encoder = encoder
        self.visual_encoder.eval()
        self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
        self.max_tgt_len = max_tgt_len
        self.model = MyModel(["main", *args])
        self.generated_text = ""
        self.device = "cpu"

    def load_projection(self, path):
        """Load the `llama_proj` weight and bias from the checkpoint at `path`."""
        checkpoint = torch.load(path, map_location="cpu")
        projector_state = {
            "weight": checkpoint["llama_proj.weight"],
            "bias": checkpoint["llama_proj.bias"],
        }
        self.llama_proj.load_state_dict(projector_state)

    def eval_inputs(self, inputs):
        """Evaluate every extracted feature block between <Img> and </Img> tags."""
        self.model.eval_string("<Img>")
        for feature in self.extract_multimoal_feature(inputs):
            self.model.eval_float(feature.T)
        self.model.eval_string("</Img> ")

    def chat(self, question):
        """Ask a text-only question and return the generated answer."""
        return self.chat_with_image(None, question)

    def chat_with_image(self, inputs, question):
        """Ask a question, optionally with multimodal `inputs` (a dict with
        `<kind>_paths` keys), and return the generated answer."""
        first_turn = self.generated_text == ""
        if first_turn:
            # the conversation opens with a separator exactly once
            self.model.eval_string("###")
        self.model.eval_string(" Human: ")
        if inputs:
            self.eval_inputs(inputs)
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        answer = self.model.generate_with_print(end="###")
        self.generated_text += answer
        return answer

    def extract_multimoal_feature(self, inputs):
        """Return one embedding block per modality present in `inputs`, in
        the fixed order image, audio, video, thermal."""
        modalities = ["image", "audio", "video", "thermal"]
        return [
            self.encode_data(kind, inputs[kind + "_paths"])
            for kind in modalities
            if kind + "_paths" in inputs
        ]

    def encode_data(self, data_type, data_paths):
        """Load `data_paths` for `data_type`, encode them with ImageBind and
        return the projected embeddings as a numpy array."""
        modality_by_type = {
            "image": ModalityType.VISION,
            "audio": ModalityType.AUDIO,
            "video": ModalityType.VISION,
            "thermal": ModalityType.THERMAL,
        }
        loader_by_type = {
            "image": data.load_and_transform_vision_data,
            "audio": data.load_and_transform_audio_data,
            "video": data.load_and_transform_video_data,
            "thermal": data.load_and_transform_thermal_data
        }
        loader = loader_by_type[data_type]
        modality = modality_by_type[data_type]
        batch = {modality: loader(data_paths, self.device)}
        with torch.no_grad():
            encoded = self.visual_encoder(batch)[modality]
            projected = self.llama_proj(encoded).cpu().numpy()
        return projected
 if __name__=="__main__":
    # Demo: vicuna v0 with the PandaGPT LoRA adapter, sampling at temperature 0.
    cli_args = [
        "--model", "./models/ggml-vicuna-13b-v0-q4_1.bin",
        "-c", "2048",
        "--lora", "./models/panda_gpt/ggml-adapter-model.bin",
        "--temp", "0",
    ]
    panda = PandaGPT(cli_args)
    panda.load_projection("./models/panda_gpt/adapter_model.bin")
    media = {"image_paths": ["./media/llama1-logo.png"]}
    panda.chat_with_image(media, "what is the text in the picture? 'llama' or 'lambda'?")
    panda.chat("what is the color of it?")
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp