Remove extraneous files

pudepiedj 2024-03-04 07:54:00 +00:00
parent d532d5b1f7
commit 4089657815
10 changed files with 0 additions and 712 deletions

View file

@@ -1,181 +0,0 @@
# Running Mixtral in a loop.
# Needs a zsh change to the maximum wired GPU memory, using
# sudo sysctl iogpu.wired_limit_mb=27500 (anything bigger crashes easily)
import os
import subprocess
import re
import psutil
import threading
import time
import queue
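# (hedged sketch) optionally confirm the wired-memory limit mentioned in the
# header comment before launching; check_wired_limit is an illustrative
# helper, and 27500 MB is the threshold assumed above
def check_wired_limit(min_mb=27500):
    out = subprocess.run(["sysctl", "-n", "iogpu.wired_limit_mb"],
                         capture_output=True, text=True)
    try:
        return int(out.stdout.strip()) >= min_mb
    except ValueError:
        return False  # key missing: not macOS, or an older OS version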
def get_pid():
    # Get the PID of the current Python script, then walk up the process
    # tree while the parent is also a Python process, so that the outermost
    # Python PID is reported
    current_pid = os.getpid()
    parent_pid = os.getppid()
    while parent_pid is not None:
        try:
            parent_proc = psutil.Process(parent_pid)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            parent_pid = None
        else:
            if 'python' in parent_proc.name():
                current_pid = parent_pid
                parent_pid = parent_proc.ppid()  # keep climbing
            else:
                parent_pid = None  # first non-Python ancestor: stop
    # Print the PID of the running Python script
    print(f"The PID of the running Python script is: {current_pid}")
    return current_pid
def get_cpu_percent():
    cpu_percent = psutil.cpu_percent(interval=1)  # sample CPU usage over one second
    return cpu_percent
def get_memory_info():
    mem_info = psutil.virtual_memory()
    return {
        'total': mem_info.total,
        'used': mem_info.used,
        'percent': mem_info.percent
    }
def get_threads():
    # Get the PID of the process you want to inspect
    pid = get_pid()
    # Get the process object
    process = psutil.Process(pid)
    # Print the number of threads used by the process
    print("Number of threads:", len(process.threads()))
    # Iterate over the threads and print their attributes
    for thread in process.threads():
        print(f"Thread ID: {thread.id}")
        print(f"Thread system time: {thread.system_time}")
        print(f"Thread user time: {thread.user_time}")
def find_time_and_tokens(string):
    # Define the regular expression patterns
    pattern = r"llama_print_timings: total time =\s*(\d+(\.\d+)?)\s*ms /\s*(\d+)"
    pattern2 = r"llama_model_loader: - kv\s+10:\s+llama\.expert_used_count u32\s+=\s+(\d+)"
    # Search for the patterns in stderr
    match = re.search(pattern, string)
    match2 = re.search(pattern2, string)
    if match:
        # Extract the total time and token count from the matched groups
        total_time = float(match.group(1))
        token_count = int(match.group(3))
        print(f"Total time taken: {total_time} ms")
        print(f"Token consumption count: {token_count}")
    else:
        print("Could not find the total time and token count in the output.")
    if match2:
        # Extract the number of experts used from the matched group
        experts_used = int(match2.group(1))
        print(f"Number of experts used: {experts_used}")
    else:
        print("Could not find the number of experts used in the output.")
def command_setup(return_queue, prompt="How can I use the python psutil package to calculate CPU and memory usage in a run?"):
    prompt2 = f" [INST] {prompt} [/INST] "
    kv_override = "llama_kv_expert_used_count=int:3"
    command = [
        '/Users/edsilm2/llama.cpp/build/bin/main',
        '-m', '/Users/edsilm2/llama.cpp/models/Mixtral-8x7b-Q2_K/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf',
        '-p', prompt2,
        '-ngl', '99',
        '-c', '4096',
        '-n', '-1',
        '-s', '1',
        '-ctk', 'q8_0',
        '--override-kv', kv_override  # this doesn't have any effect on the LOG, which doesn't reflect kv overrides (they say)
    ]
    response = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes; calling wait() first can deadlock
    # if the child fills a pipe buffer
    stdout, stderr = response.communicate()
    exit_code = response.returncode
    output_err = stderr.decode('utf-8').strip()
    # Check if the command was successful (exit code 0 usually means success)
    if exit_code == 0:
        print(f"\nUser input: \033[31m{prompt}\033[0m\n")
        # Convert the output bytes to a string and print it
        output_str = stdout.decode('utf-8').strip()
        print(f"Output: \033[33m{output_str}\033[0m\n")
        #print(f"Standard Error: \033[33m{output_err}\033[0m\n")
    else:
        # There was an error, print the error message
        print('Error:', output_err)
    find_time_and_tokens(output_err)
    cpu_percent_usage = get_cpu_percent()
    print(f"CPU percentage usage = {cpu_percent_usage}\n")
    get_threads()
    memory_info = get_memory_info()
    print(f"Memory usage: Total = {memory_info['total']} Used = {memory_info['used']} Percentage = {memory_info['percent']}")
    # Put the return values on the queue for the caller
    return_queue.put((stdout, stderr, exit_code))
def check_response(response):
    start = time.time()
    while time.time() - start < 30:
        if response.poll() is not None:
            break
        time.sleep(1)
    if response.poll() is None:
        print("Killing process")
        response.kill()
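# (hedged usage sketch) check_response expects a live Popen handle, e.g.
#   proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   check_response(proc)  # kills the run if it exceeds 30 seconds
# command_setup above drains the process with communicate() instead, so this
# helper is only needed for fire-and-forget launches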
if __name__ == "__main__":
    prompt = "Who are you?"
    while prompt != "quit":
        # original user prompt was here
        q = queue.Queue()
        #response, error, code = command_setup(prompt)
        thread = threading.Thread(target=command_setup, args=(q, prompt))
        thread.start()
        # Wait briefly; the worker keeps running and prints its output when done
        thread.join(timeout=5)
        # Get the return values from the queue (empty if the worker is still busy)
        if not q.empty():
            stdout, stderr, exit_code = q.get()
        prompt = input("Awaiting the reply from mixtral ... ")

View file

@@ -1,2 +0,0 @@
for i in range(10):
    print(i, i**2)

File diff suppressed because one or more lines are too long

View file

@@ -1,58 +0,0 @@
import os
from ctransformers import AutoModelForCausalLM
# Requires SCIPHI_API_KEY in the environment
from agent_search import SciPhi
def initialise():
    # Read the key from the environment; never hard-code live API keys in source
    if os.environ.get("SCIPHI_API_KEY") is None:
        raise RuntimeError("Set SCIPHI_API_KEY in the environment first")
    return SciPhi()  # the client picks the key up from the environment
'''def get_chat_completion(
    self, conversation: list[dict], generation_config: GenerationConfig
) -> str:
    self._check_stop_token(generation_config.stop_token)
    prompt = ""
    added_system_prompt = False
    for message in conversation:
        if message["role"] == "system":
            prompt += f"### System:\n{SciPhiLLMInterface.ALPACA_CHAT_SYSTEM_PROMPT}. Further, the assistant is given the following additional instructions - {message['content']}\n\n"
            added_system_prompt = True
        elif message["role"] == "user":
            last_user_message = message["content"]
            prompt += f"### Instruction:\n{last_user_message}\n\n"
        elif message["role"] == "assistant":
            prompt += f"### Response:\n{message['content']}\n\n"
    if not added_system_prompt:
        prompt = f"### System:\n{SciPhiLLMInterface.ALPACA_CHAT_SYSTEM_PROMPT}.\n\n{prompt}"
    context = self.rag_interface.get_contexts([last_user_message])[0]
    prompt += f"### Response:\n{SciPhiFormatter.RETRIEVAL_TOKEN} {SciPhiFormatter.INIT_PARAGRAPH_TOKEN}{context}{SciPhiFormatter.END_PARAGRAPH_TOKEN}"
    latest_completion = self.model.get_instruct_completion(
        prompt, generation_config
    ).strip()
    return SciPhiFormatter.remove_cruft(latest_completion)
'''
def perform_search(client):
    # Perform a search
    search_response = client.search(query='Quantum Field Theory', search_provider='agent-search')
    print(search_response)
    # example: [{ 'score': '.89', 'url': 'https://...', 'metadata': {...} }]
    # Generate a RAG response
    rag_response = client.get_search_rag_response(query='latest news', search_provider='bing', llm_model='SciPhi/Sensei-7B-V1')
    print(rag_response)
    # example: { 'response': '...', 'other_queries': '...', 'search_results': '...' }
if __name__ == "__main__":
    client = initialise()
    # Set gpu_layers to the number of layers to offload to the GPU; set it to 0 if no GPU acceleration is available on your system.
    llm = AutoModelForCausalLM.from_pretrained("models/", model_file="sciphi-self-rag-mistral-7b-32k.Q5_K_M.gguf", model_type="mistral", gpu_layers=50)
    print(llm("In 2024 AI is going to"))
    perform_search(client)
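# (hedged usage note) export the key before running, e.g.
#   SCIPHI_API_KEY=<your key> python this_script.py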

View file

@@ -1,58 +0,0 @@
import threading
import queue
import requests
def print_dict(data):
    # Recursively walk a decoded JSON payload and print any "content" fields
    if isinstance(data, dict):
        for key, value in data.items():
            if key == "content":
                print(f"Key: {key:>30}: {value}")
            else:
                print_dict(value)
    elif isinstance(data, list):
        for entry in data:
            print_dict(entry)
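# (hedged example) the traversal applied to a llama.cpp-server-style reply;
# the payload shape here is illustrative, not the server's exact schema
def demo_print_dict():
    sample = {"choices": [{"content": "Paris is the capital of France."}]}
    print_dict(sample)  # prints the nested "content" value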
def producer(countries):
    # Generate test requests and add them to the queue
    for i in range(10):  # Adjust for desired load
        request_data = f"What is the capital of {countries[i % len(countries)]}?"
        print(f"Request: {request_data}")
        requests_queue.put(request_data)
def consumer():
    while True:
        request_data = requests_queue.get()
        try:
            print(f"Processing {request_data}")
            # assumes a server on localhost:8080 that accepts raw-text POSTs
            response = requests.post("http://localhost:8080", data=request_data)
            print_dict(response.json())  # walk the decoded JSON reply
        except Exception as e:
            print(f"Exception {e}\n")
        finally:
            requests_queue.task_done()
# Define your test request data
requests_queue = queue.Queue()
# number of threads
num_threads = 5
# some text data
country_list = ["France", "Germany", "China", "USA", "Italy", "India",
"Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia", "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada"]
# Create the producer thread and daemon consumer threads (consumer() never
# returns, so daemons are the only way the program can exit cleanly)
producer_thread = threading.Thread(target=producer, args=(country_list,))
consumer_threads = [threading.Thread(target=consumer, daemon=True) for _ in range(num_threads)]  # Adjust thread count
# Start threads and monitor resources
producer_thread.start()
for thread in consumer_threads:
    thread.start()
producer_thread.join()
# Wait until every queued request has been marked done; joining the consumer
# threads would block forever because their loops never exit
requests_queue.join()
print("Stress test completed!")

View file

@@ -1,40 +0,0 @@
# stock market predictions
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def prepare_data(df, forecast_col, forecast_out, test_size):
    label = df[forecast_col].shift(-forecast_out)  # new label column; its last `forecast_out` rows are NaN
    X = np.array(df[[forecast_col]])  # creating the feature array
    X = preprocessing.scale(X)  # scaling the feature array
    X_lately = X[-forecast_out:]  # the rows we want to use later in the predicting method
    X = X[:-forecast_out]  # X that will contain the training and testing data
    label.dropna(inplace=True)  # dropping NaN values
    y = np.array(label)  # assigning y
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=0)  # cross-validation split
    response = [X_train, X_test, Y_train, Y_test, X_lately]
    return response
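# (hedged smoke test) prepare_data on a synthetic random walk, for when no
# "prices.csv" is to hand; the column name and sizes are illustrative
def demo_prepare_data():
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({"close": 100 + rng.normal(0, 1, 200).cumsum()})
    X_train, X_test, Y_train, Y_test, X_lately = prepare_data(demo, "close", 5, 0.2)
    print(X_train.shape, X_test.shape, X_lately.shape)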
df = pd.read_csv("prices.csv")
df = df[df.symbol == "GOOG"]
forecast_col = 'close'
forecast_out = 5
test_size = 0.2
X_train, X_test, Y_train, Y_test, X_lately = prepare_data(df, forecast_col, forecast_out, test_size)  # calling the method where the cross-validation and data preparation happen
learner = LinearRegression()  # initializing the linear regression model
learner.fit(X_train, Y_train)  # training the linear regression model
score = learner.score(X_test, Y_test)  # testing the linear regression model
forecast = learner.predict(X_lately)  # set that will contain the forecasted data
response = {}  # creating the results dict
response['test_score'] = score
response['forecast_set'] = forecast
print(response)

View file

@@ -1,29 +0,0 @@
# A simple illustration of how to represent cache occupancy
# graphically using unicode blocks,
# e.g. print("\u2588") for used space and print("\u2591") for free space
from time import sleep
import random
CACHE_SIZE = 50
used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
def visualize_kv_cache(used_blocks, total_size):
    cache_viz = "["
    tot_used = 0
    for i in range(len(used_blocks)):
        cache_viz += "\u2588" * used_blocks[i]                  # full blocks for used space
        cache_viz += "\u2591" * (total_size - used_blocks[i])   # light blocks for free space
        cache_viz += f"{used_blocks[i]:3.0f}/{total_size}]\r["
        tot_used += used_blocks[i]
    #print(f"\r[{cache_viz}] {used_blocks[i]:2.0f}/{total_size}", end="")
    print(f"\r{cache_viz}] {tot_used}/{len(used_blocks) * total_size}", end="")
while True:
    visualize_kv_cache(used_blocks, CACHE_SIZE)
    sleep(0.5)
    used_blocks = used_blocks[1:] + [random.randint(0, 50)]  # update the used blocks

View file

@@ -1,26 +0,0 @@
// just trying to get the cursor position
// (hedged POSIX-only sketch: ask the terminal to report the position with the
// ANSI DSR escape "\033[6n" and parse the "\033[<row>;<col>R" reply)
#include <cstdio>
#include <termios.h>
#include <unistd.h>
struct CursorPos {
    int x;
    int y;
};
static CursorPos getCursorPos() {
    CursorPos pos = {0, 0};
    // Switch the terminal to raw mode so the reply can be read unechoed
    termios oldt{};
    tcgetattr(STDIN_FILENO, &oldt);
    termios raw = oldt;
    raw.c_lflag &= ~(ICANON | ECHO);
    tcsetattr(STDIN_FILENO, TCSANOW, &raw);
    // Ask the terminal to report the cursor position
    write(STDOUT_FILENO, "\033[6n", 4);
    // The reply has the form "\033[<row>;<col>R"
    if (scanf("\033[%d;%dR", &pos.y, &pos.x) != 2) {
        pos.x = pos.y = 0;  // no reply (e.g. stdin is not a terminal)
    }
    tcsetattr(STDIN_FILENO, TCSANOW, &oldt);
    return pos;
}
int main() {
    CursorPos cursor = getCursorPos();
    printf("The x co-ordinate of the cursor is %d; the y co-ordinate of the cursor is %d\n", cursor.x, cursor.y);
}

View file

@@ -1,76 +0,0 @@
/*
A utility to represent the kv-cache occupancy graphically
Takes as parameters
- total cache size (-c)
- number of simultaneous accesses/slots (-np)
- a parameter related to the display context (max window width - data display requirements)
It then uses a trick borrowed from tqdm to display occupancy
TODO: Show contiguous space and block availability
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib> // for rand()
// a custom function to display graphics of the kvcache status
static void show_kvcache(std::vector<std::pair<int,struct llama_client_slot>> used_blocks, int cache_size) {
int max_length = 128;
int num_blocks = used_blocks.size();
int slot_cache_size = cache_size / num_blocks;
bool cls_flag = true;
std::string slot_symbol1 = "";
std::string slot_symbol2 = "";
std::string slot_symbol3 = "";
auto& p = used_blocks[0];
llama_client_slot slot = p.second;
return; // remove when not in debug mode
if ((used_blocks.size() == 0) || (used_blocks[0].first == 0)) {
return;
}
    // Print visualization
    // Always start at the top left of the window (H means 'move cursor to this position'; 2J = cls)
    // Only clear the screen the first time round
    if (cls_flag) {
        printf("\033[2J");
        cls_flag = false;
    }
    printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
    for (int i = 0; i < num_blocks; i++) {
        printf("\033[K"); // clear the current line
        for (int j = 0; j < max_length; j++) {
            int used = used_blocks[i].first * max_length / slot_cache_size;
            if ((j < max_length / 2) && (j < used)) {
                printf("\033[90m█\033[0m");
            } else if (j < used) {
                printf("\033[94m█\033[0m");
            } else {
                printf("\033[91m█\033[0m");
            }
        }
        if (used_blocks[i].second.state == PROCESSING) {
            slot_symbol1 = "\u23F0"; // clock symbol = processing
        } else if (used_blocks[i].second.state == IDLE) {
            slot_symbol1 = "\u2705"; // green box with white tick
        } else {
            slot_symbol1 = "\u2620"; // skull and crossbones symbol = dead?
        }
        if (used_blocks[i].second.command == LOAD_PROMPT) {
            slot_symbol2 = "\u24C1"; // dingbat L symbol = loading
        } else if (used_blocks[i].second.command == RELEASE) {
            slot_symbol2 = "\u24C7"; // dingbat R = release
        } else if (used_blocks[i].second.command == NONE) {
            slot_symbol2 = "\u24C3"; // dingbat N = none
        }
        if (used_blocks[i].first == slot_cache_size) {
            slot_symbol3 = "\u274E"; // red box with white cross
        } else {
            slot_symbol3 = "";
        }
        printf(" %4d/%5d %2d %s %s %s\n", used_blocks[i].first, slot_cache_size, used_blocks[i].second.id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str());
    }
    printf("\n\033[%dJ", 0);
}

View file

@@ -1,68 +0,0 @@
/*
A utility to represent the kv-cache occupancy graphically
Takes as parameters
- total cache size (-c)
- number of simultaneous accesses/slots (-np)
- a parameter related to the display context (max window width - data display requirements)
It then uses a trick borrowed from tqdm to display occupancy
TODO: Show contiguous space and block availability
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib> // for rand()
static void show_kvcache(
    std::vector<int> used_blocks,
    int cache_size,
    int max_length
) {
    int num_blocks = used_blocks.size();
    int slot_cache_size = cache_size / num_blocks;
    while (true) {
        // Print visualization after erasing the current line
        for (int i = 0; i < num_blocks; i++) {
            for (int j = 0; j < max_length; j++) {
                if (j < used_blocks[i] * max_length / slot_cache_size) {
                    std::cout << "\033[94m█\033[0m";
                }
                //else if ((j == int(used_blocks[i] * max_length / slot_cache_size + 0.5)) && (j > 7 * max_length / slot_cache_size + 0.5)) {
                //    std::cout << "\033[D\033[D\033[D\033[D" << std::setw(3) << used_blocks[i] << "\033[C";
                //}
                else {
                    std::cout << "\033[91m█\033[0m";
                }
            }
            std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
        }
        std::cout << "{";
        std::string upcursor = "\033[K\033[A\033[K";
        for (int i = 0; i < num_blocks; i++) {
            //std::cout << used_blocks[i] << " ";
            upcursor += "\033[A\033[K";
        }
        // Remove the first element
        used_blocks.erase(used_blocks.begin());
        // Add a new random block at the end
        int new_block = rand() % slot_cache_size;
        used_blocks.push_back(new_block);
        // Adjust the cursor so that the display overwrites itself
        upcursor += "\033[A\033[K";
        std::cout << "}" << std::endl;
        std::cin.get();  // advance one frame per keypress
        std::cout << upcursor;
    }
}
int main() {
    std::vector<int> used_blocks = {64, 64, 64, 64, 64, 64, 64, 64, 64, 46, 46, 46, 46, 46, 46, 46, 46, 46};
    int cache_size = 65536;
    int max_length = 128;
    show_kvcache(used_blocks, cache_size, max_length);
}
}