Remove extraneous files

pudepiedj 2024-03-04 07:54:00 +00:00
parent d532d5b1f7
commit 4089657815
10 changed files with 0 additions and 712 deletions

View file

@@ -1,181 +0,0 @@
# Running Mixtral in a loop.
# Needs a zsh change to the maximum wired GPU memory, using
# sudo sysctl iogpu.wired_limit_mb=27500 (anything bigger crashes easily)
import os
import subprocess
import re
import psutil
import threading
import time
import queue
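# (hedged sketch) optionally confirm the wired-memory limit mentioned in the
# header comment before launching; check_wired_limit is an illustrative
# helper, and 27500 MB is the threshold assumed above
def check_wired_limit(min_mb=27500):
    out = subprocess.run(["sysctl", "-n", "iogpu.wired_limit_mb"],
                         capture_output=True, text=True)
    try:
        return int(out.stdout.strip()) >= min_mb
    except ValueError:
        return False  # key missing: not macOS, or an older OS version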
def get_pid():
    # Get the PID of the current Python script, then walk up the process
    # tree while the parent is also a Python process, so that the outermost
    # Python PID is reported
    current_pid = os.getpid()
    parent_pid = os.getppid()
    while parent_pid is not None:
        try:
            parent_proc = psutil.Process(parent_pid)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            parent_pid = None
        else:
            if 'python' in parent_proc.name():
                current_pid = parent_pid
                parent_pid = parent_proc.ppid()  # keep climbing
            else:
                parent_pid = None  # first non-Python ancestor: stop
    # Print the PID of the running Python script
    print(f"The PID of the running Python script is: {current_pid}")
    return current_pid
def get_cpu_percent():
    cpu_percent = psutil.cpu_percent(interval=1)  # sample CPU usage over one second
    return cpu_percent
def get_memory_info():
    mem_info = psutil.virtual_memory()
    return {
        'total': mem_info.total,
        'used': mem_info.used,
        'percent': mem_info.percent
    }
def get_threads():
    # Get the PID of the process you want to inspect
    pid = get_pid()
    # Get the process object
    process = psutil.Process(pid)
    # Print the number of threads used by the process
    print("Number of threads:", len(process.threads()))
    # Iterate over the threads and print their attributes
    for thread in process.threads():
        print(f"Thread ID: {thread.id}")
        print(f"Thread system time: {thread.system_time}")
        print(f"Thread user time: {thread.user_time}")
def find_time_and_tokens(string):
    # Define the regular expression patterns
    pattern = r"llama_print_timings: total time =\s*(\d+(\.\d+)?)\s*ms /\s*(\d+)"
    pattern2 = r"llama_model_loader: - kv\s+10:\s+llama\.expert_used_count u32\s+=\s+(\d+)"
    # Search for the patterns in stderr
    match = re.search(pattern, string)
    match2 = re.search(pattern2, string)
    if match:
        # Extract the total time and token count from the matched groups
        total_time = float(match.group(1))
        token_count = int(match.group(3))
        print(f"Total time taken: {total_time} ms")
        print(f"Token consumption count: {token_count}")
    else:
        print("Could not find the total time and token count in the output.")
    if match2:
        # Extract the number of experts used from the matched group
        experts_used = int(match2.group(1))
        print(f"Number of experts used: {experts_used}")
    else:
        print("Could not find the number of experts used in the output.")
def command_setup(return_queue, prompt="How can I use the python psutil package to calculate CPU and memory usage in a run?"):
    prompt2 = f" [INST] {prompt} [/INST] "
    kv_override = "llama_kv_expert_used_count=int:3"
    command = [
        '/Users/edsilm2/llama.cpp/build/bin/main',
        '-m', '/Users/edsilm2/llama.cpp/models/Mixtral-8x7b-Q2_K/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf',
        '-p', prompt2,
        '-ngl', '99',
        '-c', '4096',
        '-n', '-1',
        '-s', '1',
        '-ctk', 'q8_0',
        '--override-kv', kv_override  # this doesn't have any effect on the LOG, which doesn't reflect kv overrides (they say)
    ]
    response = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes; calling wait() first can deadlock
    # if the child fills a pipe buffer
    stdout, stderr = response.communicate()
    exit_code = response.returncode
    output_err = stderr.decode('utf-8').strip()
    # Check if the command was successful (exit code 0 usually means success)
    if exit_code == 0:
        print(f"\nUser input: \033[31m{prompt}\033[0m\n")
        # Convert the output bytes to a string and print it
        output_str = stdout.decode('utf-8').strip()
        print(f"Output: \033[33m{output_str}\033[0m\n")
        #print(f"Standard Error: \033[33m{output_err}\033[0m\n")
    else:
        # There was an error, print the error message
        print('Error:', output_err)
    find_time_and_tokens(output_err)
    cpu_percent_usage = get_cpu_percent()
    print(f"CPU percentage usage = {cpu_percent_usage}\n")
    get_threads()
    memory_info = get_memory_info()
    print(f"Memory usage: Total = {memory_info['total']} Used = {memory_info['used']} Percentage = {memory_info['percent']}")
    # Put the return values on the queue for the caller
    return_queue.put((stdout, stderr, exit_code))
def check_response(response):
    start = time.time()
    while time.time() - start < 30:
        if response.poll() is not None:
            break
        time.sleep(1)
    if response.poll() is None:
        print("Killing process")
        response.kill()
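# (hedged usage sketch) check_response expects a live Popen handle, e.g.
#   proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   check_response(proc)  # kills the run if it exceeds 30 seconds
# command_setup above drains the process with communicate() instead, so this
# helper is only needed for fire-and-forget launches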
if __name__ == "__main__":
    prompt = "Who are you?"
    while prompt != "quit":
        # original user prompt was here
        q = queue.Queue()
        #response, error, code = command_setup(prompt)
        thread = threading.Thread(target=command_setup, args=(q, prompt))
        thread.start()
        # Wait briefly; the worker keeps running and prints its output when done
        thread.join(timeout=5)
        # Get the return values from the queue (empty if the worker is still busy)
        if not q.empty():
            stdout, stderr, exit_code = q.get()
        prompt = input("Awaiting the reply from mixtral ... ")

View file

@@ -1,2 +0,0 @@
for i in range(10):
    print(i, i**2)

File diff suppressed because one or more lines are too long

View file

@@ -1,58 +0,0 @@
import os
from ctransformers import AutoModelForCausalLM
# Requires SCIPHI_API_KEY in the environment
from agent_search import SciPhi
def initialise():
    # Read the key from the environment; never hard-code live API keys in source
    if os.environ.get("SCIPHI_API_KEY") is None:
        raise RuntimeError("Set SCIPHI_API_KEY in the environment first")
    return SciPhi()  # the client picks the key up from the environment
'''def get_chat_completion(
    self, conversation: list[dict], generation_config: GenerationConfig
) -> str:
    self._check_stop_token(generation_config.stop_token)
    prompt = ""
    added_system_prompt = False
    for message in conversation:
        if message["role"] == "system":
            prompt += f"### System:\n{SciPhiLLMInterface.ALPACA_CHAT_SYSTEM_PROMPT}. Further, the assistant is given the following additional instructions - {message['content']}\n\n"
            added_system_prompt = True
        elif message["role"] == "user":
            last_user_message = message["content"]
            prompt += f"### Instruction:\n{last_user_message}\n\n"
        elif message["role"] == "assistant":
            prompt += f"### Response:\n{message['content']}\n\n"
    if not added_system_prompt:
        prompt = f"### System:\n{SciPhiLLMInterface.ALPACA_CHAT_SYSTEM_PROMPT}.\n\n{prompt}"
    context = self.rag_interface.get_contexts([last_user_message])[0]
    prompt += f"### Response:\n{SciPhiFormatter.RETRIEVAL_TOKEN} {SciPhiFormatter.INIT_PARAGRAPH_TOKEN}{context}{SciPhiFormatter.END_PARAGRAPH_TOKEN}"
    latest_completion = self.model.get_instruct_completion(
        prompt, generation_config
    ).strip()
    return SciPhiFormatter.remove_cruft(latest_completion)
'''
def perform_search(client):
    # Perform a search
    search_response = client.search(query='Quantum Field Theory', search_provider='agent-search')
    print(search_response)
    # example: [{ 'score': '.89', 'url': 'https://...', 'metadata': {...} }]
    # Generate a RAG response
    rag_response = client.get_search_rag_response(query='latest news', search_provider='bing', llm_model='SciPhi/Sensei-7B-V1')
    print(rag_response)
    # example: { 'response': '...', 'other_queries': '...', 'search_results': '...' }
if __name__ == "__main__":
    client = initialise()
    # Set gpu_layers to the number of layers to offload to the GPU; set it to 0 if no GPU acceleration is available on your system.
    llm = AutoModelForCausalLM.from_pretrained("models/", model_file="sciphi-self-rag-mistral-7b-32k.Q5_K_M.gguf", model_type="mistral", gpu_layers=50)
    print(llm("In 2024 AI is going to"))
    perform_search(client)
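# (hedged usage note) export the key before running, e.g.
#   SCIPHI_API_KEY=<your key> python this_script.py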

View file

@@ -1,58 +0,0 @@
import threading
import queue
import requests
def print_dict(data):
    # Recursively walk a decoded JSON payload and print any "content" fields
    if isinstance(data, dict):
        for key, value in data.items():
            if key == "content":
                print(f"Key: {key:>30}: {value}")
            else:
                print_dict(value)
    elif isinstance(data, list):
        for entry in data:
            print_dict(entry)
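# (hedged example) the traversal applied to a llama.cpp-server-style reply;
# the payload shape here is illustrative, not the server's exact schema
def demo_print_dict():
    sample = {"choices": [{"content": "Paris is the capital of France."}]}
    print_dict(sample)  # prints the nested "content" value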
def producer(countries):
    # Generate test requests and add them to the queue
    for i in range(10):  # Adjust for desired load
        request_data = f"What is the capital of {countries[i % len(countries)]}?"
        print(f"Request: {request_data}")
        requests_queue.put(request_data)
def consumer():
    while True:
        request_data = requests_queue.get()
        try:
            print(f"Processing {request_data}")
            # assumes a server on localhost:8080 that accepts raw-text POSTs
            response = requests.post("http://localhost:8080", data=request_data)
            print_dict(response.json())  # walk the decoded JSON reply
        except Exception as e:
            print(f"Exception {e}\n")
        finally:
            requests_queue.task_done()
# Define your test request data
requests_queue = queue.Queue()
# number of threads
num_threads = 5
# some text data
country_list = ["France", "Germany", "China", "USA", "Italy", "India",
"Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia", "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada"]
# Create the producer thread and daemon consumer threads (consumer() never
# returns, so daemons are the only way the program can exit cleanly)
producer_thread = threading.Thread(target=producer, args=(country_list,))
consumer_threads = [threading.Thread(target=consumer, daemon=True) for _ in range(num_threads)]  # Adjust thread count
# Start threads and monitor resources
producer_thread.start()
for thread in consumer_threads:
    thread.start()
producer_thread.join()
# Wait until every queued request has been marked done; joining the consumer
# threads would block forever because their loops never exit
requests_queue.join()
print("Stress test completed!")

View file

@@ -1,40 +0,0 @@
# stock market predictions
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def prepare_data(df, forecast_col, forecast_out, test_size):
    label = df[forecast_col].shift(-forecast_out)  # new label column; its last `forecast_out` rows are NaN
    X = np.array(df[[forecast_col]])  # creating the feature array
    X = preprocessing.scale(X)  # scaling the feature array
    X_lately = X[-forecast_out:]  # the rows we want to use later in the predicting method
    X = X[:-forecast_out]  # X that will contain the training and testing data
    label.dropna(inplace=True)  # dropping NaN values
    y = np.array(label)  # assigning y
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=0)  # cross-validation split
    response = [X_train, X_test, Y_train, Y_test, X_lately]
    return response
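# (hedged smoke test) prepare_data on a synthetic random walk, for when no
# "prices.csv" is to hand; the column name and sizes are illustrative
def demo_prepare_data():
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({"close": 100 + rng.normal(0, 1, 200).cumsum()})
    X_train, X_test, Y_train, Y_test, X_lately = prepare_data(demo, "close", 5, 0.2)
    print(X_train.shape, X_test.shape, X_lately.shape)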
df = pd.read_csv("prices.csv")
df = df[df.symbol == "GOOG"]
forecast_col = 'close'
forecast_out = 5
test_size = 0.2
X_train, X_test, Y_train, Y_test, X_lately = prepare_data(df, forecast_col, forecast_out, test_size)  # calling the method where the cross-validation and data preparation happen
learner = LinearRegression()  # initializing the linear regression model
learner.fit(X_train, Y_train)  # training the linear regression model
score = learner.score(X_test, Y_test)  # testing the linear regression model
forecast = learner.predict(X_lately)  # set that will contain the forecasted data
response = {}  # creating the results dict
response['test_score'] = score
response['forecast_set'] = forecast
print(response)

View file

@@ -1,29 +0,0 @@
# A simple illustration of how to represent cache occupancy
# graphically using unicode blocks,
# e.g. print("\u2588") for used space and print("\u2591") for free space
from time import sleep
import random
CACHE_SIZE = 50
used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
def visualize_kv_cache(used_blocks, total_size):
    cache_viz = "["
    tot_used = 0
    for i in range(len(used_blocks)):
        cache_viz += "\u2588" * used_blocks[i]                  # full blocks for used space
        cache_viz += "\u2591" * (total_size - used_blocks[i])   # light blocks for free space
        cache_viz += f"{used_blocks[i]:3.0f}/{total_size}]\r["
        tot_used += used_blocks[i]
    #print(f"\r[{cache_viz}] {used_blocks[i]:2.0f}/{total_size}", end="")
    print(f"\r{cache_viz}] {tot_used}/{len(used_blocks) * total_size}", end="")
while True:
    visualize_kv_cache(used_blocks, CACHE_SIZE)
    sleep(0.5)
    used_blocks = used_blocks[1:] + [random.randint(0, 50)]  # update the used blocks

View file

@@ -1,26 +0,0 @@
// just trying to get the cursor position
// (hedged POSIX-only sketch: ask the terminal to report the position with the
// ANSI DSR escape "\033[6n" and parse the "\033[<row>;<col>R" reply)
#include <cstdio>
#include <termios.h>
#include <unistd.h>
struct CursorPos {
    int x;
    int y;
};
static CursorPos getCursorPos() {
    CursorPos pos = {0, 0};
    // Switch the terminal to raw mode so the reply can be read unechoed
    termios oldt{};
    tcgetattr(STDIN_FILENO, &oldt);
    termios raw = oldt;
    raw.c_lflag &= ~(ICANON | ECHO);
    tcsetattr(STDIN_FILENO, TCSANOW, &raw);
    // Ask the terminal to report the cursor position
    write(STDOUT_FILENO, "\033[6n", 4);
    // The reply has the form "\033[<row>;<col>R"
    if (scanf("\033[%d;%dR", &pos.y, &pos.x) != 2) {
        pos.x = pos.y = 0;  // no reply (e.g. stdin is not a terminal)
    }
    tcsetattr(STDIN_FILENO, TCSANOW, &oldt);
    return pos;
}
int main() {
    CursorPos cursor = getCursorPos();
    printf("The x co-ordinate of the cursor is %d; the y co-ordinate of the cursor is %d\n", cursor.x, cursor.y);
}

View file

@@ -1,76 +0,0 @@
/*
A utility to represent the kv-cache occupancy graphically
Takes as parameters
- total cache size (-c)
- number of simultaneous accesses/slots (-np)
- a parameter related to the display context (max window width - data display requirements)
It then uses a trick borrowed from tqdm to display occupancy
TODO: Show contiguous space and block availability
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib> // for rand()
// a custom function to display graphics of the kvcache status
static void show_kvcache(std::vector<std::pair<int,struct llama_client_slot>> used_blocks, int cache_size) {
int max_length = 128;
int num_blocks = used_blocks.size();
int slot_cache_size = cache_size / num_blocks;
bool cls_flag = true;
std::string slot_symbol1 = "";
std::string slot_symbol2 = "";
std::string slot_symbol3 = "";
auto& p = used_blocks[0];
llama_client_slot slot = p.second;
return; // remove when not in debug mode
if ((used_blocks.size() == 0) || (used_blocks[0].first == 0)) {
return;
}
    // Print visualization
    // Always start at the top left of the window (H means 'move cursor to this position'; 2J = cls)
    // Only clear the screen the first time round
    if (cls_flag) {
        printf("\033[2J");
        cls_flag = false;
    }
    printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
    for (int i = 0; i < num_blocks; i++) {
        printf("\033[K"); // clear the current line
        for (int j = 0; j < max_length; j++) {
            int used = used_blocks[i].first * max_length / slot_cache_size;
            if ((j < max_length / 2) && (j < used)) {
                printf("\033[90m█\033[0m");
            } else if (j < used) {
                printf("\033[94m█\033[0m");
            } else {
                printf("\033[91m█\033[0m");
            }
        }
        if (used_blocks[i].second.state == PROCESSING) {
            slot_symbol1 = "\u23F0"; // clock symbol = processing
        } else if (used_blocks[i].second.state == IDLE) {
            slot_symbol1 = "\u2705"; // green box with white tick
        } else {
            slot_symbol1 = "\u2620"; // skull and crossbones symbol = dead?
        }
        if (used_blocks[i].second.command == LOAD_PROMPT) {
            slot_symbol2 = "\u24C1"; // dingbat L symbol = loading
        } else if (used_blocks[i].second.command == RELEASE) {
            slot_symbol2 = "\u24C7"; // dingbat R = release
        } else if (used_blocks[i].second.command == NONE) {
            slot_symbol2 = "\u24C3"; // dingbat N = none
        }
        if (used_blocks[i].first == slot_cache_size) {
            slot_symbol3 = "\u274E"; // red box with white cross
        } else {
            slot_symbol3 = "";
        }
        printf(" %4d/%5d %2d %s %s %s\n", used_blocks[i].first, slot_cache_size, used_blocks[i].second.id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str());
    }
    printf("\n\033[%dJ", 0);
}

View file

@@ -1,68 +0,0 @@
/*
A utility to represent the kv-cache occupancy graphically
Takes as parameters
- total cache size (-c)
- number of simultaneous accesses/slots (-np)
- a parameter related to the display context (max window width - data display requirements)
It then uses a trick borrowed from tqdm to display occupancy
TODO: Show contiguous space and block availability
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib> // for rand()
static void show_kvcache(
    std::vector<int> used_blocks,
    int cache_size,
    int max_length
) {
    int num_blocks = used_blocks.size();
    int slot_cache_size = cache_size / num_blocks;
    while (true) {
        // Print visualization after erasing the current line
        for (int i = 0; i < num_blocks; i++) {
            for (int j = 0; j < max_length; j++) {
                if (j < used_blocks[i] * max_length / slot_cache_size) {
                    std::cout << "\033[94m█\033[0m";
                }
                //else if ((j == int(used_blocks[i] * max_length / slot_cache_size + 0.5)) && (j > 7 * max_length / slot_cache_size + 0.5)) {
                //    std::cout << "\033[D\033[D\033[D\033[D" << std::setw(3) << used_blocks[i] << "\033[C";
                //}
                else {
                    std::cout << "\033[91m█\033[0m";
                }
            }
            std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
        }
        std::cout << "{";
        std::string upcursor = "\033[K\033[A\033[K";
        for (int i = 0; i < num_blocks; i++) {
            //std::cout << used_blocks[i] << " ";
            upcursor += "\033[A\033[K";
        }
        // Remove the first element
        used_blocks.erase(used_blocks.begin());
        // Add a new random block at the end
        int new_block = rand() % slot_cache_size;
        used_blocks.push_back(new_block);
        // Adjust the cursor so that the display overwrites itself
        upcursor += "\033[A\033[K";
        std::cout << "}" << std::endl;
        std::cin.get();  // advance one frame per keypress
        std::cout << upcursor;
    }
}
int main() {
    std::vector<int> used_blocks = {64, 64, 64, 64, 64, 64, 64, 64, 64, 46, 46, 46, 46, 46, 46, 46, 46, 46};
    int cache_size = 65536;
    int max_length = 128;
    show_kvcache(used_blocks, cache_size, max_length);
}
}