now server has it

mike dupont 2023-11-25 11:13:45 -05:00
parent e8e94f4f69
commit 90568a6696
2 changed files with 131 additions and 10 deletions

Python reflection-generator script (filename not shown)

@@ -14,8 +14,9 @@ llvmLibPath = "/usr/lib/llvm-15/lib/"
cxxClientRoot = "/home/mdupont/experiments/llama.cpp/"
fileList = [
"ggml.cpp",
"llama.cpp"
# "ggml.cpp",
# "llama.cpp",
"examples/server/server.cpp",
]
typeList = [
@@ -224,10 +225,11 @@ UNNAMED_STRUCT_DELIM = '::(unnamed struct'
def traverse(node, namespace, main_file):
# only scan the elements of the file we parsed
#print("FILE", node.location.file )
if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
print("#FILE", node.location.file )
print("REFL_TYPE(" + fullStructName + ")")
structFields = []
@@ -247,14 +249,15 @@ def traverse(node, namespace, main_file):
"type": struct_type,
})
# replica read changes introduced duplicate get requests
if any(map(lambda op: op['name'] == fullStructName, opTypes)):
return
#if any(map(lambda op: op['name'] == fullStructName, opTypes)):
# return
opTypes.append({
"name": fullStructName,
"fields": structFields,
})
#opTypes.append({
# "name": fullStructName,
# "fields": structFields,
#})
print("REFL_END")
if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL:
fullStructName = "::".join([*namespace, node.displayname])
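To make the generator's output concrete: when traverse() visits a STRUCT_DECL it prints a REFL_TYPE header, collects the field declarations into structFields, and closes with REFL_END (the REFL_FIELD lines are presumably emitted from structFields in code not shown in this hunk). For the server_params struct in the server diff below, the mapping looks like this (field names taken from the diff; defaults other than hostname elided):

    // Input declaration found by the STRUCT_DECL branch
    // (from examples/server/server.cpp):
    struct server_params
    {
        std::string hostname = "127.0.0.1";
        std::string public_path;   // default elided
        int32_t port;              // default elided
        int32_t read_timeout = 600;
        int32_t write_timeout = 600;
    };

    // Output printed by the script, one REFL_FIELD per field:
    REFL_TYPE(server_params)
    REFL_FIELD(hostname)
    REFL_FIELD(public_path)
    REFL_FIELD(port)
    REFL_FIELD(read_timeout)
    REFL_FIELD(write_timeout)
    REFL_END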

examples/server/server.cpp

@@ -24,6 +24,7 @@
#include <thread>
#include <mutex>
#include <chrono>
#include "print.hpp"
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
@@ -33,6 +34,9 @@
using json = nlohmann::json;
REFL_TYPE(std::less< ::nlohmann::detail::value_t>)
REFL_END
struct server_params
{
std::string hostname = "127.0.0.1";
@@ -41,6 +45,13 @@ struct server_params
int32_t read_timeout = 600;
int32_t write_timeout = 600;
};
REFL_TYPE(server_params)
REFL_FIELD(hostname)
REFL_FIELD(public_path)
REFL_FIELD(port)
REFL_FIELD(read_timeout)
REFL_FIELD(write_timeout)
REFL_END
static bool server_verbose = false;
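The REFL_TYPE / REFL_FIELD / REFL_END blocks are refl-cpp macros: each one registers a compile-time member list for its type, and that metadata is what the print_fields() call added further down consumes. print.hpp itself is not part of this diff, but a minimal sketch of such a printer, assuming refl-cpp's documented refl::reflect and refl::util::for_each API, could look like:

    #include <iostream>
    #include <refl.hpp>

    // Minimal sketch (an assumption about what print.hpp provides):
    // walk the member list registered by REFL_TYPE/REFL_FIELD and
    // stream each field's name and current value.
    template <typename T>
    void print_fields(const T& value)
    {
        refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
            std::cout << member.name << " = " << member(value) << "\n";
        });
    }

For server_params this prints all five registered fields. Fields whose types have no operator<< need extra handling; a recursive variant is sketched after the completion_token_output block below.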
@@ -157,6 +168,15 @@ struct task_server {
bool embedding_mode = false;
};
REFL_TYPE(task_server)
REFL_FIELD(id)
REFL_FIELD(target_id)
REFL_FIELD(type)
REFL_FIELD(data)
REFL_FIELD(infill_mode)
REFL_FIELD(embedding_mode)
REFL_END
struct task_result {
int id;
bool stop;
@@ -193,6 +213,18 @@ struct slot_params
json input_suffix;
};
REFL_TYPE(slot_params)
REFL_FIELD(stream)
REFL_FIELD(cache_prompt)
REFL_FIELD(seed)
REFL_FIELD(n_keep)
REFL_FIELD(n_predict)
REFL_FIELD(antiprompt)
REFL_FIELD(input_prefix)
REFL_FIELD(input_suffix)
REFL_END
struct slot_image
{
int32_t id;
@@ -220,6 +252,17 @@ struct completion_token_output
std::string text_to_send;
};
REFL_TYPE(completion_token_output)
REFL_FIELD(probs)
REFL_FIELD(tok)
REFL_FIELD(text_to_send)
REFL_END
REFL_TYPE(completion_token_output::token_prob)
REFL_FIELD(tok)
REFL_FIELD(prob)
REFL_END
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
size_t i;
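Two things worth noting in the block above. First, the nested struct gets its own REFL_TYPE under its qualified name, completion_token_output::token_prob — exactly what the "::".join([*namespace, node.displayname]) in the generator produces. Second, the probs field is a container of that nested type, so a field printer has to recurse rather than rely on operator<<. A hedged sketch extending the print_fields idea above, assuming refl-cpp's trait::is_reflectable_v:

    #include <iostream>
    #include <refl.hpp>
    #include <vector>

    template <typename T>
    void print_value(std::ostream& os, const T& v);
    template <typename T>
    void print_value(std::ostream& os, const std::vector<T>& v);

    // Reflectable types: recurse over their registered members;
    // everything else falls back to operator<<.
    template <typename T>
    void print_value(std::ostream& os, const T& v)
    {
        if constexpr (refl::trait::is_reflectable_v<T>) {
            os << "{ ";
            refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
                os << member.name << "=";
                print_value(os, member(v));
                os << " ";
            });
            os << "}";
        } else {
            os << v;
        }
    }

    // Containers such as completion_token_output::probs: print per element.
    template <typename T>
    void print_value(std::ostream& os, const std::vector<T>& v)
    {
        os << "[ ";
        for (const auto& e : v) { print_value(os, e); os << " "; }
        os << "]";
    }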
@@ -496,6 +539,51 @@ struct llama_client_slot
}
};
//REFL_TYPE(llama_client_slot::llama_sampling_params)
//REFL_END
REFL_TYPE(llama_client_slot)
REFL_FIELD(id)
REFL_FIELD(task_id)
REFL_FIELD(params)
REFL_FIELD(state)
REFL_FIELD(command)
REFL_FIELD(t_last_used)
REFL_FIELD(n_ctx)
REFL_FIELD(n_past)
REFL_FIELD(n_decoded)
REFL_FIELD(n_remaining)
REFL_FIELD(i_batch)
REFL_FIELD(num_prompt_tokens)
REFL_FIELD(num_prompt_tokens_processed)
REFL_FIELD(multibyte_pending)
REFL_FIELD(prompt)
REFL_FIELD(generated_text)
REFL_FIELD(sampled)
REFL_FIELD(cache_tokens)
REFL_FIELD(generated_token_probs)
REFL_FIELD(infill)
REFL_FIELD(embedding)
REFL_FIELD(has_next_token)
REFL_FIELD(truncated)
REFL_FIELD(stopped_eos)
REFL_FIELD(stopped_word)
REFL_FIELD(stopped_limit)
REFL_FIELD(oaicompat)
REFL_FIELD(oaicompat_model)
REFL_FIELD(stopping_word)
REFL_FIELD(sparams)
REFL_FIELD(ctx_sampling)
REFL_FIELD(images)
REFL_FIELD(sent_count)
REFL_FIELD(sent_token_probs_index)
REFL_FIELD(t_start_process_prompt)
REFL_FIELD(t_start_genereration)
REFL_FIELD(t_prompt_processing)
REFL_FIELD(t_token_generation)
REFL_END
struct llama_server_context
{
llama_model *model = nullptr;
@@ -878,7 +966,7 @@ struct llama_server_context
all_slots_are_idle = false;
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
print_fields(*slot);
return true;
}
@@ -1787,6 +1875,31 @@ struct llama_server_context
}
};
REFL_TYPE(llama_server_context)
REFL_FIELD(model)
REFL_FIELD(ctx)
REFL_FIELD(clp_ctx)
REFL_FIELD(params)
REFL_FIELD(batch)
REFL_FIELD(multimodal)
REFL_FIELD(clean_kv_cache)
REFL_FIELD(all_slots_are_idle)
REFL_FIELD(add_bos_token)
REFL_FIELD(id_gen)
REFL_FIELD(n_ctx)
REFL_FIELD(system_need_update)
REFL_FIELD(system_prompt)
REFL_FIELD(system_tokens)
REFL_FIELD(name_user)
REFL_FIELD(name_assistant)
REFL_FIELD(slots)
REFL_FIELD(queue_tasks)
REFL_FIELD(queue_results)
REFL_FIELD(mutex_tasks)
REFL_FIELD(mutex_results)
REFL_END
static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams)
{
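llama_server_context registers raw pointers (model, ctx, clp_ctx) and std::mutex members. A generic printer can stream a pointer as an address, but std::mutex has no operator<< and is not reflectable out of the box, so dumping this type presumably relies on the same trick used for std::less< ::nlohmann::detail::value_t> near the top of the file: an empty REFL block. A hypothetical example (not in this diff):

    // Assumption: registering an opaque third-party type with zero
    // fields, so reflection-based printing compiles and simply emits
    // an empty member list for it.
    REFL_TYPE(std::mutex)
    REFL_END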
@@ -2497,6 +2610,11 @@ struct token_translator
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
};
REFL_TYPE(token_translator)
REFL_FIELD(ctx)
REFL_END
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
{
auto & gtps = slot->generated_token_probs;
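The function is truncated here; in upstream llama.cpp it walks the collected token probabilities and appends each translated piece to the slot's generated text. A hedged sketch of that pattern, using the token_translator shown above:

    // Sketch (based on upstream llama.cpp, not changed by this diff):
    // translate every collected token and append it to generated_text.
    token_translator translator{llama.ctx};
    for (const completion_token_output & cto : gtps) {
        slot->generated_text += translator(cto);
    }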