Created a Server example

Alisamar Husain 2023-04-17 17:15:57 +05:30
parent 0b2da20538
commit 98de1c349d
4 changed files with 12766 additions and 0 deletions


@@ -35,4 +35,5 @@ else()
 add_subdirectory(perplexity)
 add_subdirectory(embedding)
 add_subdirectory(save-load-state)
+add_subdirectory(server)
 endif()


@@ -0,0 +1,11 @@
set(TARGET server)
add_executable(${TARGET} server.cpp)
# Boost is needed for Crow
find_package(Boost 1.81.0)
include_directories(${Boost_INCLUDE_DIRS})
# Use a single keyword-signature call; mixing the plain and keyword forms of
# target_link_libraries on the same target is a CMake error.
target_link_libraries(${TARGET} PRIVATE common llama ${Boost_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

12643 examples/server/crow.h Normal file

File diff suppressed because it is too large

111 examples/server/server.cpp Normal file

@@ -0,0 +1,111 @@
#include <stdio.h>
#include <iostream>
#include <fstream>
#include "common.h"
#include "llama.h"
#include "crow.h"
auto const BINDPORT = 8001;
int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false)
        return 1;

    if (params.n_ctx > 2048)
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                "expect poor results\n", __func__, params.n_ctx);

    if (params.seed <= 0)
        params.seed = time(NULL);

    llama_context * ctx;

    // load the model
    {
        auto lparams = llama_context_default_params();

        lparams.n_ctx      = params.n_ctx;
        lparams.n_parts    = params.n_parts;
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.use_mlock  = params.use_mlock;
        lparams.logits_all = params.perplexity;
        lparams.embedding  = true;

        ctx = llama_init_from_file(params.model.c_str(), lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
            return 1;
        }
    }

    crow::SimpleApp app;
    // app.loglevel(crow::LogLevel::Warning);

    /// The Python server will send a file name in the request body.
    /// Open that file and pass the stream pointer to run_llama,
    /// which will keep writing its output to it.
    /// The Python server keeps reading from that file, just like it reads
    /// from the stdout of the main process.
    ///
    /// We do this because it is probably the simplest way
    /// to get streaming to work here.

    CROW_ROUTE(app, "/completion").methods("POST"_method)
    ([&params, &ctx](const crow::request& req){
        auto body = crow::json::load(req.body);
        if (!body) return crow::response(crow::status::BAD_REQUEST);

        // Create new params for this request only
        gpt_params runparams = params;

        // Set run params from body
        runparams.prompt         = body["prompt"].s();
        runparams.n_predict      = body["n_predict"].i();
        runparams.top_k          = body["top_k"].i();
        runparams.n_ctx          = body["ctx_size"].i();
        runparams.repeat_last_n  = body["repeat_last_n"].i();
        runparams.top_p          = (float)body["top_p"].d();
        runparams.temp           = (float)body["temp"].d();
        runparams.repeat_penalty = (float)body["repeat_penalty"].d();
        runparams.embedding      = false;

        // Open the tempfile into a stream.
        std::ofstream outfile(body["tempfile"].s(), std::ios::out);

        // Write output of LLaMA to file stream.
        run_llama(ctx, runparams, &outfile);

        return crow::response(crow::status::OK);
    });
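    // Illustrative /completion request body (added for clarity; the values
    // below are made up, but the field names are exactly the ones the
    // handler above reads):
    //   {
    //     "prompt": "Building a website can be done in 10 simple steps:",
    //     "n_predict": 128, "top_k": 40, "ctx_size": 512, "repeat_last_n": 64,
    //     "top_p": 0.9, "temp": 0.8, "repeat_penalty": 1.1,
    //     "tempfile": "/tmp/llama-completion.txt"
    //   }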
    CROW_ROUTE(app, "/embedding").methods("POST"_method)
    ([&params, &ctx](const crow::request& req){
        auto body = crow::json::load(req.body);
        if (!body) return crow::response(crow::status::BAD_REQUEST);

        // Create new params for this request only
        gpt_params runparams = params;

        // Set run params from body
        runparams.prompt    = body["prompt"].s();
        runparams.embedding = true;

        // Open the tempfile into a stream.
        std::ofstream outfile(body["tempfile"].s(), std::ios::out);

        // Write output of LLaMA to file stream.
        run_llama_embedding(ctx, runparams, &outfile);

        return crow::response(crow::status::OK);
    });

    app.port(BINDPORT).multithreaded().run();
    return 0;
}
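
A minimal sketch (not part of this commit) of the reader side of the tempfile protocol described in the comments above: the client creates a temporary file, sends its path in the "tempfile" field of the POST body, and then tails that file while the server appends output to it. The file name "llama-out.txt" and the fixed poll interval are assumptions for illustration; sending the HTTP request itself is left out.

#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>

int main() {
    // Hypothetical tempfile whose path was sent in the "tempfile" field;
    // assumed to exist already (created by the client before the POST).
    std::ifstream in("llama-out.txt");

    std::string line;
    while (true) {
        // Drain whatever the server has written so far.
        while (std::getline(in, line)) {
            std::cout << line << std::endl;
        }
        if (!in.eof()) break;   // a real read error, not just end of data
        in.clear();             // clear eofbit so the next poll can read more
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
    return 0;
}

This sketch loops forever; a real client would stop once the /completion request has returned (or on some sentinel written by run_llama), which is not modelled here.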