Updated Kobold Lite, added debug mode, and changed streaming mode to use the same URL when launching
This commit is contained in:
parent
2499632cdc
commit
e8a389f85b
4 changed files with 33 additions and 6 deletions
1
expose.h
1
expose.h
|
@ -15,6 +15,7 @@ struct load_model_inputs
|
||||||
const bool unban_tokens;
|
const bool unban_tokens;
|
||||||
const int clblast_info = 0;
|
const int clblast_info = 0;
|
||||||
const int blasbatchsize = 512;
|
const int blasbatchsize = 512;
|
||||||
|
const bool debugmode;
|
||||||
};
|
};
|
||||||
struct generation_inputs
|
struct generation_inputs
|
||||||
{
|
{
|
||||||
|
|
|
@ -43,6 +43,7 @@ static int n_batch = 8;
|
||||||
static bool useSmartContext = false;
|
static bool useSmartContext = false;
|
||||||
static bool unbanTokens = false;
|
static bool unbanTokens = false;
|
||||||
static int blasbatchsize = 512;
|
static int blasbatchsize = 512;
|
||||||
|
static bool debugmode = false;
|
||||||
static std::string modelname;
|
static std::string modelname;
|
||||||
static std::vector<gpt_vocab::id> last_n_tokens;
|
static std::vector<gpt_vocab::id> last_n_tokens;
|
||||||
static std::vector<gpt_vocab::id> current_context_tokens;
|
static std::vector<gpt_vocab::id> current_context_tokens;
|
||||||
|
@ -66,6 +67,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
n_batch = params.n_batch = inputs.batch_size;
|
n_batch = params.n_batch = inputs.batch_size;
|
||||||
modelname = params.model = inputs.model_filename;
|
modelname = params.model = inputs.model_filename;
|
||||||
useSmartContext = inputs.use_smartcontext;
|
useSmartContext = inputs.use_smartcontext;
|
||||||
|
debugmode = inputs.debugmode;
|
||||||
unbanTokens = inputs.unban_tokens;
|
unbanTokens = inputs.unban_tokens;
|
||||||
blasbatchsize = inputs.blasbatchsize;
|
blasbatchsize = inputs.blasbatchsize;
|
||||||
params.memory_f16 = inputs.f16_kv;
|
params.memory_f16 = inputs.f16_kv;
|
||||||
|
@ -440,6 +442,26 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
if(debugmode)
|
||||||
|
{
|
||||||
|
printf("\n[Debug: Dump Input Tokens]\n");
|
||||||
|
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
|
||||||
|
{
|
||||||
|
for (auto id : embd_inp)
|
||||||
|
{
|
||||||
|
printf("'%s', ",llama_token_to_str(llama_ctx_v1, id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (auto id : embd_inp)
|
||||||
|
{
|
||||||
|
printf("'%s', ",vocab.id_to_token[id].c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
while (remaining_tokens > 0)
|
while (remaining_tokens > 0)
|
||||||
{
|
{
|
||||||
|
@ -613,5 +635,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
output.status = 1;
|
output.status = 1;
|
||||||
snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
|
snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
|
||||||
|
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
11
koboldcpp.py
11
koboldcpp.py
|
@ -21,7 +21,8 @@ class load_model_inputs(ctypes.Structure):
|
||||||
("use_smartcontext", ctypes.c_bool),
|
("use_smartcontext", ctypes.c_bool),
|
||||||
("unban_tokens", ctypes.c_bool),
|
("unban_tokens", ctypes.c_bool),
|
||||||
("clblast_info", ctypes.c_int),
|
("clblast_info", ctypes.c_int),
|
||||||
("blasbatchsize", ctypes.c_int)]
|
("blasbatchsize", ctypes.c_int),
|
||||||
|
("debugmode", ctypes.c_bool)]
|
||||||
|
|
||||||
class generation_inputs(ctypes.Structure):
|
class generation_inputs(ctypes.Structure):
|
||||||
_fields_ = [("seed", ctypes.c_int),
|
_fields_ = [("seed", ctypes.c_int),
|
||||||
|
@ -114,6 +115,7 @@ def load_model(model_filename):
|
||||||
clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
|
clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
|
||||||
inputs.clblast_info = clblastids
|
inputs.clblast_info = clblastids
|
||||||
inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
|
inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
|
||||||
|
inputs.debugmode = args.debugmode
|
||||||
ret = handle.load_model(inputs)
|
ret = handle.load_model(inputs)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
@ -461,9 +463,9 @@ def main(args):
|
||||||
print(f"Starting Kobold HTTP Server on port {args.port}")
|
print(f"Starting Kobold HTTP Server on port {args.port}")
|
||||||
epurl = ""
|
epurl = ""
|
||||||
if args.host=="":
|
if args.host=="":
|
||||||
epurl = f"http://localhost:{args.port}" + ("?streaming=1" if args.stream else "")
|
epurl = f"http://localhost:{args.port}"
|
||||||
else:
|
else:
|
||||||
epurl = f"http://{args.host}:{args.port}" + ("?streaming=1" if args.stream else "")
|
epurl = f"http://{args.host}:{args.port}"
|
||||||
|
|
||||||
if args.launch:
|
if args.launch:
|
||||||
try:
|
try:
|
||||||
|
@ -496,11 +498,12 @@ if __name__ == '__main__':
|
||||||
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
||||||
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
||||||
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[32,64,128,256,512,1024], default=512)
|
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[32,64,128,256,512,1024], default=512)
|
||||||
parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
|
parser.add_argument("--stream", help="Uses pseudo streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
|
||||||
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
|
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
|
||||||
parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
|
parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
|
||||||
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
|
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
|
||||||
parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
|
parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
|
||||||
|
parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", action='store_true')
|
||||||
compatgroup = parser.add_mutually_exclusive_group()
|
compatgroup = parser.add_mutually_exclusive_group()
|
||||||
compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
|
compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
|
||||||
compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
|
compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue