Merge branch 'master' into compilade/lazy-convert-hf
This commit is contained in:
commit
68c5ac628f
54 changed files with 1151 additions and 771 deletions
|
@ -32,6 +32,7 @@ struct split_params {
|
|||
int n_split_tensors = 128;
|
||||
std::string input;
|
||||
std::string output;
|
||||
bool no_tensor_first_split = false;
|
||||
bool dry_run = false;
|
||||
};
|
||||
|
||||
|
@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
|
|||
printf(" --merge merge multiple GGUF to a single GGUF\n");
|
||||
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
|
||||
printf(" --split-max-size N(M|G) max size per split\n");
|
||||
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
|
||||
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
|
|||
arg_found = true;
|
||||
params.dry_run = true;
|
||||
}
|
||||
if (arg == "--no-tensor-first-split") {
|
||||
arg_found = true;
|
||||
params.no_tensor_first_split = true;
|
||||
}
|
||||
|
||||
if (is_op_set) {
|
||||
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
|
||||
|
@ -200,10 +206,10 @@ struct split_strategy {
|
|||
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
|
||||
int i_split = -1;
|
||||
struct gguf_context * ctx_out = NULL;
|
||||
auto new_ctx_out = [&]() {
|
||||
auto new_ctx_out = [&](bool allow_no_tensors) {
|
||||
i_split++;
|
||||
if (ctx_out != NULL) {
|
||||
if (gguf_get_n_tensors(ctx_out) == 0) {
|
||||
if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
|
||||
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
@ -220,7 +226,12 @@ struct split_strategy {
|
|||
};
|
||||
|
||||
// initialize ctx_out for the first split
|
||||
new_ctx_out();
|
||||
new_ctx_out(false);
|
||||
|
||||
// skip first split if no_tensor_first_split is set
|
||||
if (params.no_tensor_first_split) {
|
||||
new_ctx_out(true);
|
||||
}
|
||||
|
||||
// process tensors one by one
|
||||
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
|
||||
|
@ -230,7 +241,7 @@ struct split_strategy {
|
|||
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
|
||||
size_t next_tensors_size = curr_tensors_size + n_bytes;
|
||||
if (should_split(i, next_tensors_size)) {
|
||||
new_ctx_out();
|
||||
new_ctx_out(false);
|
||||
curr_tensors_size = n_bytes;
|
||||
} else {
|
||||
curr_tensors_size = next_tensors_size;
|
||||
|
|
|
@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
|
|||
echo PASS
|
||||
echo
|
||||
|
||||
# 4. Split with no tensor in metadata
|
||||
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
|
||||
#echo PASS
|
||||
#echo
|
||||
# 4. Split with no tensors in the first split
|
||||
$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# 4b. Test the sharded model is loading properly
|
||||
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
|
||||
#echo PASS
|
||||
#echo
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
# 5. Merge
|
||||
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
|
||||
|
|
|
@ -178,6 +178,7 @@ struct cmd_params {
|
|||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
|
@ -200,6 +201,7 @@ static const cmd_params cmd_params_defaults = {
|
|||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
|
@ -224,6 +226,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
|
@ -396,6 +399,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
||||
} else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
} else if (arg == "-fa" || arg == "--flash-attn") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -1215,6 +1229,7 @@ int main(int argc, char ** argv) {
|
|||
llama_log_set(llama_null_log_callback, NULL);
|
||||
}
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// initialize printer
|
||||
std::unique_ptr<printer> p;
|
||||
|
|
|
@ -1383,9 +1383,10 @@ struct server_context {
|
|||
if (!slot.params.stream && slot.stopped_word) {
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
|
||||
|
||||
size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
|
||||
probs = std::vector<completion_token_output>(
|
||||
slot.generated_token_probs.begin(),
|
||||
slot.generated_token_probs.end() - stop_word_toks.size());
|
||||
slot.generated_token_probs.end() - safe_offset);
|
||||
} else {
|
||||
probs = std::vector<completion_token_output>(
|
||||
slot.generated_token_probs.begin(),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue