Merge 09279c86ce into 948ff137ec
commit b5ff3e45ee
4 changed files with 30 additions and 30 deletions
@@ -40,13 +40,13 @@ class Model:
         self.ftype = ftype
         self.fname_out = fname_out
         self.is_big_endian = is_big_endian
-        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.is_safetensors = self._is_model_safetensors()
         self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
         self.part_names = self._get_part_names()
         self.hparams = Model.load_hparams(self.dir_model)
         self.model_arch = self._get_model_architecture()
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianness=self.endianness)

     def set_vocab(self):
         self._set_vocab_gpt2()
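The hunk above renames the attribute and the constructor keyword together, so the `GGUFWriter` call stays consistent. A minimal usage sketch of the renamed keyword, assuming the post-commit `GGUFWriter` signature (the output path and architecture name here are hypothetical):

```python
import gguf

# Choose the byte order from a boolean flag, as Model.__init__ does above.
is_big_endian = False
endianness = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE

# Hypothetical path and arch name; `endianness=` is the keyword as renamed by this commit.
writer = gguf.GGUFWriter("model.gguf", "llama", endianness=endianness)
```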
convert.py (18 changed lines)
@@ -842,8 +842,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianness=endianness)

     def add_meta_arch(self, params: Params) -> None:
         name = "LLaMA"
@@ -932,10 +932,10 @@ class OutputFile:
         self.gguf.close()

     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianness:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

         # meta data
         of.add_meta_arch(params)
@@ -960,10 +960,10 @@ class OutputFile:
             return dt.quantize(arr)

     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianness: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

-        of = OutputFile(fname_out, endianess=endianess)
+        of = OutputFile(fname_out, endianness=endianness)

         # meta data
         of.add_meta_arch(params)
@@ -1205,9 +1205,9 @@ def main(args_in: list[str] | None = None) -> None:
     if args.dump:
         do_dump_model(model_plus)
         return
-    endianess = gguf.GGUFEndian.LITTLE
+    endianness = gguf.GGUFEndian.LITTLE
     if args.bigendian:
-        endianess = gguf.GGUFEndian.BIG
+        endianness = gguf.GGUFEndian.BIG

     params = Params.load(model_plus)
     if params.n_ctx == -1:
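For context, `args.bigendian` in the hunk above is a plain boolean flag that selects the big-endian enum value. A sketch of how such a flag could be wired up with argparse (the parser definition is an assumption; only `args.bigendian` itself appears in this diff):

```python
import argparse
import gguf

# Hypothetical parser wiring for the --bigendian flag used above.
parser = argparse.ArgumentParser()
parser.add_argument("--bigendian", action="store_true",
                    help="write the GGUF file in big-endian byte order")
args = parser.parse_args([])  # empty list: defaults, i.e. little-endian

endianness = gguf.GGUFEndian.BIG if args.bigendian else gguf.GGUFEndian.LITTLE
```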
@@ -1260,7 +1260,7 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianness=endianness)
     print(f"Wrote {outfile}")

@@ -408,7 +408,7 @@ struct llama_client_slot
    size_t sent_token_probs_index = 0;

    int64_t t_start_process_prompt;
-   int64_t t_start_genereration;
+   int64_t t_start_generation;

    double t_prompt_processing; // ms
    double t_token_generation; // ms
@@ -475,12 +475,12 @@ struct llama_client_slot
    void release() {
        if (state == IDLE || state == PROCESSING)
        {
-           t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
+           t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
            command = RELEASE;
        }
    }

-   json get_formated_timings() {
+   json get_formatted_timings() {
        return json
        {
            {"prompt_n", num_prompt_tokens_processed},
@@ -1159,10 +1159,10 @@ struct llama_server_context

    json get_model_props()
    {
-       return get_formated_generation(slots[0]);
+       return get_formatted_generation(slots[0]);
    }

-   json get_formated_generation(llama_client_slot &slot)
+   json get_formatted_generation(llama_client_slot &slot)
    {
        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
@@ -1253,7 +1253,7 @@ struct llama_server_context
            {"model", params.model_alias},
            {"tokens_predicted", slot.n_decoded},
            {"tokens_evaluated", slot.num_prompt_tokens},
-           {"generation_settings", get_formated_generation(slot)},
+           {"generation_settings", get_formatted_generation(slot)},
            {"prompt", slot.prompt},
            {"truncated", slot.truncated},
            {"stopped_eos", slot.stopped_eos},
@@ -1261,7 +1261,7 @@ struct llama_server_context
            {"stopped_limit", slot.stopped_limit},
            {"stopping_word", slot.stopping_word},
            {"tokens_cached", slot.n_past},
-           {"timings", slot.get_formated_timings()}
+           {"timings", slot.get_formatted_timings()}
        };

        if (slot.sparams.n_probs > 0)
@@ -1680,7 +1680,7 @@ struct llama_server_context
                    slot.command = NONE;
                    std::vector<llama_token> prompt_tokens;
                    slot.t_start_process_prompt = ggml_time_us();
-                   slot.t_start_genereration = 0;
+                   slot.t_start_generation = 0;

                    if (slot.infill)
                    {
@@ -1870,8 +1870,8 @@ struct llama_server_context

            if (slot.n_decoded == 1)
            {
-               slot.t_start_genereration = ggml_time_us();
-               slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+               slot.t_start_generation = ggml_time_us();
+               slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
            }

            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
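The timing logic above stamps `t_start_generation` when the first token is decoded, then converts the microsecond delta to milliseconds with `/ 1e3`. A self-contained Python sketch of the same bookkeeping, with `time.monotonic_ns()` standing in for `ggml_time_us()`:

```python
import time

def now_us() -> int:
    # Monotonic microsecond clock, standing in for ggml_time_us().
    return time.monotonic_ns() // 1_000

t_start_process_prompt = now_us()
# ... prompt evaluation would happen here ...
t_start_generation = now_us()  # stamped when the first token is decoded

# Microseconds / 1e3 -> milliseconds, matching the C++ above.
t_prompt_processing_ms = (t_start_generation - t_start_process_prompt) / 1e3
print(f"prompt processing took {t_prompt_processing_ms:.3f} ms")
```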
@@ -2294,13 +2294,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
-           std::string systm_content;
+           std::string system_content;
            std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
-               std::back_inserter(systm_content)
+               std::back_inserter(system_content)
            );
-           llama.process_system_prompt_data(json::parse(systm_content));
+           llama.process_system_prompt_data(json::parse(system_content));
        }
        else if(arg == "--mmproj")
        {
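The `std::copy` over `istreambuf_iterator`s above reads the whole system-prompt file into a string before handing it to `json::parse`. The same slurp-then-parse step in Python, for illustration (the file name is hypothetical):

```python
import json
from pathlib import Path

# Read the entire file into a string, then parse it, mirroring the C++ above.
system_content = Path("system_prompt.json").read_text()
data = json.loads(system_content)
```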
@@ -50,11 +50,11 @@ class GGUFWriter:

    def __init__(
        self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True,
-       endianess: GGUFEndian = GGUFEndian.LITTLE,
+       endianness: GGUFEndian = GGUFEndian.LITTLE,
    ):
        self.fout = open(path, "wb")
        self.arch = arch
-       self.endianess = endianess
+       self.endianness = endianness
        self.offset_tensor = 0
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = bytearray()
@@ -65,7 +65,7 @@ class GGUFWriter:
        self.temp_file = None
        self.tensors = []
        print("gguf: This GGUF file is for {0} Endian only".format(
-           "Big" if self.endianess == GGUFEndian.BIG else "Little",
+           "Big" if self.endianness == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.EMPTY

@@ -218,7 +218,7 @@ class GGUFWriter:
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
        raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
-       if self.endianess == GGUFEndian.BIG:
+       if self.endianness == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
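`add_tensor` byte-swaps tensor data in place before writing big-endian files, because numpy arrays hold native-order (little-endian on typical hosts) data. A quick demonstration of the `byteswap` call used above:

```python
import numpy as np

t = np.arange(4, dtype=np.int32)  # stored little-endian on typical hosts
print(t.tobytes()[:8].hex())      # '0000000001000000': 0 then 1, little-endian
t.byteswap(inplace=True)          # reverse the bytes of each element, in place
print(t.tobytes()[:8].hex())      # '0000000000000001': same values, big-endian
```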
@@ -244,7 +244,7 @@ class GGUFWriter:
        if self.state is not WriterState.TI_DATA:
            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')

-       if self.endianess == GGUFEndian.BIG:
+       if self.endianness == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        self.write_padding(self.fout, self.fout.tell())
        tensor.tofile(self.fout)
@@ -411,7 +411,7 @@ class GGUFWriter:
    def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
        pack_prefix = ''
        if not skip_pack_prefix:
-           pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
+           pack_prefix = '<' if self.endianness == GGUFEndian.LITTLE else '>'
        return struct.pack(f'{pack_prefix}{fmt}', value)

    def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
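`_pack` relies on the byte-order prefixes of Python's `struct` module: `'<'` forces little-endian packing and `'>'` big-endian, regardless of the host. A two-line illustration:

```python
import struct

print(struct.pack('<I', 1).hex())  # '01000000', little-endian uint32
print(struct.pack('>I', 1).hex())  # '00000001', big-endian uint32
```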