bug fixes for convert-train-checkpoint-to-gguf
This commit is contained in:
parent
c690c20362
commit
5f27ade48e
1 changed file with 91 additions and 89 deletions
|
@ -4,7 +4,9 @@
|
||||||
import argparse
|
import argparse
|
||||||
import gguf
|
import gguf
|
||||||
import os
|
import os
|
||||||
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
import numpy as np
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# gguf constants
|
# gguf constants
|
||||||
|
@ -55,7 +57,10 @@ class Tensor:
|
||||||
self.ne = ne
|
self.ne = ne
|
||||||
self.nbytes = 0
|
self.nbytes = 0
|
||||||
if self.dtype == 'f':
|
if self.dtype == 'f':
|
||||||
self.nbytes = product(self.ne) * 4
|
if len(self.ne) == 0:
|
||||||
|
self.nbytes = 0
|
||||||
|
else:
|
||||||
|
self.nbytes = int(np.product(self.ne)) * 4
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||||
|
|
||||||
|
@ -67,7 +72,7 @@ class Tensor:
|
||||||
assert(nd == len(self.ne))
|
assert(nd == len(self.ne))
|
||||||
ne = []
|
ne = []
|
||||||
for d in range(nd):
|
for d in range(nd):
|
||||||
n = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8
|
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
ne.append(n)
|
ne.append(n)
|
||||||
|
|
||||||
assert(tuple(ne) == tuple(self.ne))
|
assert(tuple(ne) == tuple(self.ne))
|
||||||
|
@ -81,7 +86,7 @@ class Tensor:
|
||||||
# 32-byte alignment
|
# 32-byte alignment
|
||||||
offset += (0 - offset) & 31
|
offset += (0 - offset) & 31
|
||||||
self.data = data[offset:offset+self.nbytes]
|
self.data = data[offset:offset+self.nbytes]
|
||||||
|
offset += self.nbytes
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
def max_storage_size(self):
|
def max_storage_size(self):
|
||||||
|
@ -100,7 +105,7 @@ class Tensor:
|
||||||
name=name,
|
name=name,
|
||||||
tensor=self.data,
|
tensor=self.data,
|
||||||
raw_shape=np.array(list(reversed(self.ne))),
|
raw_shape=np.array(list(reversed(self.ne))),
|
||||||
raw_type=gguf.GGMLQuantizationType.F32)
|
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||||
|
|
||||||
class OptimizationParamsV0:
|
class OptimizationParamsV0:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -141,26 +146,26 @@ class OptimizationContext:
|
||||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||||
offset += 4
|
offset += 4
|
||||||
|
|
||||||
if version == 0:
|
if self.version == 0:
|
||||||
params = OptimizationParamsV0()
|
params = OptimizationParamsV0()
|
||||||
offset += params.load(data, offset)
|
offset = params.load(data, offset)
|
||||||
self.past = params.past
|
self.past = params.past
|
||||||
self.lbfgs_m = params.lbfgs_m
|
self.lbfgs_m = params.lbfgs_m
|
||||||
self.nx = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8
|
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||||
self.type = params.type
|
self.type = params.type
|
||||||
|
|
||||||
self.adam_m = Tensor('f', [self.nx])
|
self.adam_m = Tensor('f', [self.nx])
|
||||||
self.adam_v = Tensor('f', [self.nx])
|
self.adam_v = Tensor('f', [self.nx])
|
||||||
self.adam_pf = Tensor('f', [self.past])
|
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
|
|
||||||
self.lbfgs_x = Tensor('f', [self.nx])
|
self.lbfgs_x = Tensor('f', [self.nx])
|
||||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||||
self.lbfgs_g = Tensor('f', [self.nx])
|
self.lbfgs_g = Tensor('f', [self.nx])
|
||||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||||
self.lbfgs_d = Tensor('f', [self.nx])
|
self.lbfgs_d = Tensor('f', [self.nx])
|
||||||
self.lbfgs_pf = Tensor('f', [self.past])
|
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||||
|
@ -174,32 +179,30 @@ class OptimizationContext:
|
||||||
mh = Tensor('f', [self.nx])
|
mh = Tensor('f', [self.nx])
|
||||||
mv = Tensor('f', [self.nx])
|
mv = Tensor('f', [self.nx])
|
||||||
|
|
||||||
offset += x.load(data, offset)
|
offset = x.load(data, offset)
|
||||||
offset += g.load(data, offset)
|
offset = g.load(data, offset)
|
||||||
offset += g2.load(data, offset)
|
offset = g2.load(data, offset)
|
||||||
offset += self.adam_m.load(data, offset)
|
offset = self.adam_m.load(data, offset)
|
||||||
offset += self.adam_v.load(data, offset)
|
offset = self.adam_v.load(data, offset)
|
||||||
offset += mh.load(data, offset)
|
offset = mh.load(data, offset)
|
||||||
offset += vh.load(data, offset)
|
offset = vh.load(data, offset)
|
||||||
if self.past > 0:
|
offset = self.adam_pf.load(data, offset)
|
||||||
offset += self.adam_pf.load(data, offset)
|
|
||||||
|
|
||||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
elif self.type == 1:
|
elif self.type == 1:
|
||||||
offset += self.lbfgs_x.load(data, offset)
|
offset = self.lbfgs_x.load(data, offset)
|
||||||
offset += self.lbfgs_xp.load(data, offset)
|
offset = self.lbfgs_xp.load(data, offset)
|
||||||
offset += self.lbfgs_g.load(data, offset)
|
offset = self.lbfgs_g.load(data, offset)
|
||||||
offset += self.lbfgs_gp.load(data, offset)
|
offset = self.lbfgs_gp.load(data, offset)
|
||||||
offset += self.lbfgs_d.load(data, offset)
|
offset = self.lbfgs_d.load(data, offset)
|
||||||
if self.past > 0:
|
offset = self.lbfgs_pf.load(data, offset)
|
||||||
offset += self.lbfgs_pf.load(data, offset)
|
offset = self.lbfgs_lmal.load(data, offset)
|
||||||
offset += self.lbfgs_lmal.load(data, offset)
|
offset = self.lbfgs_lmys.load(data, offset)
|
||||||
offset += self.lbfgs_lmys.load(data, offset)
|
offset = self.lbfgs_lms.load(data, offset)
|
||||||
offset += self.lbfgs_lms.load(data, offset)
|
offset = self.lbfgs_lmy.load(data, offset)
|
||||||
offset += self.lbfgs_lmy.load(data, offset)
|
|
||||||
|
|
||||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
@ -212,23 +215,23 @@ class OptimizationContext:
|
||||||
raise ValueError('Unknown optimizer type')
|
raise ValueError('Unknown optimizer type')
|
||||||
|
|
||||||
|
|
||||||
elif version == 1:
|
elif self.version == 1:
|
||||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.nx = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8
|
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||||
|
|
||||||
self.adam_m = Tensor('f', [self.nx])
|
self.adam_m = Tensor('f', [self.nx])
|
||||||
self.adam_w = Tensor('f', [self.nx])
|
self.adam_v = Tensor('f', [self.nx])
|
||||||
self.adam_pf = Tensor('f', [self.past])
|
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
|
|
||||||
self.lbfgs_x = Tensor('f', [self.nx])
|
self.lbfgs_x = Tensor('f', [self.nx])
|
||||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||||
self.lbfgs_g = Tensor('f', [self.nx])
|
self.lbfgs_g = Tensor('f', [self.nx])
|
||||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||||
self.lbfgs_d = Tensor('f', [self.nx])
|
self.lbfgs_d = Tensor('f', [self.nx])
|
||||||
self.lbfgs_pf = Tensor('f', [self.past])
|
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||||
|
@ -237,14 +240,14 @@ class OptimizationContext:
|
||||||
# forgot to save type in version 1:
|
# forgot to save type in version 1:
|
||||||
# guess self.type from number of remaining bytes
|
# guess self.type from number of remaining bytes
|
||||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||||
[self.adam_m, self.adam_w]
|
[self.adam_m, self.adam_v]
|
||||||
+[self.adam_pf] if self.past > 0 else []])
|
+([self.adam_pf] if (self.past > 0) else [])])
|
||||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||||
self.lbfgs_lms, self.lbfgs_lmy]
|
self.lbfgs_lms, self.lbfgs_lmy]
|
||||||
+[self.lbfgs_pf] if self.past > 0 else []])
|
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||||
# due to alignment padding the size might not be exact
|
# due to alignment padding the size might not be exact
|
||||||
# but the difference in size for both types is significant,
|
# but the difference in size for both types is significant,
|
||||||
# so we can just use whichever is closest
|
# so we can just use whichever is closest
|
||||||
|
@ -255,28 +258,25 @@ class OptimizationContext:
|
||||||
self.type = 1
|
self.type = 1
|
||||||
|
|
||||||
if self.type == 0:
|
if self.type == 0:
|
||||||
offset += self.adam_m.load(data, offset)
|
offset = self.adam_m.load(data, offset)
|
||||||
offset += self.adam_w.load(data, offset)
|
offset = self.adam_v.load(data, offset)
|
||||||
if self.past > 0:
|
offset = self.adam_pf.load(data,offset)
|
||||||
offset += self.adam_pf.load(data,offset)
|
|
||||||
|
|
||||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
elif self.type == 1:
|
elif self.type == 1:
|
||||||
|
offset = self.lbfgs_x.load(data, offset)
|
||||||
offset += self.lbfgs_x.load(data, offset)
|
offset = self.lbfgs_xp.load(data, offset)
|
||||||
offset += self.lbfgs_xp.load(data, offset)
|
offset = self.lbfgs_g.load(data, offset)
|
||||||
offset += self.lbfgs_g.load(data, offset)
|
offset = self.lbfgs_gp.load(data, offset)
|
||||||
offset += self.lbfgs_gp.load(data, offset)
|
offset = self.lbfgs_d.load(data, offset)
|
||||||
offset += self.lbfgs_d.load(data, offset)
|
offset = self.lbfgs_pf.load(data, offset)
|
||||||
if self.past > 0:
|
offset = self.lbfgs_lmal.load(data, offset)
|
||||||
offset += self.lbfgs_pf.load(data, offset)
|
offset = self.lbfgs_lmys.load(data, offset)
|
||||||
offset += self.lbfgs_lmal.load(data, offset)
|
offset = self.lbfgs_lms.load(data, offset)
|
||||||
offset += self.lbfgs_lmys.load(data, offset)
|
offset = self.lbfgs_lmy.load(data, offset)
|
||||||
offset += self.lbfgs_lms.load(data, offset)
|
|
||||||
offset += self.lbfgs_lmy.load(data, offset)
|
|
||||||
|
|
||||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
@ -295,7 +295,7 @@ class OptimizationContext:
|
||||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||||
|
|
||||||
if self.type == 0:
|
if self.type == 0:
|
||||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||||
|
@ -357,6 +357,9 @@ class ModelParams:
|
||||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||||
|
|
||||||
|
def tensor_name(key, bid=None):
|
||||||
|
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
||||||
|
|
||||||
class Layer:
|
class Layer:
|
||||||
def __init__(self, params, bid):
|
def __init__(self, params, bid):
|
||||||
self.bid = bid
|
self.bid = bid
|
||||||
|
@ -371,27 +374,27 @@ class Layer:
|
||||||
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||||
|
|
||||||
def load(self, data, offset):
|
def load(self, data, offset):
|
||||||
offset += self.att_norm.load(data, offset)
|
offset = self.att_norm.load(data, offset)
|
||||||
offset += self.wq.load(data, offset)
|
offset = self.wq.load(data, offset)
|
||||||
offset += self.wk.load(data, offset)
|
offset = self.wk.load(data, offset)
|
||||||
offset += self.wv.load(data, offset)
|
offset = self.wv.load(data, offset)
|
||||||
offset += self.wo.load(data, offset)
|
offset = self.wo.load(data, offset)
|
||||||
offset += self.ffn_norm.load(data, offset)
|
offset = self.ffn_norm.load(data, offset)
|
||||||
offset += self.w1.load(data, offset)
|
offset = self.w1.load(data, offset)
|
||||||
offset += self.w2.load(data, offset)
|
offset = self.w2.load(data, offset)
|
||||||
offset += self.w3.load(data, offset)
|
offset = self.w3.load(data, offset)
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
def save_gguf(self, gguf_writer):
|
def save_gguf(self, gguf_writer):
|
||||||
self.att_norm.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.ATTN_NORM].format(bid=self.bid))
|
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
|
||||||
self.wq.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.ATTN_Q].format(bid=self.bid))
|
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
|
||||||
self.wk.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.ATTN_K].format(bid=self.bid))
|
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
|
||||||
self.wv.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.ATTN_V].format(bid=self.bid))
|
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
|
||||||
self.wo.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.ATTN_OUT].format(bid=self.bid))
|
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
|
||||||
self.ffn_norm.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.FFN_NORM].format(bid=self.bid))
|
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
|
||||||
self.w1.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.FFN_GATE].format(bid=self.bid))
|
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
|
||||||
self.w2.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.FFN_DOWN].format(bid=self.bid))
|
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
|
||||||
self.w3.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.FFN_UP].format(bid=self.bid))
|
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
|
||||||
|
|
||||||
class Model:
|
class Model:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -405,14 +408,14 @@ class Model:
|
||||||
self.norm = Tensor('f', [self.params.n_embd])
|
self.norm = Tensor('f', [self.params.n_embd])
|
||||||
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||||
|
|
||||||
offset += self.tok_embd.load(data, offset)
|
offset = self.tok_embd.load(data, offset)
|
||||||
offset += self.norm.load(data, offset)
|
offset = self.norm.load(data, offset)
|
||||||
offset += self.output.load(data, offset)
|
offset = self.output.load(data, offset)
|
||||||
|
|
||||||
self.layers.clear()
|
self.layers.clear()
|
||||||
for bid in range(self.n_layer):
|
for bid in range(self.params.n_layer):
|
||||||
layer = Layer(self.params, bid)
|
layer = Layer(self.params, bid)
|
||||||
offset += layer.load(data, offset)
|
offset = layer.load(data, offset)
|
||||||
self.layers.append(layer)
|
self.layers.append(layer)
|
||||||
|
|
||||||
return offset
|
return offset
|
||||||
|
@ -420,9 +423,9 @@ class Model:
|
||||||
def save_gguf(self, gguf_writer):
|
def save_gguf(self, gguf_writer):
|
||||||
self.params.save_gguf(gguf_writer)
|
self.params.save_gguf(gguf_writer)
|
||||||
|
|
||||||
self.tok_embd.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.TOKEN_EMBD])
|
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
|
||||||
self.norm.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.OUTPUT_NORM])
|
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
|
||||||
self.output.save_gguf(gguf_writer, name=gguf.MODEL_TENSOR_NAMES[gguf_writer.arch][gguf.MODEL_TENSOR.OUTPUT])
|
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
|
||||||
|
|
||||||
for layer in self.layers:
|
for layer in self.layers:
|
||||||
layer.save_gguf(gguf_writer)
|
layer.save_gguf(gguf_writer)
|
||||||
|
@ -433,25 +436,24 @@ class Checkpoint:
|
||||||
self.opt_ctx = OptimizationContext()
|
self.opt_ctx = OptimizationContext()
|
||||||
|
|
||||||
def load(self, data, offset):
|
def load(self, data, offset):
|
||||||
magic = bytes(data[offset:offset + 4]); offset += 4
|
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||||
if magic != b'ggcp':
|
if magic != b'ggcp':
|
||||||
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
||||||
|
|
||||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
if version != 0:
|
if self.version != 0:
|
||||||
raise ValueError('Invalid version of checkpoint file')
|
raise ValueError('Invalid version of checkpoint file')
|
||||||
|
|
||||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
offset += self.model.load(data, offset)
|
offset = self.model.load(data, offset)
|
||||||
offset += self.opt_ctx.load(data, offset)
|
offset = self.opt_ctx.load(data, offset)
|
||||||
|
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
def save_gguf(self, gguf_writer):
|
def save_gguf(self, gguf_writer):
|
||||||
gguf_writer.add_architecture()
|
|
||||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||||
|
@ -463,8 +465,8 @@ class Checkpoint:
|
||||||
|
|
||||||
def handle_args():
|
def handle_args():
|
||||||
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
||||||
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename')
|
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
|
||||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
|
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -476,7 +478,7 @@ def main():
|
||||||
# we should have read all available data
|
# we should have read all available data
|
||||||
assert(offset == len(data))
|
assert(offset == len(data))
|
||||||
|
|
||||||
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||||
chk.save_gguf(gguf_writer)
|
chk.save_gguf(gguf_writer)
|
||||||
print(" gguf: write header")
|
print(" gguf: write header")
|
||||||
gguf_writer.write_header_to_file()
|
gguf_writer.write_header_to_file()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue