diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index d7ea4e6fe..a69a9687d 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -4,7 +4,9 @@ import argparse import gguf import os +import struct import sys +import numpy as np from pathlib import Path # gguf constants @@ -55,7 +57,10 @@ class Tensor: self.ne = ne self.nbytes = 0 if self.dtype == 'f': - self.nbytes = product(self.ne) * 4 + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.product(self.ne)) * 4 else: raise ValueError(f"Unhandled data type '{self.dtype}'") @@ -67,7 +72,7 @@ class Tensor: assert(nd == len(self.ne)) ne = [] for d in range(nd): - n = struct.unpack(' 0 else []) self.lbfgs_x = Tensor('f', [self.nx]) self.lbfgs_xp = Tensor('f', [self.nx]) self.lbfgs_g = Tensor('f', [self.nx]) self.lbfgs_gp = Tensor('f', [self.nx]) self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) @@ -174,32 +179,30 @@ class OptimizationContext: mh = Tensor('f', [self.nx]) mv = Tensor('f', [self.nx]) - offset += x.load(data, offset) - offset += g.load(data, offset) - offset += g2.load(data, offset) - offset += self.adam_m.load(data, offset) - offset += self.adam_v.load(data, offset) - offset += mh.load(data, offset) - offset += vh.load(data, offset) - if self.past > 0: - offset += self.adam_pf.load(data, offset) + offset = x.load(data, offset) + offset = g.load(data, offset) + offset = g2.load(data, offset) + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = mh.load(data, offset) + offset = vh.load(data, offset) + offset = self.adam_pf.load(data, offset) self.adam_fx_best = struct.unpack(' 0: - offset += self.lbfgs_pf.load(data, offset) - offset += self.lbfgs_lmal.load(data, offset) - offset += self.lbfgs_lmys.load(data, offset) - offset += self.lbfgs_lms.load(data, offset) - offset += self.lbfgs_lmy.load(data, offset) + offset = self.lbfgs_x.load(data, offset) + offset = self.lbfgs_xp.load(data, offset) + offset = self.lbfgs_g.load(data, offset) + offset = self.lbfgs_gp.load(data, offset) + offset = self.lbfgs_d.load(data, offset) + offset = self.lbfgs_pf.load(data, offset) + offset = self.lbfgs_lmal.load(data, offset) + offset = self.lbfgs_lmys.load(data, offset) + offset = self.lbfgs_lms.load(data, offset) + offset = self.lbfgs_lmy.load(data, offset) self.lbfgs_fx_best = struct.unpack(' 0 else []) self.lbfgs_x = Tensor('f', [self.nx]) self.lbfgs_xp = Tensor('f', [self.nx]) self.lbfgs_g = Tensor('f', [self.nx]) self.lbfgs_gp = Tensor('f', [self.nx]) self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) @@ -237,14 +240,14 @@ class OptimizationContext: # forgot to save type in version 1: # guess self.type from number of remaining bytes size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_w] - +[self.adam_pf] if self.past > 0 else []]) + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) size_type_1 = 24 + sum([t.max_storage_size() for t in [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, self.lbfgs_lmal, self.lbfgs_lmys, self.lbfgs_lms, self.lbfgs_lmy] - +[self.lbfgs_pf] if self.past > 0 else []]) + +([self.lbfgs_pf] if (self.past > 0) else [])]) # due to alignment padding the size might not by exact # but the difference in size for both types is significant, # so we can just use whichever is closest @@ -255,28 +258,25 @@ class OptimizationContext: self.type = 1 if self.type == 0: - offset += self.adam_m.load(data, offset) - offset += self.adam_w.load(data, offset) - if self.past > 0: - offset += self.adam_pf.load(data,offset) + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = self.adam_pf.load(data,offset) self.adam_fx_best = struct.unpack(' 0: - offset += self.lbfgs_pf.load(data, offset) - offset += self.lbfgs_lmal.load(data, offset) - offset += self.lbfgs_lmys.load(data, offset) - offset += self.lbfgs_lms.load(data, offset) - offset += self.lbfgs_lmy.load(data, offset) + offset = self.lbfgs_x.load(data, offset) + offset = self.lbfgs_xp.load(data, offset) + offset = self.lbfgs_g.load(data, offset) + offset = self.lbfgs_gp.load(data, offset) + offset = self.lbfgs_d.load(data, offset) + offset = self.lbfgs_pf.load(data, offset) + offset = self.lbfgs_lmal.load(data, offset) + offset = self.lbfgs_lmys.load(data, offset) + offset = self.lbfgs_lms.load(data, offset) + offset = self.lbfgs_lmy.load(data, offset) self.lbfgs_fx_best = struct.unpack('