Merge branch 'ggerganov:master' into master

commit 8411b02fc7

11 changed files with 1940 additions and 2508 deletions
@@ -15,6 +15,7 @@
#include <string>
#include <unordered_set>
#include <vector>
#include <cinttypes>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>

@@ -938,8 +939,8 @@ std::string get_sortable_timestamp() {

const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
current_time.time_since_epoch() % 1000000000).count();
char timestamp_ns[10];
snprintf(timestamp_ns, 11, "%09ld", ns);
char timestamp_ns[11];
snprintf(timestamp_ns, 11, "%09" PRId64, ns);

return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
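The hunk above fixes two issues in the nanosecond suffix: the old code wrote through a 10-byte buffer while telling snprintf the buffer held 11 bytes, and "%09ld" is the wrong conversion for an int64_t on platforms where long is 32-bit. A minimal standalone sketch of the corrected pattern, with illustrative values only (the date prefix below is assumed, not taken from the real get_sortable_timestamp):

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <string>

int main() {
    const int64_t ns = 12345678;                         // illustrative nanosecond remainder
    char timestamp_ns[11];                                // 9 digits + '\0' fit; 11 matches the size passed below
    std::snprintf(timestamp_ns, 11, "%09" PRId64, ns);    // PRId64 stays correct on LLP64 platforms
    std::string ts = std::string("20230828T120000") + "." + timestamp_ns;
    std::printf("%s\n", ts.c_str());                      // prints 20230828T120000.012345678
    return 0;
}
```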
@@ -681,7 +681,6 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod

// for rms-att-weight
int row_length = model->hparams.n_embd;
const auto & hparams = model->hparams;
int n_ff = model->hparams.n_ff;

for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
5 examples/gguf/CMakeLists.txt (new file)

@@ -0,0 +1,5 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -8,15 +8,15 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s

# train
./bin/train-text-from-scratch \
--vocab-model ../models/ggml-vocab.bin \
--vocab-model ../models/ggml-vocab-llama.gguf \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16.bin \
--checkpoint-out chk-shakespeare-256x16.bin \
--model-out ggml-shakespeare-256x16-f32.bin \
--checkpoint-in chk-shakespeare-256x16.gguf \
--checkpoint-out chk-shakespeare-256x16.gguf \
--model-out ggml-shakespeare-256x16-f32.gguf \
--train-data "shakespeare.txt" \
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
--print-details-interval 0 --predict 16 --use-flash
-t 6 -b 16 --seed 1 --adam-iter 256 \
--no-checkpointing

# predict
./bin/main -m ggml-shakespeare-256x16-f32.bin
./bin/main -m ggml-shakespeare-256x16-f32.gguf
```
@@ -0,0 +1,492 @@
#!/usr/bin/env python3
# train-text-from-scratch checkpoint --> gguf conversion

import argparse
import gguf
import os
import struct
import sys
import numpy as np
from pathlib import Path

# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
if ne is None:
|
||||
ne = []
|
||||
self.dtype = dtype
|
||||
self.ne = ne
|
||||
self.nbytes = 0
|
||||
if self.dtype == 'f':
|
||||
if len(self.ne) == 0:
|
||||
self.nbytes = 0
|
||||
else:
|
||||
self.nbytes = int(np.product(self.ne)) * 4
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
def load(self, data, offset):
|
||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
assert(nd == len(self.ne))
|
||||
ne = []
|
||||
for d in range(nd):
|
||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
ne.append(n)
|
||||
|
||||
assert(tuple(ne) == tuple(self.ne))
|
||||
|
||||
if self.dtype == 'f':
|
||||
assert(dtype == 0)
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||
# 32-byte alignment
|
||||
offset += (0 - offset) & 31
|
||||
self.data = data[offset:offset+self.nbytes]
|
||||
offset += self.nbytes
|
||||
return offset
|
||||
|
||||
def max_storage_size(self):
|
||||
result = 0
|
||||
result += 4 # nd
|
||||
result += 4 # namelen
|
||||
result += 4 # dtype
|
||||
result += len(self.ne)*8 # ne
|
||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||
result += 31 # 32-byte alignment
|
||||
result += self.nbytes
|
||||
return result
|
||||
|
||||
def save_gguf(self, gguf_writer, name):
|
||||
gguf_writer.add_tensor(
|
||||
name=name,
|
||||
tensor=self.data,
|
||||
raw_shape=np.array(list(reversed(self.ne))),
|
||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||
|
||||
class OptimizationParamsV0:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
class OptimizationContext:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||
offset += 4
|
||||
|
||||
if self.version == 0:
|
||||
params = OptimizationParamsV0()
|
||||
offset = params.load(data, offset)
|
||||
self.past = params.past
|
||||
self.lbfgs_m = params.lbfgs_m
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
self.type = params.type
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
if self.type == 0:
|
||||
# these tensors are stored, but we don't need their data
|
||||
x = Tensor('f', [self.nx])
|
||||
g = Tensor('f', [self.nx])
|
||||
g2 = Tensor('f', [self.nx])
|
||||
mh = Tensor('f', [self.nx])
|
||||
vh = Tensor('f', [self.nx])
|
||||
|
||||
offset = x.load(data, offset)
|
||||
offset = g.load(data, offset)
|
||||
offset = g2.load(data, offset)
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = mh.load(data, offset)
|
||||
offset = vh.load(data, offset)
|
||||
offset = self.adam_pf.load(data, offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
|
||||
elif self.version == 1:
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
# forgot to save type in version 1:
|
||||
# guess self.type from number of remaining bytes
|
||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||
[self.adam_m, self.adam_v]
|
||||
+([self.adam_pf] if (self.past > 0) else [])])
|
||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||
self.lbfgs_lms, self.lbfgs_lmy]
|
||||
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||
# due to alignment padding the size might not by exact
|
||||
# but the difference in size for both types is significant,
|
||||
# so we can just use whichever is closest
|
||||
remaining = len(data) - offset
|
||||
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||
self.type = 0
|
||||
else:
|
||||
self.type = 1
|
||||
|
||||
if self.type == 0:
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = self.adam_pf.load(data,offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||
|
||||
if self.type == 0:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||
|
||||
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||
if self.past > 0:
|
||||
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||
|
||||
elif self.type == 1:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||
|
||||
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||
if self.past > 0:
|
||||
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
class ModelParams:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def get_n_ff(self):
|
||||
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
# self.n_vocab not saved
|
||||
gguf_writer.add_embedding_length(self.n_embd)
|
||||
gguf_writer.add_head_count(self.n_head)
|
||||
gguf_writer.add_block_count(self.n_layer)
|
||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None):
|
||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, bid):
|
||||
self.bid = bid
|
||||
self.att_norm = Tensor('f', [params.n_embd])
|
||||
self.wq = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wk = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wv = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wo = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.ffn_norm = Tensor('f', [params.n_embd])
|
||||
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
|
||||
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.att_norm.load(data, offset)
|
||||
offset = self.wq.load(data, offset)
|
||||
offset = self.wk.load(data, offset)
|
||||
offset = self.wv.load(data, offset)
|
||||
offset = self.wo.load(data, offset)
|
||||
offset = self.ffn_norm.load(data, offset)
|
||||
offset = self.w1.load(data, offset)
|
||||
offset = self.w2.load(data, offset)
|
||||
offset = self.w3.load(data, offset)
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
|
||||
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
|
||||
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
|
||||
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
|
||||
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
|
||||
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
|
||||
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
|
||||
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
|
||||
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
|
||||
|
||||
class Model:
|
||||
def __init__(self):
|
||||
self.params = ModelParams()
|
||||
self.layers = []
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.params.load(data, offset)
|
||||
|
||||
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
self.norm = Tensor('f', [self.params.n_embd])
|
||||
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
|
||||
offset = self.tok_embd.load(data, offset)
|
||||
offset = self.norm.load(data, offset)
|
||||
offset = self.output.load(data, offset)
|
||||
|
||||
self.layers.clear()
|
||||
for bid in range(self.params.n_layer):
|
||||
layer = Layer(self.params, bid)
|
||||
offset = layer.load(data, offset)
|
||||
self.layers.append(layer)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.params.save_gguf(gguf_writer)
|
||||
|
||||
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
|
||||
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
|
||||
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
|
||||
|
||||
for layer in self.layers:
|
||||
layer.save_gguf(gguf_writer)
|
||||
|
||||
class Checkpoint:
|
||||
def __init__(self):
|
||||
self.model = Model()
|
||||
self.opt_ctx = OptimizationContext()
|
||||
|
||||
def load(self, data, offset):
|
||||
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||
if magic != b'ggcp':
|
||||
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
||||
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
if self.version != 0:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
offset = self.model.load(data, offset)
|
||||
offset = self.opt_ctx.load(data, offset)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
self.model.save_gguf(gguf_writer)
|
||||
self.opt_ctx.save_gguf(gguf_writer)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
|
||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
chk = Checkpoint()
|
||||
offset = 0
|
||||
offset = chk.load(data, offset)
|
||||
# we should have read all available data
|
||||
assert(offset == len(data))
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
chk.save_gguf(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
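The conversion script's ModelParams.get_n_ff() above mirrors my_llama_model::get_n_ff from train-text-from-scratch.cpp (commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9): it rounds two thirds of 4*n_embd up to the next multiple of n_mult. A worked instance of the integer arithmetic, using the README's --embd 256 and an illustrative n_mult of 256 (the real value is read from the checkpoint header):

$$
n_\text{ff} \;=\; \Big\lfloor \tfrac{\lfloor 2\,(4\,n_\text{embd})/3 \rfloor + n_\text{mult} - 1}{n_\text{mult}} \Big\rfloor \cdot n_\text{mult}
\;=\; \Big\lfloor \tfrac{682 + 255}{256} \Big\rfloor \cdot 256 \;=\; 3 \cdot 256 \;=\; 768 .
$$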
File diff suppressed because it is too large
@@ -107,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
}

void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
335 ggml.c
@ -123,6 +123,8 @@ typedef void * thread_ret_t;
|
|||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
#define GGML_SILU_FP16
|
||||
// #define GGML_CROSS_ENTROPY_EXP_FP16
|
||||
// #define GGML_FLASH_ATTN_EXP_FP16
|
||||
|
||||
#define GGML_SOFT_MAX_UNROLL 4
|
||||
#define GGML_VEC_DOT_UNROLL 2
|
||||
|
@ -186,8 +188,8 @@ typedef void * thread_ret_t;
|
|||
//
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||
#else
|
||||
inline static void * ggml_aligned_malloc(size_t size) {
|
||||
void * aligned_memory = NULL;
|
||||
|
@ -212,8 +214,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
}
|
||||
return aligned_memory;
|
||||
}
|
||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
@ -5857,7 +5859,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
|
|||
struct ggml_tensor * ggml_rms_norm_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
struct ggml_tensor * b,
|
||||
float eps) {
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
|
@ -5867,6 +5870,8 @@ struct ggml_tensor * ggml_rms_norm_back(
|
|||
|
||||
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
||||
|
||||
ggml_set_op_params(result, &eps, sizeof(eps));
|
||||
|
||||
result->op = GGML_OP_RMS_NORM_BACK;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
|
@ -9443,6 +9448,8 @@ static void ggml_compute_forward_div_f32(
|
|||
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
UNUSED(ggml_vec_div_f32);
|
||||
|
||||
vDSP_vdiv(
|
||||
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
||||
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
||||
|
@ -10749,7 +10756,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
|||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const float eps = 1e-6f; // TODO: make this a parameter
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
|
@@ -12139,6 +12147,7 @@ static void ggml_compute_forward_soft_max_back_f32(
// dx = J * dy
// dxk = sum_i(Jki * dyi)
// dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
// dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
// dxk = sum_i(-yk*yi * dyi) + yk*dyk
// dxk = -yk * sum_i(yi * dyi) + yk*dyk
// dxk = -yk * dot(y, dy) + yk*dyk
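Restating the derivation in the comment block above in standard notation: with y = softmax(x), the Jacobian is J_ki = y_k (δ_ki − y_i), so

$$
dx_k \;=\; \sum_i J_{ki}\, dy_i \;=\; y_k\, dy_k \;-\; y_k \sum_i y_i\, dy_i \;=\; y_k \bigl( dy_k - y \cdot dy \bigr),
$$

which is exactly the "dxk = -yk * dot(y, dy) + yk*dyk" form the kernel computes row by row.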
@ -13929,7 +13938,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||
vvexpf(S, S, &Mup);
|
||||
ggml_vec_sum_f32(Mup, &sum, S);
|
||||
#else
|
||||
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
||||
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
||||
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
||||
|
||||
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
||||
|
@ -13939,9 +13948,13 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||
if (SS[j] == -INFINITY) {
|
||||
SS[j] = 0.0f;
|
||||
} else {
|
||||
#ifndef GGML_FLASH_ATTN_EXP_FP16
|
||||
const float val = expf(SS[j] - max);
|
||||
#else
|
||||
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
||||
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
||||
#endif
|
||||
sump[j] += (ggml_float)val;
|
||||
SS[j] = val;
|
||||
}
|
||||
|
@ -14519,7 +14532,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|||
vvexpf(SM, SM, &Mup);
|
||||
ggml_vec_sum_f32(Mup, &sum, SM);
|
||||
#else
|
||||
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
||||
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
|
||||
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
||||
|
||||
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
||||
|
@ -14530,9 +14543,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|||
if (SR[j] == -INFINITY) {
|
||||
SW[j] = 0.0f;
|
||||
} else {
|
||||
#ifndef GGML_FLASH_ATTN_EXP_FP16
|
||||
const float val = expf(SR[j] - max);
|
||||
#else
|
||||
ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
|
||||
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
||||
#endif
|
||||
sump[j] += (ggml_float)val;
|
||||
SW[j] = val;
|
||||
}
|
||||
|
@ -15270,6 +15287,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith == 0) {
|
||||
memset(sums, 0, sizeof(float) * (nth + nth * nc));
|
||||
|
@ -15281,7 +15300,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|||
if (ith == 0) {
|
||||
float * dp = (float *) dst->data;
|
||||
ggml_vec_sum_f32(nth, dp, sums);
|
||||
dp[0] *= -1.0f;
|
||||
dp[0] *= -1.0f / (float) nr;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -15298,7 +15317,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
||||
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
||||
float * st = (float *) params->wdata + nth + ith*nc;
|
||||
float * st = ((float *) params->wdata) + nth + ith*nc;
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
|
@ -15313,15 +15332,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(nc, &max, s0);
|
||||
|
||||
uint16_t scvt;
|
||||
uint16_t scvt; UNUSED(scvt);
|
||||
for (int i = 0; i < nc; i++) {
|
||||
if (s0[i] == -INFINITY) {
|
||||
st[i] = 0.0f;
|
||||
} else {
|
||||
// const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
|
||||
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
||||
const float s = s0[i] - max;
|
||||
const float val = expf(s);
|
||||
#else
|
||||
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
||||
memcpy(&scvt, &s, sizeof(scvt));
|
||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
||||
#endif
|
||||
sum += (ggml_float)val;
|
||||
st[i] = val;
|
||||
}
|
||||
|
@ -15337,7 +15360,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|||
ggml_vec_log_f32(nc, st, st);
|
||||
ggml_vec_mul_f32(nc, st, st, s1);
|
||||
|
||||
ggml_vec_sum_f32(nc, sums + ith, st);
|
||||
float st_sum = 0;
|
||||
ggml_vec_sum_f32(nc, &st_sum, st);
|
||||
sums[ith] += st_sum;
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
|
@ -15387,7 +15412,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
const float eps = 1e-9f;
|
||||
const double eps = 1e-9;
|
||||
|
||||
// TODO: handle transposed/permuted matrices
|
||||
const int64_t nc = src0->ne[0];
|
||||
|
@ -15406,7 +15431,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|||
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
||||
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
||||
float * sm = (float *) params->wdata + ith*nc;
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
|
@ -15415,54 +15439,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|||
assert(!isnan(s1[i]));
|
||||
}
|
||||
#endif
|
||||
// step by step explanation:
|
||||
{
|
||||
//float * sums = (float *) params->wdata;
|
||||
|
||||
// forward pass with annotated gradients from backward pass
|
||||
// (built by going in reverse operation order, adding to gradients of current operation args)
|
||||
// st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
|
||||
// from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
||||
// ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
|
||||
// ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
|
||||
// ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
|
||||
// ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
|
||||
// ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
|
||||
// ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
|
||||
|
||||
// substitute into grad[st1], because we can reuse softmax_back from this point on
|
||||
// grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
|
||||
// postorder:
|
||||
// grad[st1] := softmax(s0)
|
||||
// grad[st1] := grad[st1]*(1.0 - eps)
|
||||
// grad[st1] := grad[st1] + eps
|
||||
// grad[st1] := s1 / grad[st1]
|
||||
// grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
|
||||
|
||||
// src0 gradients by going through softmax_back
|
||||
// grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
||||
// from softmax_back:
|
||||
// dxk = yk * (dyk - dot(y, dy))
|
||||
// dot_y_dy := dot(y, dy)
|
||||
// dx := dy
|
||||
// dx := dx - dot_y_dy
|
||||
// dx := dx * y
|
||||
// postorder:
|
||||
// dot_st1_dst1 := dot(st1, grad[st1])
|
||||
// grad[s0] := grad[st1]
|
||||
// grad[s0] := grad[s0] - dot_st1_dst1
|
||||
// grad[s0] := grad[s0] * st1
|
||||
|
||||
// prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
|
||||
// sm := softmax(s0)
|
||||
// grad[s0] := sm*(1.0 - eps)
|
||||
// grad[s0] := grad[s0] + eps
|
||||
// grad[s0] := s1 / grad[s0]
|
||||
// grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
|
||||
// dot_st1_dst1 := dot(sm, grad[s0])
|
||||
// grad[s0] := grad[s0] - dot_st1_dst1
|
||||
// grad[s0] := grad[s0] * sm
|
||||
}
|
||||
|
||||
// soft_max
|
||||
ggml_float sum = 0.0;
|
||||
|
@ -15470,39 +15446,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(nc, &max, s0);
|
||||
|
||||
uint16_t scvt;
|
||||
uint16_t scvt; UNUSED(scvt);
|
||||
for (int i = 0; i < nc; i++) {
|
||||
if (s0[i] == -INFINITY) {
|
||||
sm[i] = 0.0f;
|
||||
ds0[i] = 0.0f;
|
||||
} else {
|
||||
// const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
|
||||
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
||||
const float s = s0[i] - max;
|
||||
const float val = expf(s);
|
||||
#else
|
||||
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
||||
memcpy(&scvt, &s, sizeof(scvt));
|
||||
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
||||
#endif
|
||||
sum += (ggml_float)val;
|
||||
sm[i] = val;
|
||||
ds0[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
assert(sum > 0.0);
|
||||
sum = 1.0/sum;
|
||||
sum = (1.0 - eps)/sum;
|
||||
}
|
||||
|
||||
float dot_st1_dst1 = 0;
|
||||
ggml_vec_scale_f32(nc, sm, sum);
|
||||
ggml_vec_cpy_f32 (nc, ds0, sm);
|
||||
ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
|
||||
ggml_vec_add1_f32 (nc, ds0, ds0, eps);
|
||||
ggml_vec_div_f32 (nc, ds0, s1, ds0);
|
||||
ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
|
||||
ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
|
||||
ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
|
||||
ggml_vec_mul_f32 (nc, ds0, ds0, sm);
|
||||
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
|
||||
ggml_vec_scale_f32(nc, ds0, sum);
|
||||
ggml_vec_add1_f32(nc, ds0, ds0, eps);
|
||||
ggml_vec_sub_f32(nc, ds0, ds0, s1);
|
||||
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
|
||||
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
assert(!isnan(sm[i]));
|
||||
assert(!isinf(sm[i]));
|
||||
assert(!isnan(ds0[i]));
|
||||
assert(!isinf(ds0[i]));
|
||||
}
|
||||
|
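This rewrite of ggml_compute_forward_cross_entropy_loss_back_f32 drops the long annotated softmax_back derivation in favour of the closed form stated in its comment, "grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr". Assuming each target row s1 sums to 1 and ignoring the small eps smoothing used in the forward pass, with p = softmax(s0) and the loss averaged over nr rows,

$$
L \;=\; -\tfrac{1}{n_r} \sum_{\text{rows}} \sum_i s1_i \,\log p_i ,
\qquad
\frac{\partial L}{\partial s0_i} \;=\; \frac{p_i - s1_i}{n_r},
$$

scaled by the incoming gradient d[0]; the new code realises this as scale by the normalizer, add eps, subtract s1, then scale by d[0]/nr.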
@ -16057,9 +16031,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||
{
|
||||
// necessary for llama
|
||||
if (src0->grad) {
|
||||
float eps;
|
||||
memcpy(&eps, tensor->op_params, sizeof(float));
|
||||
|
||||
src0->grad = ggml_add_impl(ctx,
|
||||
src0->grad,
|
||||
ggml_rms_norm_back(ctx, src0, tensor->grad),
|
||||
ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
|
||||
inplace);
|
||||
}
|
||||
} break;
|
||||
|
@ -16827,9 +16804,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|||
return result;
|
||||
}
|
||||
|
||||
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
|
||||
struct ggml_cgraph result = *gf;
|
||||
|
||||
void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
|
||||
GGML_ASSERT(gf->n_nodes > 0);
|
||||
|
||||
// if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
|
||||
|
@ -16853,15 +16828,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
|
|||
}
|
||||
}
|
||||
|
||||
for (int i = gf->n_nodes - 1; i >= 0; i--) {
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
if (node->is_param) {
|
||||
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
||||
ggml_build_forward_expand(&result, node->grad);
|
||||
ggml_build_forward_expand(gb, node->grad);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
|
||||
struct ggml_cgraph result = *gf;
|
||||
ggml_build_backward_expand(ctx, gf, &result, keep);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -17537,10 +17516,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
|
||||
size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
|
@ -18418,14 +18393,16 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
struct ggml_opt_params params,
|
||||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb) {
|
||||
struct ggml_cgraph * gb,
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data) {
|
||||
GGML_ASSERT(ggml_is_scalar(f));
|
||||
|
||||
// these will store the parameters we want to optimize
|
||||
struct ggml_tensor * ps[GGML_MAX_PARAMS];
|
||||
|
||||
int np = 0;
|
||||
int nx = 0;
|
||||
int64_t nx = 0;
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
if (gf->nodes[i]->is_param) {
|
||||
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
||||
|
@ -18444,31 +18421,32 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
}
|
||||
|
||||
// constants
|
||||
const float sched = params.adam.sched;
|
||||
const float decay = params.adam.decay * sched;
|
||||
const float alpha = params.adam.alpha * sched;
|
||||
float sched = params.adam.sched;
|
||||
const float alpha = params.adam.alpha;
|
||||
const float decay = params.adam.decay * alpha;
|
||||
const float beta1 = params.adam.beta1;
|
||||
const float beta2 = params.adam.beta2;
|
||||
const float eps = params.adam.eps;
|
||||
const float gclip = params.adam.gclip;
|
||||
const int decay_min_ndim = params.adam.decay_min_ndim;
|
||||
|
||||
float * x = opt->adam.x->data; // view of the parameters
|
||||
float * g1 = opt->adam.g1->data; // gradient
|
||||
float * g2 = opt->adam.g2->data; // gradient squared
|
||||
float * m = opt->adam.m->data; // first moment
|
||||
float * v = opt->adam.v->data; // second moment
|
||||
float * mh = opt->adam.mh->data; // first moment hat
|
||||
float * vh = opt->adam.vh->data; // second moment hat
|
||||
|
||||
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
|
||||
|
||||
// update view
|
||||
ggml_opt_get_params(np, ps, x);
|
||||
if (callback) {
|
||||
callback(callback_data, &sched);
|
||||
}
|
||||
|
||||
// compute the function value
|
||||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
|
||||
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
||||
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
||||
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
||||
ggml_graph_compute(gb, &cplan);
|
||||
|
||||
opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
|
||||
opt->adam.fx_best = opt->adam.fx_prev;
|
||||
|
@ -18476,6 +18454,9 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
pf[opt->iter % params.past] = opt->adam.fx_prev;
|
||||
}
|
||||
|
||||
opt->loss_before = opt->adam.fx_prev;
|
||||
opt->loss_after = opt->adam.fx_prev;
|
||||
|
||||
// initialize
|
||||
if (opt->just_initialized) {
|
||||
opt->adam.n_no_improvement = 0;
|
||||
|
@ -18508,50 +18489,55 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
UNUSED(t_start_cpu);
|
||||
|
||||
{
|
||||
// update the gradient
|
||||
ggml_opt_get_grad(np, ps, g1);
|
||||
float gnorm = 1.0f;
|
||||
if (gclip > 0.0f) {
|
||||
// gradient clipping
|
||||
ggml_float sum = 0.0;
|
||||
for (int p = 0; p < np; ++p) {
|
||||
const int64_t ne = ggml_nelements(ps[p]);
|
||||
for (int64_t j = 0; j < ne; ++j) {
|
||||
float g = ggml_get_f32_1d(ps[p]->grad, j);
|
||||
sum += (ggml_float)(g*g);
|
||||
}
|
||||
}
|
||||
ggml_float norm = sqrt(sum);
|
||||
if (norm > (ggml_float) gclip) {
|
||||
gnorm = (float) ((ggml_float) gclip / norm);
|
||||
}
|
||||
}
|
||||
const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
|
||||
const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
|
||||
int64_t i = 0;
|
||||
for (int p = 0; p < np; ++p) {
|
||||
const int64_t ne = ggml_nelements(ps[p]);
|
||||
const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
|
||||
for (int64_t j = 0; j < ne; ++j) {
|
||||
float x = ggml_get_f32_1d(ps[p], j);
|
||||
float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
|
||||
m[i] = m[i]*beta1 + g*(1.0f - beta1);
|
||||
v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
|
||||
float mh = m[i]*beta1h;
|
||||
float vh = v[i]*beta2h;
|
||||
vh = sqrtf(vh) + eps;
|
||||
x = x*(1.0f - p_decay) - mh/vh;
|
||||
ggml_set_f32_1d(ps[p], j, x);
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// m_t = beta1*m_t-1 + (1 - beta1)*g_t
|
||||
ggml_vec_scale_f32(nx, m, beta1);
|
||||
ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
|
||||
|
||||
// g2 = g1^2
|
||||
ggml_vec_sqr_f32 (nx, g2, g1);
|
||||
|
||||
// v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
|
||||
ggml_vec_scale_f32(nx, v, beta2);
|
||||
ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
|
||||
|
||||
// m^hat = m_t / (1 - beta1^t)
|
||||
// v^hat = v_t / (1 - beta2^t)
|
||||
// x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
|
||||
// x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
|
||||
// x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
|
||||
// x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
|
||||
// x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
|
||||
ggml_vec_cpy_f32 (nx, mh, m);
|
||||
ggml_vec_cpy_f32 (nx, vh, v);
|
||||
|
||||
ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
|
||||
ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
|
||||
|
||||
ggml_vec_sqrt_f32 (nx, vh, vh);
|
||||
ggml_vec_acc1_f32 (nx, vh, eps);
|
||||
|
||||
ggml_vec_div_f32 (nx, mh, mh, vh);
|
||||
ggml_vec_scale_f32(nx, x, 1.0f - decay);
|
||||
ggml_vec_sub_f32 (nx, x, x, mh);
|
||||
|
||||
// update the parameters
|
||||
ggml_opt_set_params(np, ps, x);
|
||||
if (callback) {
|
||||
callback(callback_data, &sched);
|
||||
}
|
||||
|
||||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
|
||||
ggml_graph_compute(gb, &cplan);
|
||||
|
||||
const float fx = ggml_get_f32_1d(f, 0);
|
||||
opt->loss_after = fx;
|
||||
|
||||
|
||||
// check convergence
|
||||
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
|
||||
|
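The new per-parameter loop above implements an AdamW-style update with optional global-norm gradient clipping, folding the bias corrections into the scalars beta1h and beta2h. Restated as formulas (a summary of the code, not new behaviour; clipping applies only when gclip > 0):

$$
\hat g = g \cdot \min\!\Bigl(1,\ \tfrac{g_\text{clip}}{\lVert g \rVert_2}\Bigr), \qquad
m \leftarrow \beta_1 m + (1-\beta_1)\,\hat g, \qquad
v \leftarrow \beta_2 v + (1-\beta_2)\,\hat g^{\,2},
$$
$$
x \leftarrow x\,(1-\lambda) \;-\; \frac{\alpha \cdot \text{sched}}{1-\beta_1^{\,t}} \cdot \frac{m}{\sqrt{v/(1-\beta_2^{\,t})} + \epsilon},
$$

where λ = adam.decay · α · sched for tensors with at least decay_min_ndim dimensions and 0 otherwise.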
@ -18620,7 +18606,6 @@ struct ggml_lbfgs_iteration_data {
|
|||
};
|
||||
|
||||
static enum ggml_opt_result linesearch_backtracking(
|
||||
struct ggml_context * ctx,
|
||||
const struct ggml_opt_params * params,
|
||||
int nx,
|
||||
float * x,
|
||||
|
@ -18632,8 +18617,11 @@ static enum ggml_opt_result linesearch_backtracking(
|
|||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb,
|
||||
struct ggml_cplan * cplan,
|
||||
const int np,
|
||||
struct ggml_tensor * ps[]) {
|
||||
struct ggml_tensor * ps[],
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data) {
|
||||
int count = 0;
|
||||
|
||||
float width = 0.0f;
|
||||
|
@ -18662,6 +18650,12 @@ static enum ggml_opt_result linesearch_backtracking(
|
|||
dgtest = params->lbfgs.ftol*dginit;
|
||||
|
||||
while (true) {
|
||||
if (callback) {
|
||||
// LBFG-S does not support learning rate -> ignore learning schedule
|
||||
float sched = 0;
|
||||
callback(callback_data, &sched);
|
||||
}
|
||||
|
||||
ggml_vec_cpy_f32(nx, x, xp);
|
||||
ggml_vec_mad_f32(nx, x, d, *step);
|
||||
|
||||
|
@ -18672,7 +18666,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
|||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
|
||||
ggml_graph_compute(gb, cplan);
|
||||
|
||||
ggml_opt_get_grad(np, ps, g);
|
||||
|
||||
|
@ -18732,7 +18726,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
struct ggml_opt_params params,
|
||||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb) {
|
||||
struct ggml_cgraph * gb,
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data) {
|
||||
if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
|
||||
params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
|
||||
if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
|
||||
|
@ -18764,6 +18760,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
opt->iter = iter;
|
||||
}
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
||||
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
||||
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
||||
|
||||
float * x = opt->lbfgs.x->data; // current parameters
|
||||
float * xp = opt->lbfgs.xp->data; // previous parameters
|
||||
float * g = opt->lbfgs.g->data; // current gradient
|
||||
|
@ -18785,6 +18785,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
float * lm_s = opt->lbfgs.lms->data;
|
||||
float * lm_y = opt->lbfgs.lmy->data;
|
||||
|
||||
if (callback) {
|
||||
// LBFG-S does not support learning rate -> ignore learning schedule
|
||||
float sched = 0;
|
||||
callback(callback_data, &sched);
|
||||
}
|
||||
|
||||
// evaluate the function value and its gradient
|
||||
{
|
||||
ggml_opt_set_params(np, ps, x);
|
||||
|
@ -18792,11 +18798,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
|
||||
ggml_graph_compute(gb, &cplan);
|
||||
|
||||
ggml_opt_get_grad(np, ps, g);
|
||||
|
||||
fx = ggml_get_f32_1d(f, 0);
|
||||
|
||||
opt->loss_before = fx;
|
||||
opt->loss_after = fx;
|
||||
}
|
||||
|
||||
// search direction = -gradient
|
||||
|
@ -18851,7 +18860,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
ggml_vec_cpy_f32(nx, xp, x);
|
||||
ggml_vec_cpy_f32(nx, gp, g);
|
||||
|
||||
ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
|
||||
ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
|
||||
|
||||
if (ls < 0) {
|
||||
// linesearch failed - go back to the previous point and return
|
||||
|
@ -18861,6 +18870,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
return ls;
|
||||
}
|
||||
|
||||
opt->loss_after = fx;
|
||||
|
||||
ggml_vec_norm_f32(nx, &xnorm, x);
|
||||
ggml_vec_norm_f32(nx, &gnorm, g);
|
||||
|
||||
|
@ -18918,7 +18929,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
// ys = y^t \cdot s -> 1 / \rho.
|
||||
// yy = y^t \cdot y.
|
||||
//
|
||||
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
|
||||
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
||||
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
||||
|
||||
lm_ys[end[0]] = ys;
|
||||
|
@ -18981,13 +18992,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
|
|||
.adam = {
|
||||
.n_iter = 10000,
|
||||
.sched = 1.000f,
|
||||
.decay = 0.001f,
|
||||
.decay = 0.0f,
|
||||
.decay_min_ndim = 2,
|
||||
.alpha = 0.001f,
|
||||
.beta1 = 0.9f,
|
||||
.beta2 = 0.999f,
|
||||
.eps = 1e-8f,
|
||||
.eps_f = 1e-5f,
|
||||
.eps_g = 1e-3f,
|
||||
.gclip = 0.0f,
|
||||
},
|
||||
};
|
||||
} break;
|
||||
|
@ -19037,23 +19050,13 @@ GGML_API void ggml_opt_init(
|
|||
switch (opt->params.type) {
|
||||
case GGML_OPT_ADAM:
|
||||
{
|
||||
opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
||||
opt->adam.pf = params.past > 0
|
||||
? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
|
||||
: NULL;
|
||||
ggml_set_zero(opt->adam.x);
|
||||
ggml_set_zero(opt->adam.g1);
|
||||
ggml_set_zero(opt->adam.g2);
|
||||
ggml_set_zero(opt->adam.m);
|
||||
ggml_set_zero(opt->adam.v);
|
||||
ggml_set_zero(opt->adam.mh);
|
||||
ggml_set_zero(opt->adam.vh);
|
||||
if (opt->adam.pf) {
|
||||
ggml_set_zero(opt->adam.pf);
|
||||
}
|
||||
|
@ -19137,7 +19140,7 @@ enum ggml_opt_result ggml_opt_resume(
|
|||
*gf = ggml_build_forward (f);
|
||||
*gb = ggml_build_backward(ctx, gf, true);
|
||||
|
||||
return ggml_opt_resume_g(ctx, opt, f, gf, gb);
|
||||
return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
|
||||
}
|
||||
|
||||
enum ggml_opt_result ggml_opt_resume_g(
|
||||
|
@ -19145,7 +19148,9 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|||
struct ggml_opt_context * opt,
|
||||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb) {
|
||||
struct ggml_cgraph * gb,
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data) {
|
||||
|
||||
// build forward + backward compute graphs
|
||||
enum ggml_opt_result result = GGML_OPT_OK;
|
||||
|
@ -19153,11 +19158,11 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|||
switch (opt->params.type) {
|
||||
case GGML_OPT_ADAM:
|
||||
{
|
||||
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
|
||||
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
||||
} break;
|
||||
case GGML_OPT_LBFGS:
|
||||
{
|
||||
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
|
||||
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
||||
} break;
|
||||
}
|
||||
|
||||
|
@ -19612,7 +19617,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
|
||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
@ -19695,7 +19700,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
// read the tensor infos
|
||||
{
|
||||
ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
|
||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
@@ -19896,7 +19901,7 @@ void gguf_free(struct gguf_context * ctx) {
            }
        }

-       GGML_ALIGNED_FREE(ctx->kv);
+       free(ctx->kv);
    }

    if (ctx->infos) {

@@ -19908,7 +19913,7 @@ void gguf_free(struct gguf_context * ctx) {
            }
        }

-       GGML_ALIGNED_FREE(ctx->infos);
+       free(ctx->infos);
    }

    GGML_ALIGNED_FREE(ctx);
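Note on the allocation change above (explanatory sketch, not part of the diff): ctx->kv and ctx->infos are now obtained from plain malloc and released with plain free, so allocator and deallocator stay in the same family, while the gguf_context itself keeps the GGML_ALIGNED_MALLOC / GGML_ALIGNED_FREE pair. A minimal standalone illustration of that pairing rule, using a hypothetical record type:

    #include <stdint.h>
    #include <stdlib.h>

    /* hypothetical record standing in for struct gguf_kv / struct gguf_tensor_info */
    struct record {
        uint64_t key;
        uint64_t value;
    };

    int main(void) {
        const size_t n = 16;

        /* memory that comes from malloc() must go back through free() ... */
        struct record * recs = malloc(n * sizeof(struct record));
        if (recs == NULL) {
            return 1;
        }
        for (size_t i = 0; i < n; ++i) {
            recs[i].key   = i;
            recs[i].value = i * i;
        }
        free(recs);

        /* ... whereas an aligned allocator (e.g. _aligned_malloc on Windows, which is
           what GGML_ALIGNED_MALLOC can expand to) needs its matching release call. */
        return 0;
    }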
29
ggml.h
@@ -952,11 +952,11 @@ extern "C" {
    // a - x
    // b - dy
    // TODO: update with configurable eps
    GGML_API struct ggml_tensor * ggml_rms_norm_back(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
-           struct ggml_tensor * b);
+           struct ggml_tensor * b,
+           float eps);

    // A: n columns, m rows
    // B: n columns, p rows (i.e. we transpose it internally)
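For context (the standard RMSNorm definition, not taken from this diff): the eps that ggml_rms_norm_back now receives is the stabilizer that normally sits under the square root,

    \operatorname{rmsnorm}(x)_i = \frac{x_i}{\sqrt{\frac{1}{n}\sum_{j=1}^{n} x_j^2 + \varepsilon}}

so the backward pass needs the same eps value as the forward pass to reproduce the scaling exactly.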
@@ -1612,7 +1612,8 @@ extern "C" {
            struct ggml_tensor * tensor);

-   GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+   GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+   GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
@@ -1677,6 +1678,8 @@ extern "C" {
        GGML_LINESEARCH_INVALID_PARAMETERS,
    };

+   typedef void (*ggml_opt_callback)(void * data, float * sched);

    // optimization parameters
    //
    // see ggml.c (ggml_opt_default_params) for default values
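The new ggml_opt_callback type gives callers a hook that runs inside the optimizer and may overwrite the schedule multiplier sched. A minimal sketch of a matching callback (the train_state struct and the linear decay are hypothetical examples, not part of this change):

    #include <stdio.h>

    /* hypothetical user data passed through the callback_data pointer */
    struct train_state {
        int   iter;       /* how many times the callback has fired */
        float sched_min;  /* lower bound for the schedule multiplier */
    };

    /* matches: typedef void (*ggml_opt_callback)(void * data, float * sched); */
    static void opt_callback(void * data, float * sched) {
        struct train_state * st = (struct train_state *) data;
        st->iter++;

        /* simple linear decay of the schedule multiplier, clamped from below */
        *sched = 1.0f - 0.01f * (float) st->iter;
        if (*sched < st->sched_min) {
            *sched = st->sched_min;
        }
        fprintf(stderr, "iter %d: sched = %f\n", st->iter, *sched);
    }

    /* The callback and its data would then be handed to the optimizer, e.g.
           ggml_opt_resume_g(ctx, &opt, f, &gf, &gb, opt_callback, &state);
       Passing NULL for both (as ggml_opt_resume now does) keeps the old behavior. */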
@@ -1712,12 +1715,14 @@ extern "C" {

        float sched; // schedule multiplier (fixed, decay or warmup)
        float decay; // weight decay for AdamW, use 0.0f to disable
+       int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
        float alpha; // learning rate
        float beta1;
        float beta2;
        float eps;   // epsilon for numerical stability
        float eps_f; // epsilon for convergence test
        float eps_g; // epsilon for convergence test
+       float gclip; // gradient clipping
    } adam;

    // LBFGS parameters
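For orientation, a scalar sketch of the AdamW-style update these fields parameterize (a generic textbook formulation, not copied from ggml.c; gclip would typically rescale the gradient beforehand when its global norm exceeds gclip, and decay_min_ndim would restrict the weight-decay term to tensors with at least that many dimensions):

    #include <math.h>

    /* One AdamW step for a single scalar parameter w with gradient g.
       m and v are the running first and second moments; t is the 1-based step count. */
    static float adamw_step(
            float w, float g, float * m, float * v, int t,
            float alpha, float beta1, float beta2, float eps, float decay, float sched) {
        *m = beta1 * (*m) + (1.0f - beta1) * g;      /* first moment  */
        *v = beta2 * (*v) + (1.0f - beta2) * g * g;  /* second moment */

        const float mh = *m / (1.0f - powf(beta1, (float) t));  /* bias-corrected */
        const float vh = *v / (1.0f - powf(beta2, (float) t));

        /* decoupled weight decay (AdamW), then the Adam step, both scaled by sched */
        w -= sched * alpha * decay * w;
        w -= sched * alpha * mh / (sqrtf(vh) + eps);
        return w;
    }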
@@ -1745,14 +1750,12 @@ extern "C" {

        bool just_initialized;

        float loss_before;
        float loss_after;

        struct {
            struct ggml_tensor * x; // view of the parameters
            struct ggml_tensor * g1; // gradient
            struct ggml_tensor * g2; // gradient squared
            struct ggml_tensor * m; // first moment
            struct ggml_tensor * v; // second moment
            struct ggml_tensor * mh; // first moment hat
            struct ggml_tensor * vh; // second moment hat
            struct ggml_tensor * pf; // past function values
            float fx_best;
            float fx_prev;
@@ -1789,10 +1792,10 @@ extern "C" {

    // initialize optimizer context
    GGML_API void ggml_opt_init(
-           struct ggml_context * ctx,
+           struct ggml_context     * ctx,
            struct ggml_opt_context * opt,
-           struct ggml_opt_params params,
-           int64_t nx);
+           struct ggml_opt_params    params,
+           int64_t                   nx);

    // continue optimizing the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt_resume(
@@ -1806,7 +1809,9 @@ extern "C" {
            struct ggml_opt_context * opt,
            struct ggml_tensor * f,
            struct ggml_cgraph * gf,
-           struct ggml_cgraph * gb);
+           struct ggml_cgraph * gb,
+           ggml_opt_callback callback,
+           void * callback_data);

    //
    // quantization
llama.cpp
@@ -6248,7 +6248,6 @@ const char * llama_print_system_info(void) {
}

void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");

@@ -6264,10 +6263,10 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
-   fprintf(stream, "t_eval_us: %ld # total microseconds spent generating tokens\n", ctx->t_eval_us);
-   fprintf(stream, "t_load_us: %ld # total microseconds spent loading the model\n", ctx->t_load_us);
-   fprintf(stream, "t_p_eval_us: %ld # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-   fprintf(stream, "t_sample_us: %ld # total microseconds spent sampling\n", ctx->t_sample_us);
+   fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+   fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+   fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+   fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
            1.0e6 * ctx->n_eval / ctx->t_eval_us);
    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
tests/test-grad0.cpp
@@ -275,14 +275,14 @@ static bool check_gradient(

    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

-   const float f0 = ggml_get_f32_1d(f, 0);
+   const double f0 = ggml_get_f32_1d(f, 0);

    ggml_set_f32_1d(x[i], k, xm);

    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

-   const float f1 = ggml_get_f32_1d(f, 0);
-   const float g0 = (f0 - f1)/(2.0f*eps);
+   const double f1 = ggml_get_f32_1d(f, 0);
+   const double g0 = (f0 - f1)/(2.0*(double) eps);

    ggml_set_f32_1d(x[i], k, x0);

@@ -292,10 +292,10 @@ static bool check_gradient(

    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);

-   const float g1 = ggml_get_f32_1d(x[i]->grad, k);
+   const double g1 = ggml_get_f32_1d(x[i]->grad, k);

-   const float error_abs = fabsf(g0 - g1);
-   const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
+   const double error_abs = fabs(g0 - g1);
+   const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;

    if (error_abs > max_error_abs || error_rel > max_error_rel) {
        printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
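check_gradient compares the analytic gradient against a central finite difference, g0 = (f(x + eps) - f(x - eps)) / (2 * eps); carrying f0, f1 and the error terms in double, as the hunk above now does, reduces cancellation error in that subtraction. A self-contained version of the same idea on a toy function (hypothetical example, not ggml code):

    #include <math.h>
    #include <stdio.h>

    /* f(x) = x^3, so df/dx = 3x^2; verify the analytic gradient against a
       central finite difference computed in double precision. */
    static double f(double x)      { return x * x * x; }
    static double f_grad(double x) { return 3.0 * x * x; }

    int main(void) {
        const double x   = 0.7;
        const double eps = 1e-4;

        const double g_fd      = (f(x + eps) - f(x - eps)) / (2.0 * eps); /* numeric  */
        const double g_an      = f_grad(x);                               /* analytic */
        const double error_abs = fabs(g_fd - g_an);
        const double error_rel = g_an != 0.0 ? error_abs / fabs(g_an) : 0.0;

        printf("numeric=%f analytic=%f error_abs=%g error_rel=%g\n",
               g_fd, g_an, error_abs, error_rel);
        return 0;
    }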
@@ -531,7 +531,7 @@ int main(int argc, const char ** argv) {

        struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));

-       check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
+       check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
    }
}
@@ -1345,9 +1345,18 @@ int main(int argc, const char ** argv) {
        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
        ggml_set_param(ctx0, x[0]);

-       struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
+       float eps = 1e-6f;
+       // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
+       // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
+       struct ggml_tensor * f = ggml_sum(ctx0,
+                                   ggml_log(ctx0,
+                                       ggml_add1(ctx0,
+                                           ggml_scale(ctx0,
+                                               ggml_soft_max(ctx0, x[0]),
+                                               ggml_new_f32(ctx0, 1.0f - eps)),
+                                           ggml_new_f32(ctx0, eps))));

-       check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+       check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
    }
}
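Why the plain sum is a poor test target (a standard softmax identity, not part of the diff): each softmax row sums to one, so the aggregated objective is constant and its gradient vanishes identically; finite differences then compare zero against zero and the check passes vacuously. Wrapping the softmax in a log as above makes the objective genuinely depend on the input.

    \sum_i \operatorname{softmax}(x)_i = 1
    \qquad\Longrightarrow\qquad
    \frac{\partial}{\partial x_j} \sum_i \operatorname{softmax}(x)_i = 0 \quad \text{for all } j .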
@@ -1358,15 +1367,26 @@ int main(int argc, const char ** argv) {
    int64_t ne2[4];
    get_random_dims(ne2, 4);

-   for (int ndims = 1; ndims <= 3; ++ndims) {
-       x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+   for (int ndims = 1; ndims <= 4; ++ndims) {
+       x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
        x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
+       // the second argument to cross_entropy_loss must sum up to 1 for each row
+       int nr = ggml_nrows(x[1]);
+       int nc = ggml_nelements(x[1]) / nr;
+       for (int ir = 0; ir < nr; ++ir) {
+           float sum = 0;
+           for (int ic = 0; ic < nc; ++ic) {
+               sum += ((float *) x[1]->data)[ic + ir*nc];
+           }
+           for (int ic = 0; ic < nc; ++ic) {
+               ((float *) x[1]->data)[ic + ir*nc] /= sum;
+           }
+       }
        ggml_set_param(ctx0, x[0]);

-       struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
+       struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);

-       check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
+       // finite differences regularly fails!
+       check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
    }
}
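The row normalization above turns the second argument into a probability distribution per row, which is what a cross-entropy loss expects in its generic form (shown here for reference only; ggml's exact formulation, e.g. whether it applies softmax to the first argument internally, is not specified by this diff):

    L(p, t) = -\sum_i t_i \log p_i , \qquad t_i \ge 0 , \quad \sum_i t_i = 1 .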
@@ -1473,7 +1493,7 @@ int main(int argc, const char ** argv) {

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

-               check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+               check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
            }
        }
    }

@@ -1514,7 +1534,7 @@ int main(int argc, const char ** argv) {

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

-               check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+               check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
            }
        }
    }