From a80f184e6d386b4a6d74902ddae61bf4740fd9a1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 29 Jun 2023 21:31:25 +0200 Subject: [PATCH] change AdamW decay parameter to work like the torch AdamW decay parameter It is now relative to the Adam learning rate `alpha*sched`. Before that it was relative to `sched` only. Here `alpha` is the maximum learning rate and `sched` is a scaling parameter in [0..1]. --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 92717f0aa..451c765f9 100644 --- a/ggml.c +++ b/ggml.c @@ -17351,8 +17351,8 @@ static enum ggml_opt_result ggml_opt_adam( // constants const float sched = params.adam.sched; - const float decay = params.adam.decay * sched; const float alpha = params.adam.alpha * sched; + const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps;