From 55eaa7c1f511af5fb6ef808b5328804f4d4e5243 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 30 Apr 2013 17:14:42 +0200
Subject: [PATCH 1/4] sched: Avoid cputime scaling overflow

This patch adds Linus's cputime scaling algorithm to the kernel.

It is a follow-up (fix, really) to commit
d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 ("sched: Lower chances of
cputime scaling overflow"), which tried to avoid multiplication
overflow but did not guarantee that the overflow would not happen.

Linus created a different algorithm, which completely avoids the
multiplication overflow by dropping precision when the numbers are
big.

I tested it and it gives a good relative error for the scaled numbers.
The testing method is described here:

  http://marc.info/?l=linux-kernel&m=136733059505406&w=2

Originally-From: Linus Torvalds
Signed-off-by: Stanislaw Gruszka
Cc: Frederic Weisbecker
Cc: rostedt@goodmis.org
Cc: Dave Hansen
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20130430151441.GC10465@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 57 +++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 33508dc78d0c..e9198abfca53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -506,34 +506,47 @@ void account_idle_ticks(unsigned long ticks)
 }
 
 /*
- * Perform (stime * rtime) / total with reduced chances
- * of multiplication overflows by using smaller factors
- * like quotient and remainders of divisions between
- * rtime and total.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
  */
 static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 {
-	u64 rem, res, scaled;
+	u64 scaled;
 
-	if (rtime >= total) {
-		/*
-		 * Scale up to rtime / total then add
-		 * the remainder scaled to stime / total.
-		 */
-		res = div64_u64_rem(rtime, total, &rem);
-		scaled = stime * res;
-		scaled += div64_u64(stime * rem, total);
-	} else {
-		/*
-		 * Same in reverse: scale down to total / rtime
-		 * then substract that result scaled to
-		 * to the remaining part.
-		 */
-		res = div64_u64_rem(total, rtime, &rem);
-		scaled = div64_u64(stime, res);
-		scaled -= div64_u64(scaled * rem, total);
+	for (;;) {
+		/* Make sure "rtime" is the bigger of stime/rtime */
+		if (stime > rtime) {
+			u64 tmp = rtime; rtime = stime; stime = tmp;
+		}
+
+		/* Make sure 'total' fits in 32 bits */
+		if (total >> 32)
+			goto drop_precision;
+
+		/* Does rtime (and thus stime) fit in 32 bits? */
+		if (!(rtime >> 32))
+			break;
+
+		/* Can we just balance rtime/stime rather than dropping bits? */
+		if (stime >> 31)
+			goto drop_precision;
+
+		/* We can grow stime and shrink rtime and try to make them both fit */
+		stime <<= 1;
+		rtime >>= 1;
+		continue;
+
+drop_precision:
+		/* We drop from rtime, it has more bits than stime */
+		rtime >>= 1;
+		total >>= 1;
 	}
 
+	/*
+	 * Make sure gcc understands that this is a 32x32->64 multiply,
+	 * followed by a 64/32->64 divide.
+	 */
+	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
 	return (__force cputime_t) scaled;
 }
 
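For reference, here is a minimal user-space sketch of the scaling loop
above. It is illustrative only: plain C division stands in for div_u64(),
a u64 stands in for cputime_t, and the values in main() are made up to
exercise the overflow-avoidance path.

#include <stdio.h>
#include <stdint.h>

/* Same loop as scale_stime() above, with plain division instead of div_u64() */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime) {
			uint64_t tmp = rtime; rtime = stime; stime = tmp;
		}

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Balance rtime/stime rather than dropping bits, if we can */
		if (stime >> 31)
			goto drop_precision;

		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* Drop a bit from rtime and total; their ratio is preserved */
		rtime >>= 1;
		total >>= 1;
	}

	/* 32x32->64 multiply, then 64/32 divide, exactly as in the kernel version */
	return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
	/* Values this large would overflow a naive 'stime * rtime' */
	uint64_t stime = 1ULL << 61, rtime = 1ULL << 62, total = 3ULL << 61;

	printf("scaled   = %llu\n",
	       (unsigned long long)scale_stime(stime, rtime, total));
	printf("expected ~ %llu\n",
	       (unsigned long long)((1ULL << 62) / 3));
	return 0;
}

Because every precision-dropping step halves rtime and total together,
the ratio stime * rtime / total is preserved up to the truncated low bits,
which is where the small relative error mentioned above comes from.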
From 772c808a252594692972773f6ee41c289b8e0b2a Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 30 Apr 2013 11:35:05 +0200
Subject: [PATCH 2/4] sched: Do not account bogus utime

Due to rounding in scale_stime(), for big numbers, scaled stime values
will grow in chunks. Since rtime grows in jiffies and we calculate
utime like below:

	prev->stime = max(prev->stime, stime);
	prev->utime = max(prev->utime, rtime - prev->stime);

we could erroneously account stime values as utime. To prevent that,
only update prev->{u,s}time values when they are smaller than the
current rtime.

Signed-off-by: Stanislaw Gruszka
Cc: Frederic Weisbecker
Cc: rostedt@goodmis.org
Cc: Linus Torvalds
Cc: Dave Hansen
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1367314507-9728-2-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e9198abfca53..1b7c2161f5cd 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -581,6 +581,14 @@ static void cputime_adjust(struct task_cputime *curr,
 	 */
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
+	/*
+	 * Update userspace visible utime/stime values only if actual execution
+	 * time is bigger than already exported. Note that can happen, that we
+	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 */
+	if (prev->stime + prev->utime >= rtime)
+		goto out;
+
 	if (!rtime) {
 		stime = 0;
 	} else if (!total) {
@@ -598,6 +606,7 @@ static void cputime_adjust(struct task_cputime *curr,
 	prev->stime = max(prev->stime, stime);
 	prev->utime = max(prev->utime, rtime - prev->stime);
 
+out:
 	*ut = prev->utime;
 	*st = prev->stime;
 }
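To make the new control flow easier to follow, here is a condensed
user-space restatement of cputime_adjust() as it stands after this patch.
It is only an illustrative sketch: scale_stime() is reduced to exact
division, the vtime_accounting_enabled() path is dropped, and the
struct prev_cputime type and plain u64 values are stand-ins for the
kernel's struct cputime and cputime_t.

#include <stdio.h>
#include <stdint.h>

struct prev_cputime {
	uint64_t utime;
	uint64_t stime;
};

static void cputime_adjust(uint64_t utime, uint64_t stime, uint64_t rtime,
			   struct prev_cputime *prev,
			   uint64_t *ut, uint64_t *st)
{
	uint64_t total = utime + stime;

	/*
	 * Only touch the exported values if actual execution time grew past
	 * what was already reported; scaling inaccuracy on big numbers may
	 * have pushed prev->stime + prev->utime beyond rtime.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	if (!rtime)
		stime = 0;
	else if (!total)
		stime = rtime;
	else
		stime = stime * rtime / total;	/* stands in for scale_stime() */

	/* Keep the values exported to userspace monotonic */
	if (stime > prev->stime)
		prev->stime = stime;
	if (rtime - prev->stime > prev->utime)
		prev->utime = rtime - prev->stime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

int main(void)
{
	struct prev_cputime prev = { 0, 0 };
	uint64_t ut, st;

	/* First sample: 100 "ticks" of runtime, all of it system time */
	cputime_adjust(0, 100, 100, &prev, &ut, &st);
	printf("utime=%llu stime=%llu\n", (unsigned long long)ut,
	       (unsigned long long)st);

	/* rtime has not advanced, so the new guard skips the update */
	cputime_adjust(0, 100, 100, &prev, &ut, &st);
	printf("utime=%llu stime=%llu\n", (unsigned long long)ut,
	       (unsigned long long)st);
	return 0;
}

The second call shows the guard at work: the already-exported
prev->stime + prev->utime has reached rtime, so nothing is updated and
no runtime can be misattributed to utime.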
From 68aa8efcd1ab961e4684ef5af32f72a6ec1911de Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 30 Apr 2013 11:35:06 +0200
Subject: [PATCH 3/4] sched: Avoid prev->stime underflow

Dave Hansen reported strange utime/stime values on his system:

  https://lkml.org/lkml/2013/4/4/435

This happens because the prev->stime value is bigger than the rtime
value. The root of the problem is non-monotonic rtime values (i.e. the
current rtime is smaller than the previous rtime), and that should be
debugged and fixed.

But since the problem did not manifest itself before commit
62188451f0d63add7ad0cd2a1ae269d600c1663d ("cputime: Avoid
multiplication overflow on utime scaling"), it should be treated as a
regression, which we can easily fix in the cputime_adjust() function.

For now, let's apply this fix, but further work is needed to fix the
root of the problem.

Reported-and-tested-by: Dave Hansen
Cc: # 3.9+
Signed-off-by: Stanislaw Gruszka
Cc: Frederic Weisbecker
Cc: rostedt@goodmis.org
Cc: Linus Torvalds
Cc: Dave Hansen
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1367314507-9728-3-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1b7c2161f5cd..337a36745800 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -558,7 +558,7 @@ static void cputime_adjust(struct task_cputime *curr,
 			   struct cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, stime, total;
+	cputime_t rtime, stime, utime, total;
 
 	if (vtime_accounting_enabled()) {
 		*ut = curr->utime;
@@ -589,13 +589,13 @@ static void cputime_adjust(struct task_cputime *curr,
 	if (prev->stime + prev->utime >= rtime)
 		goto out;
 
-	if (!rtime) {
-		stime = 0;
-	} else if (!total) {
-		stime = rtime;
-	} else {
+	if (total) {
 		stime = scale_stime((__force u64)stime,
 				    (__force u64)rtime, (__force u64)total);
+		utime = rtime - stime;
+	} else {
+		stime = rtime;
+		utime = 0;
 	}
 
 	/*
@@ -604,7 +604,7 @@ static void cputime_adjust(struct task_cputime *curr,
 	 * Let's enforce monotonicity.
 	 */
 	prev->stime = max(prev->stime, stime);
-	prev->utime = max(prev->utime, rtime - prev->stime);
+	prev->utime = max(prev->utime, utime);
 
 out:
 	*ut = prev->utime;

From f3002134158092178be81339ec5a22ff80e6c308 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 30 Apr 2013 11:35:07 +0200
Subject: [PATCH 4/4] Revert "math64: New div64_u64_rem helper"

This reverts commit f792685006274a850e6cc0ea9ade275ccdfc90bc.

The cputime scaling code was changed/fixed and does not need the
div64_u64_rem() primitive anymore. It has no other users, so let's
remove it.

Signed-off-by: Stanislaw Gruszka
Cc: Frederic Weisbecker
Cc: rostedt@goodmis.org
Cc: Linus Torvalds
Cc: Dave Hansen
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1367314507-9728-4-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/math64.h | 19 +------------------
 lib/div64.c            | 19 ++++++-------------
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/include/linux/math64.h b/include/linux/math64.h
index 931a619407bf..b8ba85544721 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -29,15 +29,6 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
 	return dividend / divisor;
 }
 
-/**
- * div64_u64_rem - unsigned 64bit divide with 64bit divisor
- */
-static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
-{
-	*remainder = dividend % divisor;
-	return dividend / divisor;
-}
-
 /**
  * div64_u64 - unsigned 64bit divide with 64bit divisor
  */
@@ -70,16 +61,8 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder);
 #endif
 
-#ifndef div64_u64_rem
-extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder);
-#endif
-
 #ifndef div64_u64
-static inline u64 div64_u64(u64 dividend, u64 divisor)
-{
-	u64 remainder;
-	return div64_u64_rem(dividend, divisor, &remainder);
-}
+extern u64 div64_u64(u64 dividend, u64 divisor);
 #endif
 
 #ifndef div64_s64
diff --git a/lib/div64.c b/lib/div64.c
index 3af5728d95fd..a163b6caef73 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -79,10 +79,9 @@ EXPORT_SYMBOL(div_s64_rem);
 #endif
 
 /**
- * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder
+ * div64_u64 - unsigned 64bit divide with 64bit divisor
  * @dividend:	64bit dividend
 * @divisor:	64bit divisor
- * @remainder:  64bit remainder
 *
 * This implementation is a modified version of the algorithm proposed
 * by the book 'Hacker's Delight'. The original source and full proof
@@ -90,33 +89,27 @@ EXPORT_SYMBOL(div_s64_rem);
 *
 * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt'
 */
-#ifndef div64_u64_rem
-u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
+#ifndef div64_u64
+u64 div64_u64(u64 dividend, u64 divisor)
 {
 	u32 high = divisor >> 32;
 	u64 quot;
 
 	if (high == 0) {
-		u32 rem32;
-		quot = div_u64_rem(dividend, divisor, &rem32);
-		*remainder = rem32;
+		quot = div_u64(dividend, divisor);
 	} else {
 		int n = 1 + fls(high);
 		quot = div_u64(dividend >> n, divisor >> n);
 
 		if (quot != 0)
 			quot--;
-
-		*remainder = dividend - quot * divisor;
-		if (*remainder >= divisor) {
+		if ((dividend - quot * divisor) >= divisor)
 			quot++;
-			*remainder -= divisor;
-		}
 	}
 
 	return quot;
 }
-EXPORT_SYMBOL(div64_u64_rem);
+EXPORT_SYMBOL(div64_u64);
 #endif
 
 /**
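As a sanity check of the div64_u64() variant that this revert leaves in
place, here is a user-space port of it, with __builtin_clz() standing in
for fls() and plain division for div_u64(); the random test harness is
made up for illustration and simply compares the result against native
64/64 division.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* fls() equivalent: position of the most significant set bit, 1-based */
static int fls32(uint32_t x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* Port of the Hacker's Delight based div64_u64() kept by the revert above */
static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
{
	uint32_t high = divisor >> 32;
	uint64_t quot;

	if (high == 0) {
		quot = dividend / divisor;	/* div_u64() stand-in */
	} else {
		int n = 1 + fls32(high);
		quot = (dividend >> n) / (divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}

/* Crude 64-bit pseudo-random values, good enough for a smoke test */
static uint64_t rand64(void)
{
	return ((uint64_t)rand() << 62) ^ ((uint64_t)rand() << 31) ^ (uint64_t)rand();
}

int main(void)
{
	srand(1);
	for (int i = 0; i < 1000000; i++) {
		uint64_t a = rand64();
		uint64_t b = rand64() | 1;	/* avoid division by zero */

		if (div64_u64(a, b) != a / b) {
			printf("mismatch: %llu / %llu\n",
			       (unsigned long long)a, (unsigned long long)b);
			return 1;
		}
	}
	puts("all quotients match native 64/64 division");
	return 0;
}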