diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h index c99e259469f7..12ebfcc1d539 100644 --- a/arch/arm/include/asm/switch_to.h +++ b/arch/arm/include/asm/switch_to.h @@ -10,7 +10,9 @@ * CPU. */ #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) -#define finish_arch_switch(prev) dsb(ish) +#define __complete_pending_tlbi() dsb(ish) +#else +#define __complete_pending_tlbi() #endif /* @@ -22,6 +24,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info #define switch_to(prev,next,last) \ do { \ + __complete_pending_tlbi(); \ last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ } while (0) diff --git a/arch/avr32/include/asm/switch_to.h b/arch/avr32/include/asm/switch_to.h index 9a8e9d5208d4..6f00581c3d4f 100644 --- a/arch/avr32/include/asm/switch_to.h +++ b/arch/avr32/include/asm/switch_to.h @@ -15,11 +15,13 @@ */ #ifdef CONFIG_OWNERSHIP_TRACE #include -#define finish_arch_switch(prev) \ +#define ocd_switch(prev, next) \ do { \ ocd_write(PID, prev->pid); \ - ocd_write(PID, current->pid); \ + ocd_write(PID, next->pid); \ } while(0) +#else +#define ocd_switch(prev, next) #endif /* @@ -38,6 +40,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct cpu_context *); #define switch_to(prev, next, last) \ do { \ + ocd_switch(prev, next); \ last = __switch_to(prev, &prev->thread.cpu_context + 1, \ &next->thread.cpu_context); \ } while (0) diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h index 7163cd7fdd69..9733cd0266e4 100644 --- a/arch/mips/include/asm/switch_to.h +++ b/arch/mips/include/asm/switch_to.h @@ -83,45 +83,43 @@ do { if (cpu_has_rw_llb) { \ } \ } while (0) +/* + * For newly created kernel threads switch_to() will return to + * ret_from_kernel_thread, newly created user threads to ret_from_fork. + * That is, everything following resume() will be skipped for new threads. + * So everything that matters to new threads should be placed before resume(). + */ #define switch_to(prev, next, last) \ do { \ - u32 __c0_stat; \ s32 __fpsave = FP_SAVE_NONE; \ __mips_mt_fpaff_switch_to(prev); \ - if (cpu_has_dsp) \ + if (cpu_has_dsp) { \ __save_dsp(prev); \ - if (cop2_present && (KSTK_STATUS(prev) & ST0_CU2)) { \ - if (cop2_lazy_restore) \ - KSTK_STATUS(prev) &= ~ST0_CU2; \ - __c0_stat = read_c0_status(); \ - write_c0_status(__c0_stat | ST0_CU2); \ - cop2_save(prev); \ - write_c0_status(__c0_stat & ~ST0_CU2); \ + __restore_dsp(next); \ + } \ + if (cop2_present) { \ + set_c0_status(ST0_CU2); \ + if ((KSTK_STATUS(prev) & ST0_CU2)) { \ + if (cop2_lazy_restore) \ + KSTK_STATUS(prev) &= ~ST0_CU2; \ + cop2_save(prev); \ + } \ + if (KSTK_STATUS(next) & ST0_CU2 && \ + !cop2_lazy_restore) { \ + cop2_restore(next); \ + } \ + clear_c0_status(ST0_CU2); \ } \ __clear_software_ll_bit(); \ if (test_and_clear_tsk_thread_flag(prev, TIF_USEDFPU)) \ __fpsave = FP_SAVE_SCALAR; \ if (test_and_clear_tsk_thread_flag(prev, TIF_USEDMSA)) \ __fpsave = FP_SAVE_VECTOR; \ + if (cpu_has_userlocal) \ + write_c0_userlocal(task_thread_info(next)->tp_value); \ + __restore_watch(); \ + disable_msa(); \ (last) = resume(prev, next, task_thread_info(next), __fpsave); \ } while (0) -#define finish_arch_switch(prev) \ -do { \ - u32 __c0_stat; \ - if (cop2_present && !cop2_lazy_restore && \ - (KSTK_STATUS(current) & ST0_CU2)) { \ - __c0_stat = read_c0_status(); \ - write_c0_status(__c0_stat | ST0_CU2); \ - cop2_restore(current); \ - write_c0_status(__c0_stat & ~ST0_CU2); \ - } \ - if (cpu_has_dsp) \ - __restore_dsp(current); \ - if (cpu_has_userlocal) \ - write_c0_userlocal(current_thread_info()->tp_value); \ - __restore_watch(); \ - disable_msa(); \ -} while (0) - #endif /* _ASM_SWITCH_TO_H */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 68d067ad4222..a9f753fb73a8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2178,7 +2178,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vc->runner = vcpu; if (n_ceded == vc->n_runnable) { kvmppc_vcore_blocked(vc); - } else if (should_resched()) { + } else if (need_resched()) { vc->vcore_state = VCORE_PREEMPT; /* Let something else run */ cond_resched_lock(&vc->lock); diff --git a/arch/score/include/asm/switch_to.h b/arch/score/include/asm/switch_to.h index 031756b59ece..fda3f83308d2 100644 --- a/arch/score/include/asm/switch_to.h +++ b/arch/score/include/asm/switch_to.h @@ -8,6 +8,4 @@ do { \ (last) = resume(prev, next, task_thread_info(next)); \ } while (0) -#define finish_arch_switch(prev) do {} while (0) - #endif /* _ASM_SCORE_SWITCH_TO_H */ diff --git a/arch/sh/include/asm/switch_to_32.h b/arch/sh/include/asm/switch_to_32.h index 0c065513e7ac..7661b4ba8259 100644 --- a/arch/sh/include/asm/switch_to_32.h +++ b/arch/sh/include/asm/switch_to_32.h @@ -78,6 +78,8 @@ do { \ \ if (is_dsp_enabled(prev)) \ __save_dsp(prev); \ + if (is_dsp_enabled(next)) \ + __restore_dsp(next); \ \ __ts1 = (u32 *)&prev->thread.sp; \ __ts2 = (u32 *)&prev->thread.pc; \ @@ -125,10 +127,4 @@ do { \ last = __last; \ } while (0) -#define finish_arch_switch(prev) \ -do { \ - if (is_dsp_enabled(prev)) \ - __restore_dsp(prev); \ -} while (0) - #endif /* __ASM_SH_SWITCH_TO_32_H */ diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 50e7b626afe8..c5113c7ce2fd 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -333,11 +333,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs = (struct pt_regs *) (new_stack + STACKFRAME_SZ); /* - * A new process must start with interrupts closed in 2.5, - * because this is how Mingo's scheduler works (see schedule_tail - * and finish_arch_switch). If we do not do it, a timer interrupt hits - * before we unlock, attempts to re-take the rq->lock, and then we die. - * Thus, kpsr|=PSR_PIL. + * A new process must start with interrupts disabled, see schedule_tail() + * and finish_task_switch(). (If we do not do it and if a timer interrupt + * hits before we unlock and attempts to take the rq->lock, we deadlock.) + * + * Thus, kpsr |= PSR_PIL. */ ti->ksp = (unsigned long) new_stack; p->thread.kregs = childregs; diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h index b8f888cbe6b0..34ee72705521 100644 --- a/arch/tile/include/asm/switch_to.h +++ b/arch/tile/include/asm/switch_to.h @@ -53,15 +53,13 @@ extern unsigned long get_switch_to_pc(void); * Kernel threads can check to see if they need to migrate their * stack whenever they return from a context switch; for user * threads, we defer until they are returning to user-space. + * We defer homecache migration until the runqueue lock is released. */ -#define finish_arch_switch(prev) do { \ - if (unlikely((prev)->state == TASK_DEAD)) \ - __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \ - ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \ +#define finish_arch_post_lock_switch() do { \ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \ (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \ if (current->mm == NULL && !kstack_hash && \ - current_thread_info()->homecache_cpu != smp_processor_id()) \ + current_thread_info()->homecache_cpu != raw_smp_processor_id()) \ homecache_migrate_kthread(); \ } while (0) diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index a45213781ad0..7d5769310bef 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -446,6 +446,11 @@ struct task_struct *__sched _switch_to(struct task_struct *prev, hardwall_switch_tasks(prev, next); #endif + /* Notify the simulator of task exit. */ + if (unlikely(prev->state == TASK_DEAD)) + __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | + (prev->pid << _SIM_CONTROL_OPERATOR_BITS)); + /* * Switch kernel SP, PC, and callee-saved registers. * In the context of the new task, return the old task pointer diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index dca71714f860..b12f81022a6b 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -90,9 +90,9 @@ static __always_inline bool __preempt_count_dec_and_test(void) /* * Returns true when we need to resched and can (barring IRQ state). */ -static __always_inline bool should_resched(void) +static __always_inline bool should_resched(int preempt_offset) { - return unlikely(!raw_cpu_read_4(__preempt_count)); + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPT diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 48b7228563ad..33253930247f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -123,6 +123,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, * cpuidle mechanism enables interrupts and doing that with timekeeping * suspended is generally unsafe. */ + stop_critical_timings(); drv->states[index].enter_freeze(dev, drv, index); WARN_ON(!irqs_disabled()); /* @@ -131,6 +132,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, * critical sections, so tell RCU about that. */ RCU_NONIDLE(tick_unfreeze()); + start_critical_timings(); } /** @@ -195,7 +197,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ktime_get(); + stop_critical_timings(); entered_state = target_state->enter(dev, drv, index); + start_critical_timings(); time_end = ktime_get(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c index a1800c150839..08cb419eb4e6 100644 --- a/drivers/xen/preempt.c +++ b/drivers/xen/preempt.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); asmlinkage __visible void xen_maybe_preempt_hcall(void) { if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) - && should_resched())) { + && need_resched())) { /* * Clear flag as we may be rescheduled on a different * cpu. diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index d0a7a4753db2..0bec580a4885 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -71,9 +71,10 @@ static __always_inline bool __preempt_count_dec_and_test(void) /* * Returns true when we need to resched and can (barring IRQ state). */ -static __always_inline bool should_resched(void) +static __always_inline bool should_resched(int preempt_offset) { - return unlikely(!preempt_count() && tif_need_resched()); + return unlikely(preempt_count() == preempt_offset && + tif_need_resched()); } #ifdef CONFIG_PREEMPT diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e8493fee8160..d0b380ee7d67 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -32,6 +32,14 @@ extern struct fs_struct init_fs; #define INIT_CPUSET_SEQ(tsk) #endif +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +#define INIT_PREV_CPUTIME(x) .prev_cputime = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock), \ +}, +#else +#define INIT_PREV_CPUTIME(x) +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ @@ -46,6 +54,7 @@ extern struct fs_struct init_fs; .cputime_atomic = INIT_CPUTIME_ATOMIC, \ .running = 0, \ }, \ + INIT_PREV_CPUTIME(sig) \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ } @@ -246,6 +255,7 @@ extern struct task_group root_task_group; INIT_TASK_RCU_TASKS(tsk) \ INIT_CPUSET_SEQ(tsk) \ INIT_RT_MUTEXES(tsk) \ + INIT_PREV_CPUTIME(tsk) \ INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ INIT_KASAN(tsk) \ diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 13d55206ccf6..869b21dcf503 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), }) void kthread_bind(struct task_struct *k, unsigned int cpu); +void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_stop(struct task_struct *k); bool kthread_should_stop(void); bool kthread_should_park(void); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 84991f185173..bea8dd8ff5e0 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -84,12 +84,20 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) +/* + * The preempt_count offset after preempt_disable(); + */ #if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_DISABLE_OFFSET 1 +# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else -# define PREEMPT_DISABLE_OFFSET 0 +# define PREEMPT_DISABLE_OFFSET 0 #endif +/* + * The preempt_count offset after spin_lock() + */ +#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET + /* * The preempt_count offset needed for things like: * @@ -103,7 +111,7 @@ * * Work as expected. */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET) +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot @@ -124,7 +132,8 @@ #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); -#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); }) +#define preempt_count_dec_and_test() \ + ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) @@ -184,7 +193,7 @@ do { \ #define preempt_check_resched() \ do { \ - if (should_resched()) \ + if (should_resched(0)) \ __preempt_schedule(); \ } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 04b5ada460b4..119823decc46 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -530,39 +530,49 @@ struct cpu_itimer { }; /** - * struct cputime - snaphsot of system and user cputime + * struct prev_cputime - snaphsot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode + * @lock: protects the above two fields * - * Gathers a generic snapshot of user and system time. + * Stores previous user/system time values such that we can guarantee + * monotonicity. */ -struct cputime { +struct prev_cputime { +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE cputime_t utime; cputime_t stime; + raw_spinlock_t lock; +#endif }; +static inline void prev_cputime_init(struct prev_cputime *prev) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + prev->utime = prev->stime = 0; + raw_spin_lock_init(&prev->lock); +#endif +} + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This is an extension of struct cputime that includes the total runtime - * spent by the task from the scheduler point of view. - * - * As a result, this structure groups together three kinds of CPU time - * that are tracked for threads and thread groups. Most things considering - * CPU time want to group these counts together and treat all three - * of them in parallel. + * This structure groups together three kinds of CPU time that are tracked for + * threads and thread groups. Most things considering CPU time want to group + * these counts together and treat all three of them in parallel. */ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; + /* Alternate field names when used to cache expirations. */ -#define prof_exp stime #define virt_exp utime +#define prof_exp stime #define sched_exp sum_exec_runtime #define INIT_CPUTIME \ @@ -715,9 +725,7 @@ struct signal_struct { cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - struct cputime prev_cputime; -#endif + struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -1167,29 +1175,24 @@ struct load_weight { u32 inv_weight; }; +/* + * The load_avg/util_avg accumulates an infinite geometric series. + * 1) load_avg factors the amount of time that a sched_entity is + * runnable on a rq into its weight. For cfs_rq, it is the aggregated + * such weights of all runnable and blocked sched_entities. + * 2) util_avg factors frequency scaling into the amount of time + * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. + * For cfs_rq, it is the aggregated such times of all runnable and + * blocked sched_entities. + * The 64 bit load_sum can: + * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with + * the highest weight (=88761) always runnable, we should not overflow + * 2) for entity, support any load.weight always runnable + */ struct sched_avg { - u64 last_runnable_update; - s64 decay_count; - /* - * utilization_avg_contrib describes the amount of time that a - * sched_entity is running on a CPU. It is based on running_avg_sum - * and is scaled in the range [0..SCHED_LOAD_SCALE]. - * load_avg_contrib described the amount of time that a sched_entity - * is runnable on a rq. It is based on both runnable_avg_sum and the - * weight of the task. - */ - unsigned long load_avg_contrib, utilization_avg_contrib; - /* - * These sums represent an infinite geometric series and so are bound - * above by 1024/(1-y). Thus we only need a u32 to store them for all - * choices of y < 1-2^(-32)*1024. - * running_avg_sum reflects the time that the sched_entity is - * effectively running on the CPU. - * runnable_avg_sum represents the amount of time a sched_entity is on - * a runqueue which includes the running time that is monitored by - * running_avg_sum. - */ - u32 runnable_avg_sum, avg_period, running_avg_sum; + u64 last_update_time, load_sum; + u32 util_sum, period_contrib; + unsigned long load_avg, util_avg; }; #ifdef CONFIG_SCHEDSTATS @@ -1255,7 +1258,7 @@ struct sched_entity { #endif #ifdef CONFIG_SMP - /* Per-entity load-tracking */ + /* Per entity load average tracking */ struct sched_avg avg; #endif }; @@ -1351,9 +1354,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; - struct task_struct *last_wakee; - unsigned long wakee_flips; + unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; int wake_cpu; #endif @@ -1481,9 +1484,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - struct cputime prev_cputime; -#endif + struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_t vtime_seqlock; unsigned long long vtime_snap; @@ -2214,13 +2215,6 @@ static inline void calc_load_enter_idle(void) { } static inline void calc_load_exit_idle(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#ifndef CONFIG_CPUMASK_OFFSTACK -static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) -{ - return set_cpus_allowed_ptr(p, &new_mask); -} -#endif - /* * Do not use outside of architecture code which knows its limitations. * @@ -2897,12 +2891,6 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET -#else -#define PREEMPT_LOCK_OFFSET 0 -#endif - #define cond_resched_lock(lock) ({ \ ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index d2abbdb8c6aa..414d924318ce 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -112,25 +112,13 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); +int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); -/** - * __stop_machine: freeze the machine on all CPUs and run this function - * @fn: the function to run - * @data: the data ptr for the @fn - * @cpus: the cpus to run the @fn() on (NULL = any online cpu) - * - * Description: This is a special version of the above, which assumes cpus - * won't come or go while it's being called. Used by hotplug cpu. - */ -int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); - -int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, +int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); - #else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ -static inline int __stop_machine(int (*fn)(void *), void *data, +static inline int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { unsigned long flags; @@ -141,16 +129,10 @@ static inline int __stop_machine(int (*fn)(void *), void *data, return ret; } -static inline int stop_machine(int (*fn)(void *), void *data, - const struct cpumask *cpus) -{ - return __stop_machine(fn, data, cpus); -} - -static inline int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, +static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { - return __stop_machine(fn, data, cpus); + return stop_machine(fn, data, cpus); } #endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index d57a575fe31f..539d6bc3216a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -55,9 +55,9 @@ TRACE_EVENT(sched_kthread_stop_ret, */ DECLARE_EVENT_CLASS(sched_wakeup_template, - TP_PROTO(struct task_struct *p, int success), + TP_PROTO(struct task_struct *p), - TP_ARGS(__perf_task(p), success), + TP_ARGS(__perf_task(p)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -71,25 +71,37 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - __entry->success = success; + __entry->success = 1; /* rudiment, kill when possible */ __entry->target_cpu = task_cpu(p); ), - TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d", + TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d", __entry->comm, __entry->pid, __entry->prio, - __entry->success, __entry->target_cpu) + __entry->target_cpu) ); +/* + * Tracepoint called when waking a task; this tracepoint is guaranteed to be + * called from the waking context. + */ +DEFINE_EVENT(sched_wakeup_template, sched_waking, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + +/* + * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG. + * It it not always called from the waking context. + */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, - TP_PROTO(struct task_struct *p, int success), - TP_ARGS(p, success)); + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); /* * Tracepoint for waking up a new task: */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, - TP_PROTO(struct task_struct *p, int success), - TP_ARGS(p, success)); + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(struct task_struct *p) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c91a3fdfce5..82cf9dff4295 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -402,7 +402,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); diff --git a/kernel/fork.c b/kernel/fork.c index dbd9b8d7b7cc..0d93b4d0617b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1072,6 +1072,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; + atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; @@ -1133,6 +1134,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); + prev_cputime_init(&sig->prev_cputime); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; @@ -1340,9 +1342,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - p->prev_cputime.utime = p->prev_cputime.stime = 0; -#endif + prev_cputime_init(&p->prev_cputime); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_init(&p->vtime_seqlock); p->vtime_snap = 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index fdea0bee7b5a..490924cc9e7c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -327,16 +327,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) { - /* Must have done schedule() in kthread() before we set_task_cpu */ + unsigned long flags; + if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +{ + __kthread_bind_mask(p, cpumask_of(cpu), state); +} + +void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) +{ + __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e73c79fadd0..a585c7b2ccf0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1151,15 +1151,45 @@ static int migration_cpu_stop(void *data) return 0; } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +/* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - cpumask_copy(&p->cpus_allowed, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, 0); + } + if (running) + put_prev_task(rq, p); + + p->sched_class->set_cpus_allowed(p, new_mask); + + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, 0); +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1169,7 +1199,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { unsigned long flags; struct rq *rq; @@ -1178,6 +1209,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; @@ -1214,6 +1254,11 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) return ret; } + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -1595,6 +1640,15 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + #endif /* CONFIG_SMP */ static void @@ -1654,9 +1708,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); - p->state = TASK_RUNNING; + trace_sched_wakeup(p); + #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -1874,6 +1928,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(p->state & state)) goto out; + trace_sched_waking(p); + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1949,6 +2005,8 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; + trace_sched_waking(p); + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); @@ -2016,9 +2074,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -2303,11 +2358,11 @@ void wake_up_new_task(struct task_struct *p) #endif /* Initialize new task's runnable average */ - init_task_runnable_average(p); + init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); + trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) @@ -2469,7 +2524,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ prev_state = prev->state; vtime_task_switch(prev); - finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -4340,7 +4394,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4492,7 +4546,7 @@ SYSCALL_DEFINE0(sched_yield) int __sched _cond_resched(void) { - if (should_resched()) { + if (should_resched(0)) { preempt_schedule_common(); return 1; } @@ -4510,7 +4564,7 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - int resched = should_resched(); + int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; lockdep_assert_held(lock); @@ -4532,7 +4586,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (should_resched()) { + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { local_bh_enable(); preempt_schedule_common(); local_bh_disable(); @@ -4865,7 +4919,8 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; @@ -4891,7 +4946,8 @@ void init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -5311,8 +5367,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); @@ -6445,8 +6500,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f5a64ffad176..8cbc3db671df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -555,48 +555,43 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) } /* - * Atomically advance counter to the new value. Interrupts, vcpu - * scheduling, and scaling inaccuracies can cause cputime_advance - * to be occasionally called with a new value smaller than counter. - * Let's enforce atomicity. + * Adjust tick based cputime random precision against scheduler runtime + * accounting. * - * Normally a caller will only go through this loop once, or not - * at all in case a previous caller updated counter the same jiffy. - */ -static void cputime_advance(cputime_t *counter, cputime_t new) -{ - cputime_t old; - - while (new > (old = READ_ONCE(*counter))) - cmpxchg_cputime(counter, old, new); -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. + * Tick based cputime accounting depend on random scheduling timeslices of a + * task to be interrupted or not by the timer. Depending on these + * circumstances, the number of these interrupts may be over or + * under-optimistic, matching the real user and system cputime with a variable + * precision. + * + * Fix this by scaling these tick based values against the total runtime + * accounted by the CFS scheduler. + * + * This code provides the following guarantees: + * + * stime + utime == rtime + * stime_i+1 >= stime_i, utime_i+1 >= utime_i + * + * Assuming that rtime_i+1 >= rtime_i. */ static void cputime_adjust(struct task_cputime *curr, - struct cputime *prev, + struct prev_cputime *prev, cputime_t *ut, cputime_t *st) { cputime_t rtime, stime, utime; + unsigned long flags; - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ + /* Serialize concurrent callers such that we can honour our guarantees */ + raw_spin_lock_irqsave(&prev->lock, flags); rtime = nsecs_to_cputime(curr->sum_exec_runtime); /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. + * This is possible under two circumstances: + * - rtime isn't monotonic after all (a bug); + * - we got reordered by the lock. + * + * In both cases this acts as a filter such that the rest of the code + * can assume it is monotonic regardless of anything else. */ if (prev->stime + prev->utime >= rtime) goto out; @@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr, if (utime == 0) { stime = rtime; - } else if (stime == 0) { - utime = rtime; - } else { - cputime_t total = stime + utime; - - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; + goto update; } - cputime_advance(&prev->stime, stime); - cputime_advance(&prev->utime, utime); + if (stime == 0) { + utime = rtime; + goto update; + } + stime = scale_stime((__force u64)stime, (__force u64)rtime, + (__force u64)(stime + utime)); + + /* + * Make sure stime doesn't go backwards; this preserves monotonicity + * for utime because rtime is monotonic. + * + * utime_i+1 = rtime_i+1 - stime_i + * = rtime_i+1 - (rtime_i - utime_i) + * = (rtime_i+1 - rtime_i) + utime_i + * >= utime_i + */ + if (stime < prev->stime) + stime = prev->stime; + utime = rtime - stime; + + /* + * Make sure utime doesn't go backwards; this still preserves + * monotonicity for stime, analogous argument to above. + */ + if (utime < prev->utime) { + utime = prev->utime; + stime = rtime - utime; + } + +update: + prev->stime = stime; + prev->utime = utime; out: *ut = prev->utime; *st = prev->stime; + raw_spin_unlock_irqrestore(&prev->lock, flags); } void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a17af35670a..fc8f01083527 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) /* * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (relative) deadline is + * task if we have one and its (absolute) deadline is * smaller than our one... OTW we keep our runtime and * deadline. */ @@ -1563,7 +1563,7 @@ static int push_dl_task(struct rq *rq) static void push_dl_tasks(struct rq *rq) { - /* Terminates as it moves a -deadline task */ + /* push_dl_task() will return true if it moved a -deadline task */ while (push_dl_task(rq)) ; } @@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_dl_tasks(rq) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || @@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { - struct rq *rq; struct root_domain *src_rd; - int weight; + struct rq *rq; BUG_ON(!dl_task(p)); @@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - /* - * Update only if the task is actually running (i.e., - * it is on the rq AND it is not throttled). - */ - if (!on_dl_rq(&p->dl)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_dl_task(rq, p); - BUG_ON(!rq->dl.dl_nr_migratory); - rq->dl.dl_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_dl_task(rq, p); - rq->dl.dl_nr_migratory++; - } - - update_dl_migration(&rq->dl); + set_cpus_allowed_common(p, new_mask); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4222ec50ab88..641511771ae6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - if (!se) { - struct sched_avg *avg = &cpu_rq(cpu)->avg; - P(avg->runnable_avg_sum); - P(avg->avg_period); + if (!se) return; - } - PN(se->exec_start); PN(se->vruntime); @@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif P(se->load.weight); #ifdef CONFIG_SMP - P(se->avg.runnable_avg_sum); - P(se->avg.running_avg_sum); - P(se->avg.avg_period); - P(se->avg.load_avg_contrib); - P(se->avg.utilization_avg_contrib); - P(se->avg.decay_count); + P(se->avg.load_avg); + P(se->avg.util_avg); #endif #undef PN #undef P @@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %lu\n", "load_avg", + cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", - cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", - cfs_rq->utilization_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", + atomic_long_read(&cfs_rq->removed_load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", + atomic_long_read(&cfs_rq->removed_util_avg)); #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", - cfs_rq->tg_load_contrib); - SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", - cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", + cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", - atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP - P(se.avg.runnable_avg_sum); - P(se.avg.running_avg_sum); - P(se.avg.avg_period); - P(se.avg.load_avg_contrib); - P(se.avg.utilization_avg_contrib); - P(se.avg.decay_count); + P(se.avg.load_sum); + P(se.avg.util_sum); + P(se.avg.load_avg); + P(se.avg.util_avg); + P(se.avg.last_update_time); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d113c3ba8bc4..6e2e3483b1ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update); - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; - /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) */ static u64 __sched_period(unsigned long nr_running) { - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; + if (unlikely(nr_running > sched_nr_latency)) + return nr_running * sysctl_sched_min_granularity; + else + return sysctl_sched_latency; } /* @@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); -static inline void __update_task_entity_contrib(struct sched_entity *se); -static inline void __update_task_entity_utilization(struct sched_entity *se); +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ -/* Give new task start runnable values to heavy its load in infant time */ -void init_task_runnable_average(struct task_struct *p) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) { - u32 slice; + struct sched_avg *sa = &se->avg; - slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; - p->se.avg.avg_period = slice; - __update_task_entity_contrib(&p->se); - __update_task_entity_utilization(&p->se); + sa->last_update_time = 0; + /* + * sched_avg's period_contrib should be strictly less then 1024, so + * we give it 1023 to make sure it is almost a period (1024us), and + * will definitely be update (after enqueue). + */ + sa->period_contrib = 1023; + sa->load_avg = scale_load_down(se->load.weight); + sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_sum = LOAD_AVG_MAX; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else -void init_task_runnable_average(struct task_struct *p) +void init_entity_runnable_average(struct sched_entity *se) { } #endif @@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env) * --------------------- vs --------------------- * src->compute_capacity dst->compute_capacity */ - if (src->load * dst->compute_capacity > - dst->load * src->compute_capacity) + if (src->load * dst->compute_capacity * env->imbalance_pct > + + dst->load * src->compute_capacity * 100) return true; return false; @@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.avg_period; + delta = p->se.avg.load_sum / p->se.load.weight; + *period = LOAD_AVG_MAX; } p->last_sum_exec_runtime = runtime; @@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) long tg_weight; /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). + * Use this CPU's real-time load instead of the last load contribution + * as the updating of the contribution is delayed, and we will use the + * the real-time load to calc the share. See update_tg_load_avg(). */ tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_contrib; - tg_weight += cfs_rq->load.weight; + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += cfs_rq_load_avg(cfs_rq); return tg_weight; } @@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + load = cfs_rq_load_avg(cfs_rq); shares = (tg->shares * load); if (tg_weight) @@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ - /* Precomputed fixed inverse multiplies for multiplication by y^n */ static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val *= runnable_avg_yN_inv[local_n]; - /* We don't use SRR here since we always want to round down. */ - return val >> 32; + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; } /* @@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, - struct sched_avg *sa, - int runnable, - int running) +static __always_inline int +__update_load_avg(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta, periods; - u32 runnable_contrib; + u32 contrib; int delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); - delta = now - sa->last_runnable_update; + delta = now - sa->last_update_time; /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. */ if ((s64)delta < 0) { - sa->last_runnable_update = now; + sa->last_update_time = now; return 0; } @@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, delta >>= 10; if (!delta) return 0; - sa->last_runnable_update = now; + sa->last_update_time = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->avg_period % 1024; + delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { - /* period roll-over */ decayed = 1; + /* how much left for next period will start over, we don't know yet */ + sa->period_contrib = 0; + /* * Now that we know we're crossing a period boundary, figure * out how much from delta we need to complete the current * period and accrue it. */ delta_w = 1024 - delta_w; - if (runnable) - sa->runnable_avg_sum += delta_w; + if (weight) { + sa->load_sum += weight * delta_w; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta_w; + } if (running) - sa->running_avg_sum += delta_w * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta_w; + sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; delta -= delta_w; @@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, periods = delta / 1024; delta %= 1024; - sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, - periods + 1); - sa->running_avg_sum = decay_load(sa->running_avg_sum, - periods + 1); - sa->avg_period = decay_load(sa->avg_period, - periods + 1); + sa->load_sum = decay_load(sa->load_sum, periods + 1); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods + 1); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - runnable_contrib = __compute_runnable_contrib(periods); - if (runnable) - sa->runnable_avg_sum += runnable_contrib; + contrib = __compute_runnable_contrib(periods); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } if (running) - sa->running_avg_sum += runnable_contrib * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += runnable_contrib; + sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; } /* Remainder of delta accrued against u_0` */ - if (runnable) - sa->runnable_avg_sum += delta; + if (weight) { + sa->load_sum += weight * delta; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta; + } if (running) - sa->running_avg_sum += delta * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta; + sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; + + sa->period_contrib += delta; + + if (decayed) { + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + } + sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + } return decayed; } -/* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline u64 __synchronize_entity_decay(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 decays = atomic64_read(&cfs_rq->decay_counter); - - decays -= se->avg.decay_count; - se->avg.decay_count = 0; - if (!decays) - return 0; - - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.utilization_avg_contrib = - decay_load(se->avg.utilization_avg_contrib, decays); - - return decays; -} - #ifdef CONFIG_FAIR_GROUP_SCHED -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) -{ - struct task_group *tg = cfs_rq->tg; - long tg_contrib; - - tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; - tg_contrib -= cfs_rq->tg_load_contrib; - - if (!tg_contrib) - return; - - if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic_long_add(tg_contrib, &tg->load_avg); - cfs_rq->tg_load_contrib += tg_contrib; - } -} - /* - * Aggregate cfs_rq runnable averages into an equivalent task_group - * representation for computing load contributions. + * Updating tg's load_avg is necessary before update_cfs_share (which is done) + * and effective_load (which is not done because it is too costly). */ -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { - struct task_group *tg = cfs_rq->tg; - long contrib; + long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; - /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->avg_period + 1); - contrib -= cfs_rq->tg_runnable_contrib; - - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { - atomic_add(contrib, &tg->runnable_avg); - cfs_rq->tg_runnable_contrib += contrib; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } -static inline void __update_group_entity_contrib(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg = cfs_rq->tg; - int runnable_avg; - - u64 contrib; - - contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div_u64(contrib, - atomic_long_read(&tg->load_avg) + 1); - - /* - * For group entities we need to compute a correction term in the case - * that they are consuming <1 cpu so that we would contribute the same - * load as a task of equal weight. - * - * Explicitly co-ordinating this measurement would be expensive, but - * fortunately the sum of each cpus contribution forms a usable - * lower-bound on the true value. - * - * Consider the aggregate of 2 contributions. Either they are disjoint - * (and the sum represents true value) or they are disjoint and we are - * understating by the aggregate of their overlap. - * - * Extending this to N cpus, for a given overlap, the maximum amount we - * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of - * cpus that overlap for this interval and w_i is the interval width. - * - * On a small machine; the first term is well-bounded which bounds the - * total error since w_i is a subset of the period. Whereas on a - * larger machine, while this first term can be larger, if w_i is the - * of consequential size guaranteed to see n_i*w_i quickly converge to - * our upper bound of 1-cpu. - */ - runnable_avg = atomic_read(&tg->runnable_avg); - if (runnable_avg < NICE_0_LOAD) { - se->avg.load_avg_contrib *= runnable_avg; - se->avg.load_avg_contrib >>= NICE_0_SHIFT; - } -} - -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, - runnable, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) {} -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) {} -static inline void __update_group_entity_contrib(struct sched_entity *se) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_task_entity_contrib(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.avg_period + 1); - se->avg.load_avg_contrib = scale_load(contrib); -} - -/* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) -{ - long old_contrib = se->avg.load_avg_contrib; - - if (entity_is_task(se)) { - __update_task_entity_contrib(se); - } else { - __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); - __update_group_entity_contrib(se); - } - - return se->avg.load_avg_contrib - old_contrib; -} - - -static inline void __update_task_entity_utilization(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); - contrib /= (se->avg.avg_period + 1); - se->avg.utilization_avg_contrib = scale_load(contrib); -} - -static long __update_entity_utilization_avg_contrib(struct sched_entity *se) -{ - long old_contrib = se->avg.utilization_avg_contrib; - - if (entity_is_task(se)) - __update_task_entity_utilization(se); - else - se->avg.utilization_avg_contrib = - group_cfs_rq(se)->utilization_load_avg; - - return se->avg.utilization_avg_contrib - old_contrib; -} - -static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, - long load_contrib) -{ - if (likely(load_contrib < cfs_rq->blocked_load_avg)) - cfs_rq->blocked_load_avg -= load_contrib; - else - cfs_rq->blocked_load_avg = 0; -} - static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) +/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + int decayed; + struct sched_avg *sa = &cfs_rq->avg; + + if (atomic_long_read(&cfs_rq->removed_load_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + sa->load_avg = max_t(long, sa->load_avg - r, 0); + sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + } + + if (atomic_long_read(&cfs_rq->removed_util_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - + ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + } + + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif + + return decayed; +} + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta, utilization_delta; int cpu = cpu_of(rq_of(cfs_rq)); - u64 now; + u64 now = cfs_rq_clock_task(cfs_rq); /* - * For a group entity we need to use their owned cfs_rq_clock_task() in - * case they are the parent of a throttled hierarchy. + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration */ - if (entity_is_task(se)) - now = cfs_rq_clock_task(cfs_rq); - else - now = cfs_rq_clock_task(group_cfs_rq(se)); + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, - cfs_rq->curr == se)) - return; + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); +} - contrib_delta = __update_entity_load_avg_contrib(se); - utilization_delta = __update_entity_utilization_avg_contrib(se); +/* Add the load generated by se into cfs_rq's load average */ +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + int migrated = 0, decayed; - if (!update_cfs_rq) - return; - - if (se->on_rq) { - cfs_rq->runnable_load_avg += contrib_delta; - cfs_rq->utilization_load_avg += utilization_delta; - } else { - subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + if (sa->last_update_time == 0) { + sa->last_update_time = now; + migrated = 1; } + else { + __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq); + + cfs_rq->runnable_load_avg += sa->load_avg; + cfs_rq->runnable_load_sum += sa->load_sum; + + if (migrated) { + cfs_rq->avg.load_avg += sa->load_avg; + cfs_rq->avg.load_sum += sa->load_sum; + cfs_rq->avg.util_avg += sa->util_avg; + cfs_rq->avg.util_sum += sa->util_sum; + } + + if (decayed || migrated) + update_tg_load_avg(cfs_rq, 0); +} + +/* Remove the runnable load generated by se from cfs_rq's runnable load average */ +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_avg(se, 1); + + cfs_rq->runnable_load_avg = + max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + cfs_rq->runnable_load_sum = + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* - * Decay the load contributed by all blocked children and account this so that - * their contribution may appropriately discounted when they wake up. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +void remove_entity_load_avg(struct sched_entity *se) { - u64 now = cfs_rq_clock_task(cfs_rq) >> 20; - u64 decays; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; - decays = now - cfs_rq->last_decay; - if (!decays && !force_update) - return; +#ifndef CONFIG_64BIT + u64 last_update_time_copy; - if (atomic_long_read(&cfs_rq->removed_load)) { - unsigned long removed_load; - removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); - subtract_blocked_load_contrib(cfs_rq, removed_load); - } + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); +#else + last_update_time = cfs_rq->avg.last_update_time; +#endif - if (decays) { - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); - cfs_rq->last_decay = now; - } - - __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); -} - -/* Add the load generated by se into cfs_rq's child load-average */ -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) -{ - /* - * We track migrations using entity decay_count <= 0, on a wake-up - * migration we use a negative decay count to track the remote decays - * accumulated while sleeping. - * - * Newly forked tasks are enqueued with se->avg.decay_count == 0, they - * are seen by enqueue_entity_load_avg() as a migration with an already - * constructed load_avg_contrib. - */ - if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); - if (se->avg.decay_count) { - /* - * In a wake-up migration we have to approximate the - * time sleeping. This is because we can't synchronize - * clock_task between the two cpus, and it is not - * guaranteed to be read-safe. Instead, we can - * approximate this using our carried decays, which are - * explicitly atomically readable. - */ - se->avg.last_runnable_update -= (-se->avg.decay_count) - << 20; - update_entity_load_avg(se, 0); - /* Indicate that we're now synchronized and on-rq */ - se->avg.decay_count = 0; - } - wakeup = 0; - } else { - __synchronize_entity_decay(se); - } - - /* migrated tasks did not contribute to our blocked load */ - if (wakeup) { - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - update_entity_load_avg(se, 0); - } - - cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !wakeup); -} - -/* - * Remove se's load from this cfs_rq child load-average, if the entity is - * transitioning to a blocked state we track its projected decay using - * blocked_load_avg. - */ -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) -{ - update_entity_load_avg(se, 1); - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !sleep); - - cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; - if (sleep) { - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - } /* migrations, e.g. sleep=0 leave decay_count == 0 */ + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } /* @@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, */ void idle_enter_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 1); } /* @@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq) */ void idle_exit_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 0); +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->runnable_load_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; } static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) {} -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) { @@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_entity_load_avg(se, 1); + update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev, 1); + update_load_avg(prev, 0); } cfs_rq->curr = NULL; } @@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq, 1); + update_load_avg(curr, 1); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { - update_rq_runnable_avg(rq, rq->nr_running); + if (!se) add_nr_running(rq, 1); - } + hrtick_update(rq); } @@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { + if (!se) sub_nr_running(rq, 1); - update_rq_runnable_avg(rq, 1); - } + hrtick_update(rq); } @@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +} + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long pending_updates; /* @@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ @@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq) __update_cpu_load(this_rq, load, 1); } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->cfs.runnable_load_avg; -} - /* * Return a low guess at the load of a migration-source cpu weighted * according to the scheduling class and "nice" value. @@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.runnable_load_avg; + unsigned long load_avg = weighted_cpuload(cpu); if (nr_running) return load_avg / nr_running; @@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->load.weight + wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) @@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * wl = dw_i = S * (s'_i - s_i); see (3) */ - wl -= se->load.weight; + wl -= se->avg.load_avg; /* * Recursively apply this logic to all parent groups to compute @@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. In order + * to determine whether we should let the load spread vs consolodating to + * shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. With + * both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. Waker/wakee + * being client/server, worker/dispatcher, interrupt source or whatever is + * irrelevant, spread criteria is apparent partner count exceeds socket size. + */ static int wake_wide(struct task_struct *p) { + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - /* - * Yeah, it's the switching-frequency, could means many wakee or - * rapidly switch, use factor here will just help to automatically - * adjust the loose-degree, so bigger node will lead to more pull. - */ - if (p->wakee_flips > factor) { - /* - * wakee is somewhat hot, it needs certain amount of cpu - * resource, so if waker is far more hot, prefer to leave - * it alone. - */ - if (current->wakee_flips > (factor * p->wakee_flips)) - return 1; - } - - return 0; + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) @@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; - /* - * If we wake multiple tasks be careful to not bounce - * ourselves around too much. - */ - if (wake_wide(p)) - return 0; - idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -4981,12 +4821,12 @@ static int select_idle_sibling(struct task_struct *p, int target) * tasks. The unit of the return value must be the one of capacity so we can * compare the usage with the capacity of the CPU that is available for CFS * task (ie cpu_capacity). - * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. It represents the amount of utilization of a CPU in the range * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full * capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in avg_period and running_load_avg or just + * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running * time. So we need to check that the usage stays into the range * [0..cpu_capacity_orig] and cap if necessary. @@ -4995,7 +4835,7 @@ static int select_idle_sibling(struct task_struct *p, int target) */ static int get_cpu_usage(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); if (usage >= SCHED_LOAD_SCALE) @@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = cpu; + int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; + break; /* * If both cpu and prev_cpu are part of this domain, @@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; + else if (!want_affine) + break; } - if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; + if (affine_sd) { + sd = NULL; /* Prefer wake_affine over balance flags */ + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + new_cpu = cpu; } - while (sd) { + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, new_cpu); + + } else while (sd) { struct sched_group *group; int weight; @@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } -unlock: rcu_read_unlock(); return new_cpu; @@ -5101,26 +4944,27 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void -migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - /* - * Load tracking: accumulate removed load so that it can be processed - * when we next update owning cfs_rq under rq->lock. Tasks contribute - * to blocked load iff they have a positive decay-count. It can never - * be negative here since on-rq tasks have decay-count == 0. + * We are supposed to update the task to "current" time, then its up to date + * and ready to go to new CPU/cfs_rq. But we have difficulty in getting + * what current time is, so simply throw away the out-of-date time. This + * will result in the wakee task is less decayed, but giving the wakee more + * load sounds not bad. */ - if (se->avg.decay_count) { - se->avg.decay_count = -__synchronize_entity_decay(se); - atomic_long_add(se->avg.load_avg_contrib, - &cfs_rq->removed_load); - } + remove_entity_load_avg(&p->se); + + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; +} + +static void task_dead_fair(struct task_struct *p) +{ + remove_entity_load_avg(&p->se); } #endif /* CONFIG_SMP */ @@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns true if the destination node is the preferred node. - * Needs to match fbq_classify_rq(): if there is a runnable task - * that is not on its preferred node, we should identify it. + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { - return false; - } - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; - - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return false; - - if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); - } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); - } - - return dst_faults > src_faults; -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; + return -1; + + if (!sched_feat(NUMA)) + return -1; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return false; + return -1; - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return true; + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) + return 1; + else + return -1; + } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return false; + return 0; if (numa_group) { src_faults = group_faults(p, src_nid); @@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) } #else -static inline bool migrate_improves_locality(struct task_struct *p, +static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; + return -1; } #endif @@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p, static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot = 0; + int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); @@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); + tsk_cache_hot = migrate_degrades_locality(p, env); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, env); - if (migrate_improves_locality(p, env) || !tsk_cache_hot || + if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot) { + if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } @@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env) return 0; while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, + * which could at worst lead to a livelock crash. + */ + if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + break; + p = list_first_entry(tasks, struct task_struct, se.group_node); env->loop++; @@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) -{ - struct sched_entity *se = tg->se[cpu]; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - return; - - update_cfs_rq_blocked_load(cfs_rq, 1); - - if (se) { - update_entity_load_avg(se, 1); - /* - * We pivot on our runnable average having decayed to zero for - * list removal. This generally implies that all our children - * have also been removed (modulo rounding error or bandwidth - * control); however, such cases are rare and we can fix these - * at enqueue. - * - * TODO: fix up out-of-order children on enqueue. - */ - if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) - list_del_leaf_cfs_rq(cfs_rq); - } else { - struct rq *rq = rq_of(cfs_rq); - update_rq_runnable_avg(rq, rq->nr_running); - } -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* - * Note: We may want to consider periodically releasing - * rq->lock about these updates so that creating many task - * groups does not result in continually extending hold time. - */ - __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - } + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + update_tg_load_avg(cfs_rq, 0); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg_contrib, - cfs_rq->runnable_load_avg + 1); + load = div64_ul(load * se->avg.load_avg, + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p) struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); - return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, - cfs_rq->runnable_load_avg + 1); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq_load_avg(cfs_rq) + 1); } #else static inline void update_blocked_averages(int cpu) { + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static unsigned long task_h_load(struct task_struct *p) { - return p->se.avg.load_avg_contrib; + return p->se.avg.load_avg; } #endif @@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (numabalancing_enabled) task_tick_numa(rq, curr); - - update_rq_runnable_avg(rq, 1); } /* @@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMP - /* - * Remove our load from contribution when we leave sched_fair - * and ensure we don't carry in an old decay_count if we - * switch back. - */ - if (se->avg.decay_count) { - __synchronize_entity_decay(se); - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - } + /* Catch up with the cfs_rq and remove our load when we leave */ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + + cfs_rq->avg.load_avg = + max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = + max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = + max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = + max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); #endif } @@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *se = &p->se; + +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) + + if (!task_on_rq_queued(p)) { + + /* + * Ensure the task has a non-normalized vruntime when it is switched + * back to the fair class with !queued, so that enqueue_entity() at + * wake-up time will do the right thing. + * + * If it's queued, then the enqueue_entity(.flags=0) makes the task + * has non-normalized vruntime, if it's !queued, then it still has + * normalized vruntime. + */ + if (p->state != TASK_RUNNING) + se->vruntime += cfs_rq_of(se)->min_vruntime; return; + } /* * We were most likely switched from sched_rt, so @@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP - atomic64_set(&cfs_rq->decay_counter, 1); - atomic_long_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load_avg, 0); + atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } @@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; + #ifdef CONFIG_SMP - /* - * migrate_task_rq_fair() will have removed our previous - * contribution, but we must synchronize for ongoing future - * decay. - */ - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + /* Virtually synchronize task with its new cfs_rq */ + p->se.avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += p->se.avg.load_avg; + cfs_rq->avg.load_sum += p->se.avg.load_sum; + cfs_rq->avg.util_avg += p->se.avg.util_avg; + cfs_rq->avg.util_sum += p->se.avg.util_sum; #endif } } @@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) + if (tg->se) { + if (tg->se[i]) + remove_entity_load_avg(tg->se[i]); kfree(tg->se[i]); + } } kfree(tg->cfs_rq); @@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_entity_runnable_average(se); } return 1; @@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, + .task_dead = task_dead_fair, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 91e33cd485f6..83a50e7ca533 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false) * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) /* - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a - * higher number of hinting faults are recorded during active load - * balancing. + * NUMA will favor moving tasks towards nodes where a higher number of + * hinting faults are recorded during active load balancing. It will + * resist moving tasks towards nodes where a lower number of hinting + * faults have been recorded. */ -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) - -/* - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a - * lower number of hinting faults have been recorded. As this has - * the potential to prevent a task ever migrating to a new node - * due to CPU overload it is disabled by default. - */ -SCHED_FEAT(NUMA_RESIST_LOWER, false) +SCHED_FEAT(NUMA, true) #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 594275ed2620..8f177c73ae19 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void) */ void default_idle_call(void) { - if (current_clr_polling_and_test()) + if (current_clr_polling_and_test()) { local_irq_enable(); - else + } else { + stop_critical_timings(); arch_cpu_idle(); + start_critical_timings(); + } } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -140,12 +143,6 @@ static void cpuidle_idle_call(void) return; } - /* - * During the idle period, stop measuring the disabled irqs - * critical sections latencies - */ - stop_critical_timings(); - /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more @@ -198,7 +195,6 @@ static void cpuidle_idle_call(void) local_irq_enable(); rcu_idle_exit(); - start_critical_timings(); } DEFINE_PER_CPU(bool, cpu_dead_idle); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c65dac8c97cd..c4ae0f1fdf9b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0d193a243e96..d2ea59364a1c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || @@ -2077,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct rq *rq; - int weight; - - BUG_ON(!rt_task(p)); - - if (!task_on_rq_queued(p)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - rq = task_rq(p); - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_task(rq, p); - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_task(rq, p); - rq->rt.rt_nr_migratory++; - } - - update_rt_migration(&rq->rt); -} - /* Assumes rq->lock is held */ static void rq_online_rt(struct rq *rq) { @@ -2324,7 +2284,7 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_rt, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 84d48790bb6d..68cda117574c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -245,7 +245,6 @@ struct task_group { #ifdef CONFIG_SMP atomic_long_t load_avg; - atomic_t runnable_avg; #endif #endif @@ -366,27 +365,20 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * CFS Load tracking - * Under CFS, load is tracked on a per-entity basis and aggregated up. - * This allows for the description of both thread and group usage (in - * the FAIR_GROUP_SCHED case). - * runnable_load_avg is the sum of the load_avg_contrib of the - * sched_entities on the rq. - * blocked_load_avg is similar to runnable_load_avg except that its - * the blocked sched_entities on the rq. - * utilization_load_avg is the sum of the average running time of the - * sched_entities on the rq. + * CFS load tracking */ - unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; - atomic64_t decay_counter; - u64 last_decay; - atomic_long_t removed_load; + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED - /* Required to track per-cpu representation of a task_group */ - u32 tg_runnable_contrib; - unsigned long tg_load_contrib; - /* * h_load = weight * f(tg) * @@ -595,8 +587,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; - - struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -1065,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) #endif @@ -1268,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + #else static inline void idle_enter_fair(struct rq *rq) { } @@ -1319,7 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void init_task_runnable_average(struct task_struct *p); +extern void init_entity_runnable_average(struct sched_entity *se); static inline void add_nr_running(struct rq *rq, unsigned count) { diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 79ffec45a6ac..cbc67da10954 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_stop, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index fd643d8c4b42..12484e5d5c88 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,13 +35,16 @@ struct cpu_stop_done { /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { + struct task_struct *thread; + spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ + + struct cpu_stop_work stop_work; /* for stop_cpus */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); -static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; /* @@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p = per_cpu(cpu_stopper_task, cpu); unsigned long flags; @@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); - wake_up_process(p); + wake_up_process(stopper->thread); } else cpu_stop_signal_done(work->done, false); @@ -139,7 +141,7 @@ enum multi_stop_state { }; struct multi_stop_data { - int (*fn)(void *); + cpu_stop_fn_t fn; void *data; /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ unsigned int num_threads; @@ -293,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, @@ -302,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, struct cpu_stop_work *work; unsigned int cpu; - /* initialize works and done */ - for_each_cpu(cpu, cpumask) { - work = &per_cpu(stop_cpus_work, cpu); - work->fn = fn; - work->arg = arg; - work->done = done; - } - /* * Disable preemption while queueing to avoid getting * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ lg_global_lock(&stop_cpus_lock); - for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); + for_each_cpu(cpu, cpumask) { + work = &per_cpu(cpu_stopper.stop_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = done; + cpu_stop_queue_work(cpu, work); + } lg_global_unlock(&stop_cpus_lock); } @@ -458,19 +456,21 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) { - sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); + sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); } static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work; + struct cpu_stop_work *work, *tmp; unsigned long flags; /* drain remaining works */ spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry(work, &stopper->works, list) + list_for_each_entry_safe(work, tmp, &stopper->works, list) { + list_del_init(&work->list); cpu_stop_signal_done(work->done, false); + } stopper->enabled = false; spin_unlock_irqrestore(&stopper->lock, flags); } @@ -485,7 +485,7 @@ static void cpu_stop_unpark(unsigned int cpu) } static struct smp_hotplug_thread cpu_stop_threads = { - .store = &cpu_stopper_task, + .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", @@ -515,7 +515,7 @@ early_initcall(cpu_stop_init); #ifdef CONFIG_STOP_MACHINE -int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -548,7 +548,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } -int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { int ret; @@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(stop_machine); * 0 if all executions of @fn returned 0, any non zero return value if any * returned non zero. */ -int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, +int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 419ca37e72c9..f270088e9929 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n } static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9b33dd117f3f..12cbe77b4136 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(void *ignore, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cb91c63b4f4a..811edb77dd6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) goto fail; set_user_nice(worker->task, pool->attrs->nice); - - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(worker->task, pool->attrs->cpumask); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); }