diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst index 9d9be52f221a..9fe4846079bb 100644 --- a/Documentation/scheduler/sched-deadline.rst +++ b/Documentation/scheduler/sched-deadline.rst @@ -203,12 +203,15 @@ Deadline Task Scheduling - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the runqueue, including the tasks in Inactive state. + - Maximum usable bandwidth (max_bw): This is the maximum bandwidth usable by + deadline tasks and is currently set to the RT capacity. + The algorithm reclaims the bandwidth of the tasks in Inactive state. It does so by decrementing the runtime of the executing task Ti at a pace equal to - dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt + dq = -(max{ Ui, (Umax - Uinact - Uextra) } / Umax) dt where: diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index af1fafbe7e1d..934c658ee947 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -88,13 +88,7 @@ static inline notrace u64 arch_timer_read_cntvct_el0(void) #define arch_timer_reg_read_stable(reg) \ ({ \ - u64 _val; \ - \ - preempt_disable_notrace(); \ - _val = erratum_handler(read_ ## reg)(); \ - preempt_enable_notrace(); \ - \ - _val; \ + erratum_handler(read_ ## reg)(); \ }) /* diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 877495a0fd0c..51d92abf945e 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -22,13 +22,13 @@ * Generic IO read/write. These perform native-endian accesses. */ #define __raw_writeb __raw_writeb -static inline void __raw_writeb(u8 val, volatile void __iomem *addr) +static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr) { asm volatile("strb %w0, [%1]" : : "rZ" (val), "r" (addr)); } #define __raw_writew __raw_writew -static inline void __raw_writew(u16 val, volatile void __iomem *addr) +static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr) { asm volatile("strh %w0, [%1]" : : "rZ" (val), "r" (addr)); } @@ -40,13 +40,13 @@ static __always_inline void __raw_writel(u32 val, volatile void __iomem *addr) } #define __raw_writeq __raw_writeq -static inline void __raw_writeq(u64 val, volatile void __iomem *addr) +static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) { asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); } #define __raw_readb __raw_readb -static inline u8 __raw_readb(const volatile void __iomem *addr) +static __always_inline u8 __raw_readb(const volatile void __iomem *addr) { u8 val; asm volatile(ALTERNATIVE("ldrb %w0, [%1]", @@ -57,7 +57,7 @@ static inline u8 __raw_readb(const volatile void __iomem *addr) } #define __raw_readw __raw_readw -static inline u16 __raw_readw(const volatile void __iomem *addr) +static __always_inline u16 __raw_readw(const volatile void __iomem *addr) { u16 val; @@ -80,7 +80,7 @@ static __always_inline u32 __raw_readl(const volatile void __iomem *addr) } #define __raw_readq __raw_readq -static inline u64 __raw_readq(const volatile void __iomem *addr) +static __always_inline u64 __raw_readq(const volatile void __iomem *addr) { u64 val; asm volatile(ALTERNATIVE("ldr %0, [%1]", diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 35e8a52fea11..1c2a0a2c8830 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -1167,7 +1167,7 @@ static __always_inline void iocsr_write64(u64 val, u32 
reg) #ifndef __ASSEMBLY__ -static inline u64 drdtime(void) +static __always_inline u64 drdtime(void) { int rID = 0; u64 val = 0; diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index f377e50f3c66..c189e03cd5da 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -190,9 +190,9 @@ static u64 read_const_counter(struct clocksource *clk) return drdtime(); } -static u64 native_sched_clock(void) +static noinstr u64 sched_clock_read(void) { - return read_const_counter(NULL); + return drdtime(); } static struct clocksource clocksource_const = { @@ -211,7 +211,7 @@ int __init constant_clocksource_init(void) res = clocksource_register_hz(&clocksource_const, freq); - sched_clock_register(native_sched_clock, 64, freq); + sched_clock_register(sched_clock_read, 64, freq); pr_info("Constant clock source device register\n"); diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index ce878e85b6e4..4d646659a5f5 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -63,7 +63,7 @@ static inline int store_tod_clock_ext_cc(union tod_clock *clk) return cc; } -static inline void store_tod_clock_ext(union tod_clock *tod) +static __always_inline void store_tod_clock_ext(union tod_clock *tod) { asm volatile("stcke %0" : "=Q" (*tod) : : "cc"); } @@ -177,7 +177,7 @@ static inline void local_tick_enable(unsigned long comp) typedef unsigned long cycles_t; -static inline unsigned long get_tod_clock(void) +static __always_inline unsigned long get_tod_clock(void) { union tod_clock clk; @@ -204,6 +204,11 @@ void init_cpu_timer(void); extern union tod_clock tod_clock_base; +static __always_inline unsigned long __get_tod_clock_monotonic(void) +{ + return get_tod_clock() - tod_clock_base.tod; +} + /** * get_clock_monotonic - returns current time in clock rate units * @@ -216,7 +221,7 @@ static inline unsigned long get_tod_clock_monotonic(void) unsigned long tod; preempt_disable_notrace(); - tod = get_tod_clock() - tod_clock_base.tod; + tod = __get_tod_clock_monotonic(); preempt_enable_notrace(); return tod; } @@ -240,7 +245,7 @@ static inline unsigned long get_tod_clock_monotonic(void) * -> ns = (th * 125) + ((tl * 125) >> 9); * */ -static inline unsigned long tod_to_ns(unsigned long todval) +static __always_inline unsigned long tod_to_ns(unsigned long todval) { return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 6b7b6d5e3632..276278199c44 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -102,6 +102,11 @@ void __init time_early_init(void) ((long) qui.old_leap * 4096000000L); } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return tod_to_ns(__get_tod_clock_monotonic()); +} + /* * Scheduler clock - returns current time in nanosec units. 
*/
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 49bb4f2bd300..88d9ef98e087 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -257,6 +257,11 @@ void hv_set_register(unsigned int reg, u64 value);
u64 hv_get_non_nested_register(unsigned int reg);
void hv_set_non_nested_register(unsigned int reg, u64 value);
+static __always_inline u64 hv_raw_get_register(unsigned int reg)
+{
+ return __rdmsr(reg);
+}
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 4cf6794f9d68..c81858d903dc 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -231,14 +231,19 @@ static u64 vread_pvclock(void)
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
- return ret;
+ return ret & S64_MAX;
}
#endif
#ifdef CONFIG_HYPERV_TIMER
static u64 vread_hvclock(void)
{
- return hv_read_tsc_page(&hvclock_page);
+ u64 tsc, time;
+
+ if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
+ return time & S64_MAX;
+
+ return U64_MAX;
}
#endif
@@ -246,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode,
const struct vdso_data *vd)
{
if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
- return (u64)rdtsc_ordered();
+ return (u64)rdtsc_ordered() & S64_MAX;
/*
* For any memory-mapped vclock type, we need to make sure that gcc
* doesn't cleverly hoist a load before the mode check. Otherwise we
@@ -284,6 +289,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
* which can be invalidated asynchronously and indicate invalidation by
* returning U64_MAX, which can be effectively tested by checking for a
* negative value after casting it to s64.
+ *
+ * This effectively forces an S64_MAX mask on the calculations, unlike the
+ * U64_MAX mask normally used by x86 clocksources.
*/
static inline bool arch_vdso_cycles_ok(u64 cycles)
{
@@ -303,18 +311,29 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* @last. If not then use @last, which is the base time of the current
* conversion period.
*
- * This variant also removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
+ * This variant also uses a custom mask because while the clocksource mask of
+ * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
+ * U64_MAX as an exception value; additionally, arch_vdso_cycles_ok() above
+ * declares everything with the MSB/Sign-bit set as invalid. Therefore the
+ * effective mask is S64_MAX.
*/
static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
- if (cycles > last)
- return (cycles - last) * mult;
- return 0;
+ /*
+ * Due to the MSB/Sign-bit being used as invalid marker (see
+ * arch_vdso_cycles_ok() above), the effective mask is S64_MAX.
+ */
+ u64 delta = (cycles - last) & S64_MAX;
+
+ /*
+ * Due to the above mentioned TSC wobbles, filter out negative motion.
+ * Per the above masking, the effective sign bit is now bit 62.
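+ *
+ * Purely as an illustration (these numbers are not part of the change
+ * itself): if a TSC wobble makes cycles lag last by 1, (cycles - last)
+ * wraps to 0xffffffffffffffff and the S64_MAX mask turns that into
+ * 0x7fffffffffffffff, which has bit 62 set, so the check below yields 0
+ * instead of a huge bogus delta.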
+ */ + if (unlikely(delta & (1ULL << 62))) + return 0; + + return delta * mult; } #define vdso_calc_delta vdso_calc_delta diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 670eb08b972a..ee4fe8cdb857 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu) /** * sched_set_itmt_core_prio() - Set CPU priority based on ITMT - * @prio: Priority of cpu core - * @core_cpu: The cpu number associated with the core + * @prio: Priority of @cpu + * @cpu: The CPU number * * The pstate driver will find out the max boost frequency * and call this function to set a priority proportional - * to the max boost frequency. CPU with higher boost + * to the max boost frequency. CPUs with higher boost * frequency will receive higher priority. * * No need to rebuild sched domain after updating * the CPU priorities. The sched domains have no * dependency on CPU priorities. */ -void sched_set_itmt_core_prio(int prio, int core_cpu) +void sched_set_itmt_core_prio(int prio, int cpu) { - int cpu, i = 1; - - for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { - int smt_prio; - - /* - * Ensure that the siblings are moved to the end - * of the priority chain and only used when - * all other high priority cpus are out of capacity. - */ - smt_prio = prio * smp_num_siblings / (i * i); - per_cpu(sched_core_priority, cpu) = smt_prio; - i++; - } + per_cpu(sched_core_priority, cpu) = prio; } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 0f35d44c56fe..fb8f52149be9 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now) return -ENODEV; } -static noinstr u64 kvm_clock_read(void) +static u64 kvm_clock_read(void) { u64 ret; @@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs) static noinstr u64 kvm_sched_clock_read(void) { - return kvm_clock_read() - kvm_sched_clock_offset; + return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset; } static inline void kvm_sched_clock_init(bool stable) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8779a7ed3e87..ed2d51960a7d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -602,7 +602,7 @@ static int x86_core_flags(void) #ifdef CONFIG_SCHED_SMT static int x86_smt_flags(void) { - return cpu_smt_flags() | x86_sched_itmt_flags(); + return cpu_smt_flags(); } #endif #ifdef CONFIG_SCHED_CLUSTER @@ -613,44 +613,6 @@ static int x86_cluster_flags(void) #endif #endif -static struct sched_domain_topology_level x86_numa_in_package_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { NULL, }, -}; - -static struct sched_domain_topology_level x86_hybrid_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -static struct sched_domain_topology_level x86_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, -#endif -#ifdef CONFIG_SCHED_MC 
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - /* * Set if a package/die has multiple NUMA nodes inside. * AMD Magny-Cours, Intel Cluster-on-Die, and Intel @@ -658,6 +620,51 @@ static struct sched_domain_topology_level x86_topology[] = { */ static bool x86_has_numa_in_package; +static struct sched_domain_topology_level x86_topology[6]; + +static void __init build_sched_topology(void) +{ + int i = 0; + +#ifdef CONFIG_SCHED_SMT + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + }; +#endif +#ifdef CONFIG_SCHED_CLUSTER + /* + * For now, skip the cluster domain on Hybrid. + */ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; + } +#endif +#ifdef CONFIG_SCHED_MC + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) + }; +#endif + /* + * When there is NUMA topology inside the package skip the DIE domain + * since the NUMA domains will auto-magically create the right spanning + * domains based on the SLIT. + */ + if (!x86_has_numa_in_package) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, SD_INIT_NAME(DIE) + }; + } + + /* + * There must be one trailing NULL entry left. + */ + BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + + set_sched_topology(x86_topology); +} + void set_cpu_sibling_map(int cpu) { bool has_smt = smp_num_siblings > 1; @@ -1264,15 +1271,6 @@ void __init smp_prepare_cpus_common(void) zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } - /* - * Set 'default' x86 topology, this matches default_topology() in that - * it has NUMA nodes as a topology level. See also - * native_smp_cpus_done(). - * - * Must be done before set_cpus_sibling_map() is ran. - */ - set_sched_topology(x86_topology); - set_cpu_sibling_map(0); } @@ -1393,13 +1391,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done\n"); calculate_max_logical_packages(); - - /* XXX for now assume numa-in-package and hybrid don't overlap */ - if (x86_has_numa_in_package) - set_sched_topology(x86_numa_in_package_topology); - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - set_sched_topology(x86_hybrid_topology); - + build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 1412b771651e..3425c6a943e4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf) } early_param("tsc_early_khz", tsc_early_khz_setup); -__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +__always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; - preempt_disable_notrace(); - do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; @@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +{ + preempt_disable_notrace(); + __cyc2ns_read(data); +} + __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); @@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
*/ -static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; - cyc2ns_read_begin(&data); + __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_read_end(); + return ns; +} +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = __cycles_2_ns(cyc); + preempt_enable_notrace(); return ns; } @@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void) u64 tsc_now = rdtsc(); /* return the value in ns */ - return cycles_2_ns(tsc_now); + return __cycles_2_ns(tsc_now); } /* @@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -noinstr u64 sched_clock(void) +noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } @@ -260,11 +271,20 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif +notrace u64 sched_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = sched_clock_noinstr(); + preempt_enable_notrace(); + return now; +} + int check_tsc_unstable(void) { return tsc_unstable; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04b57a336b34..bc68a39efd70 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2799,14 +2799,13 @@ static u64 read_tsc(void) static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, int *mode) { - long v; u64 tsc_pg_val; + long v; switch (clock->vclock_mode) { case VDSO_CLOCKMODE_HVCLOCK: - tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), - tsc_timestamp); - if (tsc_pg_val != U64_MAX) { + if (hv_read_tsc_page_tsc(hv_get_tsc_page(), + tsc_timestamp, &tsc_pg_val)) { /* TSC page valid */ *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b74ac2562cfb..52fa5609b7f6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void) struct pvclock_vcpu_time_info *src; u64 ret; - preempt_disable_notrace(); src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read_nowd(src); ret -= xen_sched_clock_offset; - preempt_enable_notrace(); + return ret; } diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index e09d4427f604..e733a2a1927a 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -191,22 +191,40 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, return val; } -static notrace u64 arch_counter_get_cntpct_stable(void) +static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); } -static notrace u64 arch_counter_get_cntpct(void) +static notrace u64 arch_counter_get_cntpct_stable(void) +{ + u64 val; + preempt_disable_notrace(); + val = __arch_counter_get_cntpct_stable(); + preempt_enable_notrace(); + return val; +} + +static noinstr u64 arch_counter_get_cntpct(void) { return __arch_counter_get_cntpct(); } -static notrace u64 arch_counter_get_cntvct_stable(void) +static 
noinstr u64 raw_counter_get_cntvct_stable(void) { return __arch_counter_get_cntvct_stable(); } -static notrace u64 arch_counter_get_cntvct(void) +static notrace u64 arch_counter_get_cntvct_stable(void) +{ + u64 val; + preempt_disable_notrace(); + val = __arch_counter_get_cntvct_stable(); + preempt_enable_notrace(); + return val; +} + +static noinstr u64 arch_counter_get_cntvct(void) { return __arch_counter_get_cntvct(); } @@ -753,14 +771,14 @@ static int arch_timer_set_next_event_phys(unsigned long evt, return 0; } -static u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) +static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) { u32 cnt_lo, cnt_hi, tmp_hi; do { - cnt_hi = readl_relaxed(t->base + offset_lo + 4); - cnt_lo = readl_relaxed(t->base + offset_lo); - tmp_hi = readl_relaxed(t->base + offset_lo + 4); + cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); + tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); } while (cnt_hi != tmp_hi); return ((u64) cnt_hi << 32) | cnt_lo; @@ -1060,7 +1078,7 @@ bool arch_timer_evtstrm_available(void) return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } -static u64 arch_counter_get_cntvct_mem(void) +static noinstr u64 arch_counter_get_cntvct_mem(void) { return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); } @@ -1074,6 +1092,7 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) static void __init arch_counter_register(unsigned type) { + u64 (*scr)(void); u64 start_count; int width; @@ -1083,21 +1102,28 @@ static void __init arch_counter_register(unsigned type) if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { - if (arch_timer_counter_has_wa()) + if (arch_timer_counter_has_wa()) { rd = arch_counter_get_cntvct_stable; - else + scr = raw_counter_get_cntvct_stable; + } else { rd = arch_counter_get_cntvct; + scr = arch_counter_get_cntvct; + } } else { - if (arch_timer_counter_has_wa()) + if (arch_timer_counter_has_wa()) { rd = arch_counter_get_cntpct_stable; - else + scr = raw_counter_get_cntpct_stable; + } else { rd = arch_counter_get_cntpct; + scr = arch_counter_get_cntpct; + } } arch_timer_read_counter = rd; clocksource_counter.vdso_clock_mode = vdso_default; } else { arch_timer_read_counter = arch_counter_get_cntvct_mem; + scr = arch_counter_get_cntvct_mem; } width = arch_counter_get_width(); @@ -1113,7 +1139,7 @@ static void __init arch_counter_register(unsigned type) timecounter_init(&arch_timer_kvm_info.timecounter, &cyclecounter, start_count); - sched_clock_register(arch_timer_read_counter, width, arch_timer_rate); + sched_clock_register(scr, width, arch_timer_rate); } static void arch_timer_stop(struct clock_event_device *clk) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 9fc008c16636..e56307a81f4d 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -365,6 +365,20 @@ void hv_stimer_global_cleanup(void) } EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); +static __always_inline u64 read_hv_clock_msr(void) +{ + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in 100 + * nanosecond units. + * + * Use hv_raw_get_register() because this function is used from + * noinstr. 
Notably, while HV_REGISTER_TIME_REF_COUNT is a synthetic
+ * register, it doesn't need the GHCB path.
+ */
+ return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT);
+}
+
/*
* Code and definitions for the Hyper-V clocksources. Two
* clocksources are defined: one that reads the Hyper-V defined MSR, and
@@ -393,14 +407,20 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);
-static u64 notrace read_hv_clock_tsc(void)
+static __always_inline u64 read_hv_clock_tsc(void)
{
- u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
+ u64 cur_tsc, time;
- if (current_tick == U64_MAX)
- current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+ /*
+ * The Hyper-V Top-Level Function Spec (TLFS), section Timers,
+ * subsection Reference Counter, guarantees that the TSC and MSR
+ * times are in sync and monotonic. Therefore we can fall back
+ * to the MSR in case the TSC page indicates unavailability.
+ */
+ if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
+ time = read_hv_clock_msr();
- return current_tick;
+ return time;
}
static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
@@ -408,7 +428,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
return read_hv_clock_tsc();
}
-static u64 notrace read_hv_sched_clock_tsc(void)
+static u64 noinstr read_hv_sched_clock_tsc(void)
{
return (read_hv_clock_tsc() - hv_sched_clock_offset) *
(NSEC_PER_SEC / HV_CLOCK_HZ);
@@ -460,16 +480,6 @@ static struct clocksource hyperv_cs_tsc = {
#endif
};
-static u64 notrace read_hv_clock_msr(void)
-{
- /*
- * Read the partition counter to get the current tick count. This count
- * is set to 0 when the partition is created and is incremented in
- * 100 nanosecond units.
- */
- return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
-}
-
static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
{
return read_hv_clock_msr();
}
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8e929f6602ce..737a026ef58a 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -145,7 +145,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
instrumentation_begin();
- time_start = ns_to_ktime(local_clock());
+ time_start = ns_to_ktime(local_clock_noinstr());
tick_freeze();
/*
@@ -169,7 +169,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
tick_unfreeze();
start_critical_timings();
- time_end = ns_to_ktime(local_clock());
+ time_end = ns_to_ktime(local_clock_noinstr());
dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
dev->states_usage[index].s2idle_usage++;
@@ -243,7 +243,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
sched_idle_set_state(target_state);
trace_cpu_idle(index, dev->cpu);
- time_start = ns_to_ktime(local_clock());
+ time_start = ns_to_ktime(local_clock_noinstr());
stop_critical_timings();
if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
@@ -276,7 +276,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
start_critical_timings();
sched_clock_idle_wakeup_event();
- time_end = ns_to_ktime(local_clock());
+ time_end = ns_to_ktime(local_clock_noinstr());
trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
/* The cpu is no longer idle or about to enter idle.
*/ diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index bdcfeaecd228..9b6d90a72601 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -15,7 +15,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, { u64 time_start; - time_start = local_clock(); + time_start = local_clock_noinstr(); dev->poll_time_limit = false; @@ -32,7 +32,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, continue; loop_count = 0; - if (local_clock() - time_start > limit) { + if (local_clock_noinstr() - time_start > limit) { dev->poll_time_limit = true; break; } diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 536f897375d0..6cdc873ac907 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -38,8 +38,9 @@ extern void hv_remap_tsc_clocksource(void); extern unsigned long hv_get_tsc_pfn(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline notrace u64 -hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) +static __always_inline bool +hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc, u64 *time) { u64 scale, offset; u32 sequence; @@ -63,7 +64,7 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) do { sequence = READ_ONCE(tsc_pg->tsc_sequence); if (!sequence) - return U64_MAX; + return false; /* * Make sure we read sequence before we read other values from * TSC page. @@ -82,15 +83,8 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); - return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; -} - -static inline notrace u64 -hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) -{ - u64 cur_tsc; - - return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); + *time = mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; + return true; } #else /* CONFIG_HYPERV_TIMER */ @@ -104,10 +98,10 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) return NULL; } -static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, - u64 *cur_tsc) +static __always_inline bool +hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time) { - return U64_MAX; + return false; } static inline int hv_stimer_cleanup(unsigned int cpu) { return 0; } diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 30e5bec81d2b..f1f95a71a4bc 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -89,6 +89,7 @@ int kthread_stop(struct task_struct *k); bool kthread_should_stop(void); bool kthread_should_park(void); bool __kthread_should_park(struct task_struct *k); +bool kthread_should_stop_or_park(void); bool kthread_freezable_should_stop(bool *was_frozen); void *kthread_func(struct task_struct *k); void *kthread_data(struct task_struct *k); diff --git a/include/linux/math64.h b/include/linux/math64.h index 8b9191a2849e..bf74478926d4 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -168,7 +168,7 @@ static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr -static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) +static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h index 3d1a9e716b80..6a0999c26c7c 100644 
--- a/include/linux/rbtree_latch.h +++ b/include/linux/rbtree_latch.h @@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root, do { seq = raw_read_seqcount_latch(&root->seq); node = __lt_find(key, root, seq & 1, ops->comp); - } while (read_seqcount_latch_retry(&root->seq, seq)); + } while (raw_read_seqcount_latch_retry(&root->seq, seq)); return node; } diff --git a/include/linux/sched.h b/include/linux/sched.h index eed5d65b8d1f..1292d38d66cc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2006,15 +2006,12 @@ static __always_inline void scheduler_ipi(void) */ preempt_fold_need_resched(); } -extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); #else static inline void scheduler_ipi(void) { } -static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -{ - return 1; -} #endif +extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); + /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index ca008f7d3615..196f0ca351a2 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -12,7 +12,16 @@ * * Please use one of the three interfaces below. */ -extern unsigned long long notrace sched_clock(void); +extern u64 sched_clock(void); + +#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK) +extern u64 sched_clock_noinstr(void); +#else +static __always_inline u64 sched_clock_noinstr(void) +{ + return sched_clock(); +} +#endif /* * See the comment in kernel/sched/clock.c @@ -45,6 +54,11 @@ static inline u64 cpu_clock(int cpu) return sched_clock(); } +static __always_inline u64 local_clock_noinstr(void) +{ + return sched_clock_noinstr(); +} + static __always_inline u64 local_clock(void) { return sched_clock(); @@ -79,6 +93,7 @@ static inline u64 cpu_clock(int cpu) return sched_clock_cpu(cpu); } +extern u64 local_clock_noinstr(void); extern u64 local_clock(void); #endif diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index 57bde66d95f7..fad77b5172e2 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) /* * Place busy tasks earlier in the domain * - * SHARED_CHILD: Usually set on the SMT level. Technically could be set further - * up, but currently assumed to be set from the base domain - * upwards (see update_top_cache_domain()). * NEEDS_GROUPS: Load balancing flag. */ -SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) /* * Prefer to place tasks in a sibling domain diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 816df6cc444e..67b573d5bf28 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -203,7 +203,7 @@ struct sched_domain_topology_level { #endif }; -extern void set_sched_topology(struct sched_domain_topology_level *tl); +extern void __init set_sched_topology(struct sched_domain_topology_level *tl); #ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 3926e9027947..987a59d977c5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -671,9 +671,9 @@ typedef struct { * * Return: sequence counter raw value. 
Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked - * with read_seqcount_latch_retry(). + * with raw_read_seqcount_latch_retry(). */ -static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) +static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) { /* * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). @@ -683,16 +683,17 @@ static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) } /** - * read_seqcount_latch_retry() - end a seqcount_latch_t read section + * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from raw_read_seqcount_latch() * * Return: true if a read section retry is required, else false */ -static inline int -read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) +static __always_inline int +raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { - return read_seqcount_retry(&s->seqcount, start); + smp_rmb(); + return unlikely(READ_ONCE(s->seqcount.sequence) != start); } /** @@ -752,7 +753,7 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() - * } while (read_seqcount_latch_retry(&latch->seq, seq)); + * } while (raw_read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4d42f0cbc11e..8f917f682f52 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3891,6 +3891,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } +static int cgroup_pressure_open(struct kernfs_open_file *of) +{ + if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return 0; +} + static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5290,6 +5298,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), + .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5298,6 +5307,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), + .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5306,6 +5316,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), + .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5315,6 +5326,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, diff --git a/kernel/kthread.c b/kernel/kthread.c index 490792b1066e..07a057086d26 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -182,6 +182,16 @@ bool kthread_should_park(void) } EXPORT_SYMBOL_GPL(kthread_should_park); +bool kthread_should_stop_or_park(void) +{ + struct kthread *kthread = __to_kthread(current); + + if (!kthread) + return false; + 
+ return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); +} + /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6a333adce3b3..357a4d18f638 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls) seq = raw_read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; - } while (read_seqcount_latch_retry(&ls->latch, seq)); + } while (raw_read_seqcount_latch_retry(&ls->latch, seq)); return val; } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b5cc2b53464d..5a575a0ba4e6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) s64 delta; again: - now = sched_clock(); + now = sched_clock_noinstr(); delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; @@ -293,22 +293,29 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) return clock; } -noinstr u64 local_clock(void) +noinstr u64 local_clock_noinstr(void) { u64 clock; if (static_branch_likely(&__sched_clock_stable)) - return sched_clock() + __sched_clock_offset; + return sched_clock_noinstr() + __sched_clock_offset; if (!static_branch_likely(&sched_clock_running)) - return sched_clock(); + return sched_clock_noinstr(); - preempt_disable_notrace(); clock = sched_clock_local(this_scd()); - preempt_enable_notrace(); return clock; } + +u64 local_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = local_clock_noinstr(); + preempt_enable_notrace(); + return now; +} EXPORT_SYMBOL_GPL(local_clock); static notrace u64 sched_clock_remote(struct sched_clock_data *scd) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a68d1276bab0..7eb6e2927390 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq); } +static __always_inline +int __task_state_match(struct task_struct *p, unsigned int state) +{ + if (READ_ONCE(p->__state) & state) + return 1; + +#ifdef CONFIG_PREEMPT_RT + if (READ_ONCE(p->saved_state) & state) + return -1; +#endif + return 0; +} + +static __always_inline +int task_state_match(struct task_struct *p, unsigned int state) +{ +#ifdef CONFIG_PREEMPT_RT + int match; + + /* + * Serialize against current_save_and_set_rtlock_wait_state() and + * current_restore_rtlock_saved_state(). + */ + raw_spin_lock_irq(&p->pi_lock); + match = __task_state_match(p, state); + raw_spin_unlock_irq(&p->pi_lock); + + return match; +#else + return __task_state_match(p, state); +#endif +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. 
This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) +{ + int running, queued, match; + struct rq_flags rf; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_on_cpu()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_on_cpu(rq, p)) { + if (!task_state_match(p, match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &rf); + trace_sched_wait_task(p); + running = task_on_cpu(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if ((match = __task_state_match(p, match_state))) { + /* + * When matching on p->saved_state, consider this task + * still queued so it will wait. + */ + if (match < 0) + queued = 1; + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + } + task_rq_unlock(rq, p, &rf); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } + + return ncsw; +} + #ifdef CONFIG_SMP static void @@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, if (!is_cpu_allowed(p, dest_cpu)) return rq; - update_rq_clock(rq); rq = move_queued_task(rq, rf, p, dest_cpu); return rq; @@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data) goto out; } - if (task_on_rq_queued(p)) + if (task_on_rq_queued(p)) { + update_rq_clock(rq); rq = __migrate_task(rq, &rf, p, arg->dest_cpu); - else + } else { p->wake_cpu = arg->dest_cpu; + } /* * XXX __migrate_task() can fail, at which point we might end @@ -3341,114 +3490,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, } #endif /* CONFIG_NUMA_BALANCING */ -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * Wait for the thread to block in any of the states set in @match_state. - * If it changes, i.e. @p might have woken up, then return zero. 
When we - * succeed in waiting for @p to be off its CPU, we return a positive number - * (its total switch count). If a second call a short while later returns the - * same number, the caller can be sure that @p has remained unscheduled the - * whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -{ - int running, queued; - struct rq_flags rf; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_on_cpu()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_on_cpu(rq, p)) { - if (!(READ_ONCE(p->__state) & match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &rf); - trace_sched_wait_task(p); - running = task_on_cpu(rq, p); - queued = task_on_rq_queued(p); - ncsw = 0; - if (READ_ONCE(p->__state) & match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &rf); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(queued)) { - ktime_t to = NSEC_PER_SEC / HZ; - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! 
- */ - break; - } - - return ncsw; -} - /*** * kick_process - kick a running thread to enter/exit the kernel * @p: the to-be-kicked thread @@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) { + int match; + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && state != TASK_RTLOCK_WAIT); } - if (READ_ONCE(p->__state) & state) { - *success = 1; - return true; - } + *success = !!(match = __task_state_match(p, state)); #ifdef CONFIG_PREEMPT_RT /* @@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * p::saved_state to TASK_RUNNING so any further tests will * not result in false positives vs. @success */ - if (p->saved_state & state) { + if (match < 0) p->saved_state = TASK_RUNNING; - *success = 1; - } #endif - return false; + return match > 0; } /* @@ -9548,6 +9586,7 @@ void set_rq_offline(struct rq *rq) if (rq->online) { const struct sched_class *class; + update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) class->rq_offline(rq); @@ -9689,7 +9728,6 @@ int sched_cpu_deactivate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { - update_rq_clock(rq); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..4492608b7d7f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, static void sugov_get_util(struct sugov_cpu *sg_cpu) { + unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); struct rq *rq = cpu_rq(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a9a4b81c972..e41a36bd66a6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -489,13 +489,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) -{ - raw_spin_lock_init(&dl_b->dl_runtime_lock); - dl_b->dl_period = period; - dl_b->dl_runtime = runtime; -} - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); @@ -1260,43 +1253,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) } /* - * This function implements the GRUB accounting rule: - * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as - * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * This function implements the GRUB accounting rule. According to the + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt", + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt", * where u is the utilization of the task, Umax is the maximum reclaimable * utilization, Uinact is the (per-runqueue) inactive utilization, computed * as the difference between the "total runqueue utilization" and the - * runqueue active utilization, and Uextra is the (per runqueue) extra + * "runqueue active utilization", and Uextra is the (per runqueue) extra * reclaimable utilization. 
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
- u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
- * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- * we compare u_inact + rq->dl.extra_bw with
- * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- * u_inact + rq->dl.extra_bw can be larger than
- * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- * leading to wrong results)
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
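+ *
+ * A purely illustrative example (numbers are not part of this patch):
+ * with u_max = 0.95, u = 0.4, u_inact = 0.2 and u_extra = 0.1,
+ * u_max - u_inact - u_extra = 0.65 > u, so the running task is charged
+ * 0.65 / 0.95 ~= 0.68 of the elapsed wall time, reclaiming the bandwidth
+ * left unused by the other tasks. If instead u_inact + u_extra were 0.7,
+ * then u_max - u_inact - u_extra = 0.25 < u and the task would be charged
+ * its own u / u_max = 0.4 / 0.95 ~= 0.42.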
*/
- if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- u_act = u_act_min;
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
else
- u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
@@ -2795,12 +2784,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- dl_rq->extra_bw = 1 << BW_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- dl_rq->extra_bw = to_ratio(global_rt_period(),
- global_rt_runtime());
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0b2340a79b65..066ff1c8ae4e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..a80a73909dc2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@@ -1700,23 +1717,6 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(sibling))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -5576,6 +5576,14 @@ static void __cfsb_csd_unthrottle(void *arg)
rq_lock(rq, &rf);
+ /*
+ * Iterating over the list can trigger several calls to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
+
/*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
@@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg)
rcu_read_unlock();
+ rq_clock_stop_loop_update(rq);
rq_unlock(rq, &rf);
}
@@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /*
+ * The rq clock has already been updated in set_rq_offline(),
+ * so we should skip updating the rq clock again in unthrottle_cfs_rq().
+ */ + rq_clock_start_loop_update(rq); + rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; @@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu - * (@dst_cpu = -1) or migrated to @dst_cpu. +/** + * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for + * @p: task for which the CPU utilization should be predicted or NULL + * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL + * @boost: 1 to enable boosting, otherwise 0 + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). + * CPU contention for CFS tasks can be detected by CPU runnable > CPU + * utilization. Boosting is implemented in cpu_util() so that internal + * users (e.g. EAS) can use it next to external users (e.g. schedutil), + * latter via cpu_util_cfs_boost(). + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Boosted) (estimated) utilization for the specified CPU. */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +static unsigned long +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long runnable; + + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + util = max(util, runnable); + } /* * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its @@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) * contribution. 
In all the other cases @cpu is not impacted by the * migration so its util_avg is already correct. */ - if (task_cpu(p) == cpu && dst_cpu != cpu) + if (p && task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) util += task_util(p); if (sched_feat(UTIL_EST)) { @@ -7227,6 +7289,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + if (boost) + util_est = max(util_est, runnable); + /* * During wake-up @p isn't enqueued yet and doesn't contribute * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. @@ -7255,7 +7320,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) */ if (dst_cpu == cpu) util_est += _task_util_est(p); - else if (unlikely(task_on_rq_queued(p) || current == p)) + else if (p && unlikely(task_on_rq_queued(p) || current == p)) lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); @@ -7264,6 +7329,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +unsigned long cpu_util_cfs(int cpu) +{ + return cpu_util(cpu, NULL, -1, 0); +} + +unsigned long cpu_util_cfs_boost(int cpu) +{ + return cpu_util(cpu, NULL, -1, 1); +} + /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested @@ -7281,9 +7356,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); + p = NULL; - return cpu_util_next(cpu, p, -1); + return cpu_util(cpu, p, -1, 0); } /* @@ -7330,7 +7405,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv, * cpu_capacity. * * The contribution of the task @p for which we want to estimate the - * energy cost is removed (by cpu_util_next()) and must be calculated + * energy cost is removed (by cpu_util()) and must be calculated * separately (see eenv_task_busy_time). This ensures: * * - A stable PD utilization, no matter which CPU of that PD we want to place @@ -7351,7 +7426,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, int cpu; for_each_cpu(cpu, pd_cpus) { - unsigned long util = cpu_util_next(cpu, p, -1); + unsigned long util = cpu_util(cpu, p, -1, 0); busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); } @@ -7375,8 +7450,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; - unsigned long util = cpu_util_next(cpu, p, dst_cpu); - unsigned long cpu_util; + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); + unsigned long eff_util; /* * Performance domain frequency: utilization clamping @@ -7385,8 +7460,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
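The @p/@dst_cpu handling above predicts utilization for a hypothetical migration by moving the task's contribution between CPUs. A minimal model of just that adjustment, with invented names and numbers:

#include <stdio.h>

static unsigned long sub_positive(unsigned long a, unsigned long b)
{
	return a > b ? a - b : 0;	/* underflow-safe, in the spirit of lsub_positive() */
}

static unsigned long predict_cpu_util(unsigned long cpu_util_avg,
				      unsigned long task_util,
				      int task_cpu, int this_cpu, int dst_cpu)
{
	unsigned long util = cpu_util_avg;

	if (task_cpu == this_cpu && dst_cpu != this_cpu)
		util = sub_positive(util, task_util);	/* task leaves @this_cpu */
	else if (task_cpu != this_cpu && dst_cpu == this_cpu)
		util += task_util;			/* task arrives on @this_cpu */

	return util;
}

int main(void)
{
	/* a 200-unit task currently on CPU0, evaluated for a move to CPU1 */
	printf("cpu0: %lu\n", predict_cpu_util(700, 200, 0, 0, 1));	/* 500 */
	printf("cpu1: %lu\n", predict_cpu_util(300, 200, 0, 1, 1));	/* 500 */
	return 0;
}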
*/ - cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + max_util = max(max_util, eff_util); } return min(max_util, eenv->cpu_cap); @@ -7521,7 +7596,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - util = cpu_util_next(cpu, p, cpu); + util = cpu_util(cpu, p, cpu, 0); cpu_cap = capacity_of(cpu); /* @@ -9331,96 +9406,61 @@ group_type group_classify(unsigned int imbalance_pct, } /** - * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks - * @dst_cpu: Destination CPU of the load balancing - * @sds: Load-balancing data with statistics of the local group - * @sgs: Load-balancing statistics of the candidate busiest group - * @sg: The candidate busiest group + * sched_use_asym_prio - Check whether asym_packing priority must be used + * @sd: The scheduling domain of the load balancing + * @cpu: A CPU * - * Check the state of the SMT siblings of both @sds::local and @sg and decide - * if @dst_cpu can pull tasks. + * Always use CPU priority when balancing load between SMT siblings. When + * balancing load between cores, it is not sufficient that @cpu is idle. Only + * use CPU priority if the whole core is idle. * - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks - * only if @dst_cpu has higher priority. - * - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. - * Bigger imbalances in the number of busy CPUs will be dealt with in - * update_sd_pick_busiest(). - * - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings - * of @dst_cpu are idle and @sg has lower priority. - * - * Return: true if @dst_cpu can pull tasks, false otherwise. + * Returns: True if the priority of @cpu must be followed. False otherwise. */ -static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, - struct sg_lb_stats *sgs, - struct sched_group *sg) +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { -#ifdef CONFIG_SCHED_SMT - bool local_is_smt, sg_is_smt; - int sg_busy_cpus; + if (!sched_smt_active()) + return true; - local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; - - if (!local_is_smt) { - /* - * If we are here, @dst_cpu is idle and does not have SMT - * siblings. Pull tasks if candidate group has two or more - * busy CPUs. - */ - if (sg_busy_cpus >= 2) /* implies sg_is_smt */ - return true; - - /* - * @dst_cpu does not have SMT siblings. @sg may have SMT - * siblings and only one is busy. In such case, @dst_cpu - * can help if it has higher priority and is idle (i.e., - * it has no running tasks). - */ - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - } - - /* @dst_cpu has SMT siblings. */ - - if (sg_is_smt) { - int local_busy_cpus = sds->local->group_weight - - sds->local_stat.idle_cpus; - int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; - - if (busy_cpus_delta == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - - return false; - } - - /* - * @sg does not have SMT siblings. 
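The rule encoded by sched_use_asym_prio() above is: between SMT siblings CPU priority is always followed, between cores only when the whole destination core is idle. A user-space sketch of that rule, with an invented core layout:

#include <stdbool.h>
#include <stdio.h>

#define NR_SIBLINGS 2

struct core_model {
	bool sibling_busy[NR_SIBLINGS];
};

static bool core_is_idle(const struct core_model *core)
{
	for (int i = 0; i < NR_SIBLINGS; i++)
		if (core->sibling_busy[i])
			return false;
	return true;
}

static bool use_asym_prio(bool balancing_within_smt, const struct core_model *dst_core)
{
	/* inside a core, priorities always apply; across cores, only if idle */
	return balancing_within_smt || core_is_idle(dst_core);
}

int main(void)
{
	struct core_model half_busy = { .sibling_busy = { true, false } };
	struct core_model idle      = { .sibling_busy = { false, false } };

	printf("%d %d %d\n",
	       use_asym_prio(true,  &half_busy),	/* 1: SMT level */
	       use_asym_prio(false, &half_busy),	/* 0: core not fully idle */
	       use_asym_prio(false, &idle));		/* 1: whole core idle */
	return 0;
}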
Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). - */ - if (!sds->local_stat.sum_nr_running) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - - return false; -#else - /* Always return false so that callers deal with non-SMT cases. */ - return false; -#endif + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); } +/** + * sched_asym - Check if the destination CPU can do asym_packing load balance + * @env: The load balancing environment + * @sds: Load-balancing data with statistics of the local group + * @sgs: Load-balancing statistics of the candidate busiest group + * @group: The candidate busiest group + * + * @env::dst_cpu can do asym_packing if it has higher priority than the + * preferred CPU of @group. + * + * SMT is a special case. If we are balancing load between cores, @env::dst_cpu + * can do asym_packing balance only if all its SMT siblings are idle. Also, it + * can only do it if @group is an SMT group and has exactly on busy CPU. Larger + * imbalances in the number of CPUS are dealt with in find_busiest_group(). + * + * If we are balancing load within an SMT core, or at DIE domain level, always + * proceed. + * + * Return: true if @env::dst_cpu can do with asym_packing load balance. False + * otherwise. + */ static inline bool sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, struct sched_group *group) { - /* Only do SMT checks if either local or candidate have SMT siblings */ - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || - (group->flags & SD_SHARE_CPUCAPACITY)) - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); + /* Ensure that the whole local core is idle, if applicable. */ + if (!sched_use_asym_prio(env->sd, env->dst_cpu)) + return false; + + /* + * CPU priorities does not make sense for SMT cores with more than one + * busy sibling. + */ + if (group->flags & SD_SHARE_CPUCAPACITY) { + if (sgs->group_weight - sgs->idle_cpus != 1) + return false; + } return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } @@ -9610,10 +9650,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we - * select the 1st one. + * select the 1st one, except if @sg is composed of SMT + * siblings. */ - if (sgs->avg_load <= busiest->avg_load) + + if (sgs->avg_load < busiest->avg_load) return false; + + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. + * If @sg happens to also be SMT, either choice is good. + */ + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) + return false; + } + break; case group_has_spare: @@ -10088,7 +10140,6 @@ static void update_idle_cpu_scan(struct lb_env *env, static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; @@ -10129,8 +10180,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + /* + * Indicate that the child domain of the busiest group prefers tasks + * go to a child's sibling domains first. 
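The rewritten sched_asym() above reduces the old SMT case analysis to three ordered checks. The sketch below only mirrors that decision order as described in its comment; the types and priority encoding are assumptions for the example.

#include <stdbool.h>
#include <stdio.h>

struct group_model {
	bool is_smt;			/* SD_SHARE_CPUCAPACITY set on the group */
	unsigned int weight;
	unsigned int idle_cpus;
	int preferred_prio;		/* priority of the group's preferred CPU */
};

static bool sched_asym_model(bool dst_core_usable, int dst_prio,
			     const struct group_model *group)
{
	/* the whole local core must be idle (or we balance inside a core) */
	if (!dst_core_usable)
		return false;

	/* an SMT group only gets help when exactly one sibling is busy */
	if (group->is_smt && group->weight - group->idle_cpus != 1)
		return false;

	/* finally, follow CPU priority */
	return dst_prio > group->preferred_prio;
}

int main(void)
{
	struct group_model smt2 = { .is_smt = true, .weight = 2,
				    .idle_cpus = 0, .preferred_prio = 10 };

	printf("%d\n", sched_asym_model(true, 20, &smt2));	/* 0: two busy siblings */
	smt2.idle_cpus = 1;
	printf("%d\n", sched_asym_model(true, 20, &smt2));	/* 1: one busy sibling */
	return 0;
}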
NB the flags of a sched group + * are those of the child domain. + */ + if (sds->busiest) + sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); if (env->sd->flags & SD_NUMA) @@ -10440,7 +10496,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. + */ if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; @@ -10542,8 +10601,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; - /* Make sure we only pull tasks from a CPU of lower priority */ + /* + * Make sure we only pull tasks from a CPU of lower priority + * when balancing between SMT siblings. + * + * If balancing between cores, let lower priority CPUs help + * SMT cores with more than one busy sibling. + */ if ((env->sd->flags & SD_ASYM_PACKING) && + sched_use_asym_prio(env->sd, i) && sched_asym_prefer(i, env->dst_cpu) && nr_running == 1) continue; @@ -10581,7 +10647,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util_cfs(i); + util = cpu_util_cfs_boost(i); /* * Don't try to pull utilization from a CPU with one @@ -10632,12 +10698,19 @@ static inline bool asym_active_balance(struct lb_env *env) { /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + * ASYM_PACKING needs to force migrate tasks from busy but lower + * priority CPUs in order to pack all tasks in the highest priority + * CPUs. When done between cores, do it only if the whole core if the + * whole core is idle. + * + * If @env::src_cpu is an SMT core with busy siblings, let + * the lower priority @env::dst_cpu help it. Do not follow + * CPU priority. */ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); + sched_use_asym_prio(env->sd, env->dst_cpu) && + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || + !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool @@ -10744,7 +10817,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), + .dst_grpmask = group_balance_mask(sd->groups), .idle = idle, .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, @@ -11371,9 +11444,13 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. + * + * When balancing betwen cores, all the SMT siblings of the + * preferred CPU must be idle. 
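The new asym_active_balance() condition above can be read as: an idle destination whose core is fully idle may force-migrate either when it has higher priority than the source, or when the source is an SMT core that still has busy siblings. A stand-alone restatement of that predicate, with all names invented for the example:

#include <stdbool.h>
#include <stdio.h>

static bool asym_active_balance_model(bool dst_is_idle, bool asym_packing,
				      bool dst_core_usable, bool src_core_usable,
				      int dst_prio, int src_prio)
{
	if (!dst_is_idle || !asym_packing)
		return false;

	if (!dst_core_usable)		/* destination core has busy siblings */
		return false;

	/* higher priority wins, or help an SMT source core that is not idle */
	return dst_prio > src_prio || !src_core_usable;
}

int main(void)
{
	/* low-priority but fully idle core helping a busy SMT source core */
	printf("%d\n", asym_active_balance_model(true, true, true, false, 1, 5));	/* 1 */
	/* classic asym_packing: pull towards the higher-priority CPU */
	printf("%d\n", asym_active_balance_model(true, true, true, true, 7, 5));	/* 1 */
	return 0;
}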
*/ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { + if (sched_use_asym_prio(sd, i) && + sched_asym_prefer(i, cpu)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e072f6b31bf3..81fca77397f6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -160,7 +160,6 @@ __setup("psi=", setup_psi); #define EXP_300s 2034 /* 1/exp(2s/300s) */ /* PSI trigger definitions */ -#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ @@ -1305,8 +1304,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us < WINDOW_MIN_US || - window_us > WINDOW_MAX_US) + if (window_us == 0 || window_us > WINDOW_MAX_US) return ERR_PTR(-EINVAL); /* @@ -1409,11 +1407,16 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_nr_triggers[t->state]--; if (!group->rtpoll_nr_triggers[t->state]) group->rtpoll_states &= ~(1 << t->state); - /* reset min update period for the remaining triggers */ - list_for_each_entry(tmp, &group->rtpoll_triggers, node) - period = min(period, div_u64(tmp->win.size, - UPDATES_PER_WINDOW)); - group->rtpoll_min_period = period; + /* + * Reset min update period for the remaining triggers + * iff the destroying trigger had the min window size. + */ + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + } /* Destroy rtpoll_task when the last trigger is destroyed */ if (group->rtpoll_states == 0) { group->rtpoll_until = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7b3e0a2b20..50d4b61aef3a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -286,12 +286,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; -}; - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -753,6 +747,12 @@ struct dl_rq { u64 this_bw; u64 extra_bw; + /* + * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM + * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB). + */ + u64 max_bw; + /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. @@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) rq->clock_update_flags &= ~RQCF_REQ_SKIP; } +/* + * During cpu offlining and rq wide unthrottling, we can trigger + * an update_rq_clock() for several cfs and rt runqueues (Typically + * when using list_for_each_entry_*) + * rq_clock_start_loop_update() can be called after updating the clock + * once and before iterating over the list to prevent multiple update. + * After the iterative traversal, we need to call rq_clock_stop_loop_update() + * to clear RQCF_ACT_SKIP of rq->clock_update_flags. 
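The psi_trigger_destroy() change above avoids rescanning the trigger list unless the destroyed trigger was the one defining rtpoll_min_period. A toy model of that maintenance, using a fixed-size array in place of the kernel list:

#include <stdio.h>

#define UPDATES_PER_WINDOW 10

struct psi_group_model {
	unsigned long long win_size[8];	/* window sizes of remaining triggers, in us */
	int nr;
	unsigned long long min_period;
};

static void trigger_destroy_model(struct psi_group_model *g,
				  unsigned long long destroyed_win)
{
	if (g->min_period != destroyed_win / UPDATES_PER_WINDOW)
		return;				/* destroyed trigger did not define the min */

	unsigned long long period = ~0ULL;

	for (int i = 0; i < g->nr; i++) {
		unsigned long long p = g->win_size[i] / UPDATES_PER_WINDOW;
		if (p < period)
			period = p;
	}
	g->min_period = period;
}

int main(void)
{
	struct psi_group_model g = {
		.win_size = { 2000000, 5000000 },	/* 2s and 5s windows remain */
		.nr = 2,
		.min_period = 100000,			/* was set by a destroyed 1s window */
	};

	trigger_destroy_model(&g, 1000000);
	printf("min period: %llu us\n", g.min_period);	/* 200000 */
	return 0;
}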
+ */ +static inline void rq_clock_start_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + rq->clock_update_flags |= RQCF_ACT_SKIP; +} + +static inline void rq_clock_stop_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_ACT_SKIP; +} + struct rq_flags { unsigned long flags; struct pin_cookie cookie; @@ -1772,6 +1794,13 @@ queue_balance_callback(struct rq *rq, for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) +/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | +static const unsigned int SD_SHARED_CHILD_MASK = +#include +0; +#undef SD_FLAG + /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1779,16 +1808,25 @@ queue_balance_callback(struct rq *rq, * @flag: The flag to check for the highest sched_domain * for the given CPU. * - * Returns the highest sched_domain of a CPU which contains the given flag. + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { struct sched_domain *sd, *hsd = NULL; for_each_domain(cpu, sd) { - if (!(sd->flags & flag)) + if (sd->flags & flag) { + hsd = sd; + continue; + } + + /* + * Stop the search if @flag is known to be shared at lower + * levels. It will not be found further up. + */ + if (flag & SD_SHARED_CHILD_MASK) break; - hsd = sd; } return hsd; @@ -2378,7 +2416,6 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); @@ -2946,53 +2983,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } -/** - * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. - * @cpu: the CPU to get the utilization for. - * - * The unit of the return value must be the same as the one of CPU capacity - * so that CPU utilization can be compared with CPU capacity. - * - * CPU utilization is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on that CPU. - * It represents the amount of CPU capacity currently used by CFS tasks in - * the range [0..max CPU capacity] with max CPU capacity being the CPU - * capacity at f_max. - * - * The estimated CPU utilization is defined as the maximum between CPU - * utilization and sum of the estimated utilization of the currently - * runnable tasks on that CPU. It preserves a utilization "snapshot" of - * previously-executed tasks, which helps better deduce how busy a CPU will - * be when a long-sleeping task wakes up. The contribution to CPU utilization - * of such a task would be significantly decayed at this point of time. - * - * CPU utilization can be higher than the current CPU capacity - * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because - * of rounding errors as well as task migrations or wakeups of new tasks. 
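The highest_flag_domain() change above stops the bottom-up search early for flags carrying SDF_SHARED_CHILD, since such a flag cannot reappear further up once missing. A small model of that search, with domains as a bottom-up array and invented flag values:

#include <stdio.h>

#define SD_FLAG_A		0x1	/* assume: has the SDF_SHARED_CHILD metaflag */
#define SD_FLAG_B		0x2	/* assume: does not */
#define SD_SHARED_CHILD_MASK	SD_FLAG_A

struct sd_model {
	unsigned int flags;
};

static int highest_flag_domain_model(const struct sd_model *sd, int levels,
				     unsigned int flag)
{
	int hsd = -1;

	for (int i = 0; i < levels; i++) {
		if (sd[i].flags & flag) {
			hsd = i;
			continue;
		}
		/* a shared-child flag cannot reappear further up */
		if (flag & SD_SHARED_CHILD_MASK)
			break;
	}
	return hsd;
}

int main(void)
{
	struct sd_model domains[] = {
		{ SD_FLAG_A | SD_FLAG_B },	/* SMT-like level  */
		{ SD_FLAG_A },			/* MC-like level   */
		{ 0 },				/* DIE-like level  */
		{ SD_FLAG_B },			/* NUMA-like level */
	};

	printf("A: %d\n", highest_flag_domain_model(domains, 4, SD_FLAG_A));	/* 1 */
	printf("B: %d\n", highest_flag_domain_model(domains, 4, SD_FLAG_B));	/* 3 */
	return 0;
}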
- * CPU utilization has to be capped to fit into the [0..max CPU capacity] - * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) - * could be seen as over-utilized even though CPU1 has 20% of spare CPU - * capacity. CPU utilization is allowed to overshoot current CPU capacity - * though since this is useful for predicting the CPU capacity required - * after task migrations (scheduler-driven DVFS). - * - * Return: (Estimated) utilization for the specified CPU. - */ -static inline unsigned long cpu_util_cfs(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned long util; - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(cfs_rq->avg.util_est.enqueued)); - } - - return min(util, capacity_orig_of(cpu)); -} +extern unsigned long cpu_util_cfs(int cpu); +extern unsigned long cpu_util_cfs_boost(int cpu); static inline unsigned long cpu_util_rt(struct rq *rq) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6682535e37c8..d3a3b2646ec4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu) void rq_attach_root(struct rq *rq, struct root_domain *rd) { struct root_domain *old_rd = NULL; - unsigned long flags; + struct rq_flags rf; - raw_spin_rq_lock_irqsave(rq, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { old_rd = rq->rd; @@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_rq_unlock_irqrestore(rq, flags); + rq_unlock_irqrestore(rq, &rf); if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain); @@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; - if (parent->parent) + + if (parent->parent) { parent->parent->child = tmp; + if (tmp->flags & SD_SHARE_CPUCAPACITY) + parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY; + } + /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this @@ -1676,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -void set_sched_topology(struct sched_domain_topology_level *tl) +void __init set_sched_topology(struct sched_domain_topology_level *tl) { if (WARN_ON_ONCE(sched_smp_initialized)) return; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 133b74730738..48c53e4739ea 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i } EXPORT_SYMBOL(autoremove_wake_function); -static inline bool is_kthread_should_stop(void) -{ - return (current->flags & PF_KTHREAD) && kthread_should_stop(); -} - /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * @@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) * or woken_wake_function() sees our store to current->state. 
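The cpu_attach_domain() hunk above splices out a degenerate parent level and, if the surviving child shares CPU capacity, marks the grandparent's groups accordingly so later SMT checks on group flags still hold. A heavily reduced illustration of that splice, with invented structures:

#include <stdio.h>

#define SD_SHARE_CPUCAPACITY 0x1

struct group_model { unsigned int flags; };

struct domain_model {
	unsigned int flags;
	struct domain_model *parent;
	struct domain_model *child;
	struct group_model *groups;
};

static void splice_degenerate_parent(struct domain_model *tmp)
{
	struct domain_model *parent = tmp->parent;

	tmp->parent = parent->parent;

	if (parent->parent) {
		parent->parent->child = tmp;
		if (tmp->flags & SD_SHARE_CPUCAPACITY)
			parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
	}
}

int main(void)
{
	struct group_model die_groups = { 0 };
	struct domain_model die = { .groups = &die_groups };
	struct domain_model mc  = { .parent = &die };
	struct domain_model smt = { .flags = SD_SHARE_CPUCAPACITY, .parent = &mc };

	mc.child = &smt;
	die.child = &mc;

	splice_degenerate_parent(&smt);		/* the MC-like level degenerates */
	printf("die groups SMT flag: %u\n", die_groups.flags & SD_SHARE_CPUCAPACITY);	/* 1 */
	return 0;
}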
*/ set_current_state(mode); /* A */ - if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8464c5acc913..68d6c1190ac7 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = { .actual_read_sched_clock = jiffy_sched_clock_read, }; -static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } @@ -77,26 +77,36 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) notrace int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_latch_retry(&cd.seq, seq); + return raw_read_seqcount_latch_retry(&cd.seq, seq); } -unsigned long long notrace sched_clock(void) +unsigned long long noinstr sched_clock_noinstr(void) { - u64 cyc, res; - unsigned int seq; struct clock_read_data *rd; + unsigned int seq; + u64 cyc, res; do { - rd = sched_clock_read_begin(&seq); + seq = raw_read_seqcount_latch(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (sched_clock_read_retry(seq)); + } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); return res; } +unsigned long long notrace sched_clock(void) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = sched_clock_noinstr(); + preempt_enable_notrace(); + return ns; +} + /* * Updating the data required to read the clock. * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 09d594900ee0..266d02809dbb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) *mono = basem + delta;
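The noinstr sched_clock path above relies on the seqcount-latch read pattern: pick read_data[seq & 1] and retry if the sequence moved. Below is a stand-alone, single-threaded illustration of that pattern only; it is not the kernel implementation and provides no real memory-ordering guarantees by itself.

#include <stdio.h>

struct clock_data_model {
	unsigned long long epoch_ns;
	unsigned long long epoch_cyc;
};

static unsigned int seq;
static struct clock_data_model read_data[2];

static void writer_update(unsigned long long ns, unsigned long long cyc)
{
	seq++;				/* odd: readers use the old copy in [1] */
	read_data[0] = (struct clock_data_model){ ns, cyc };
	seq++;				/* even: readers now see the new [0] */
	read_data[1] = read_data[0];
}

static unsigned long long reader_ns(unsigned long long now_cyc)
{
	unsigned int s;
	struct clock_data_model rd;

	do {
		s = seq;
		rd = read_data[s & 1];
	} while (s != seq);		/* retry if the writer interleaved */

	return rd.epoch_ns + (now_cyc - rd.epoch_cyc);
}

int main(void)
{
	writer_update(1000, 10);
	printf("%llu\n", reader_ns(25));	/* 1015 */
	return 0;
}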