Scheduler changes for v6.5:

- Scheduler SMP load-balancer improvements:
 
     - Avoid unnecessary migrations within SMT domains on hybrid systems.
 
       Problem:
 
         On hybrid CPU systems, (processors with a mixture of higher-frequency
 	SMT cores and lower-frequency non-SMT cores), under the old code
 	lower-priority CPUs pulled tasks from the higher-priority cores if
 	more than one SMT sibling was busy - resulting in many unnecessary
 	task migrations.
 
       Solution:
 
         The new code improves the load balancer to recognize SMT cores with more
         than one busy sibling and allows lower-priority CPUs to pull tasks, which
         avoids superfluous migrations and lets lower-priority cores inspect all SMT
         siblings for the busiest queue.
 
     - Implement the 'runnable boosting' feature in the EAS balancer: consider CPU
       contention in frequency, EAS max util & load-balance busiest CPU selection.
 
       This improves CPU utilization for certain workloads, while leaving other key
       workloads unchanged.
 
 - Scheduler infrastructure improvements:
 
     - Rewrite the scheduler topology setup code by consolidating it
       into the build_sched_topology() helper function and building
       it dynamically on the fly.
 
     - Resolve the local_clock() vs. noinstr complications by rewriting
       the code: provide separate sched_clock_noinstr() and
       local_clock_noinstr() functions to be used in instrumentation code,
       and make sure it is all instrumentation-safe.
 
 - Fixes:
 
     - Fix a kthread_park() race with wait_woken()
 
     - Fix misc wait_task_inactive() bugs unearthed by the -rt merge:
        - Fix UP PREEMPT bug by unifying the SMP and UP implementations.
        - Fix task_struct::saved_state handling.
 
     - Fix various rq clock update bugs, unearthed by turning on the rq clock
       debugging code.
 
     - Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger by
       creating enough cgroups, by removing the warning and restricting
       window size triggers to PSI file write-permission or CAP_SYS_RESOURCE.
 
     - Propagate SMT flags in the topology when removing degenerate domain
 
     - Fix grub_reclaim() calculation bug in the deadline scheduler code
 
     - Avoid resetting the min update period when it is unnecessary, in
       psi_trigger_destroy().
 
     - Don't balance a task to its current running CPU in load_balance(),
       which was possible on certain NUMA topologies with overlapping
       groups.
 
     - Fix the sched-debug printing of rq->nr_uninterruptible
 
 - Cleanups:
 
     - Address various -Wmissing-prototypes warnings, as a preparation
       to (maybe) enable this warning in the future.
 
     - Remove unused code
 
     - Mark more functions __init
 
     - Fix shadow-variable warnings
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmSatWQRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1j62xAAuGOx1LcDfRGC6WGQzp1zOdlsVQtnDvlS
 qL58zYSHgizprpVQ3j87SBaG4CHCdvd2Bo36yW0lNZS4nd203qdq7fkrMb3hPP/w
 egUQUzMegf5fF6BWldKeMjuHSt+twFQz/ZAKK8iSbAir6CHNAqbNst1oL0i/+Tyk
 o33hBs1hT5tnbFb1NSVZkX4k+qT3LzTW4K2QgjjGtkScr6yHh2BdEVefyigWOjdo
 9s02d00ll9a2r+F5txlN7Dnw6TN7rmTXGMOJU5bZvBE90/anNiAorMXHJdEKCyUR
 u9+JtBdJWiCplGa/tSRcxT16ZW1VdtTnd9q66TDhXREd2UNDFqBEyg5Wl77K4Tlf
 vKFajmj/to+cTbuv6m6TVR+zyXpdEpdL6F04P44U3qiJvDobBqeDNKHHIqpmbHXl
 AXUXcPWTVAzXX1Ce5M+BeAgTBQ1T7C5tELILrTNQHJvO1s9VVBRFZ/l65Ps4vu7T
 wIZ781IFuopk0zWqHovNvgKrJ7oFmOQQZFttQEe8n6nafkjI7u+IZ8FayiGaUMRr
 4GawFGUCEdYh8z9qyslGKe8Q/Rphfk6hxMFRYUJpDmubQ0PkMeDjDGq77jDGl1PF
 VqwSDEyOaBJs7Gqf/mem00JtzBmXhkhm1SEjggHMI2IQbr/eeBXoLQOn3CDapO/N
 PiDbtX760ic=
 =EWQA
 -----END PGP SIGNATURE-----

Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "Scheduler SMP load-balancer improvements:

   - Avoid unnecessary migrations within SMT domains on hybrid systems.

     Problem:

        On hybrid CPU systems, (processors with a mixture of
        higher-frequency SMT cores and lower-frequency non-SMT cores),
        under the old code lower-priority CPUs pulled tasks from the
        higher-priority cores if more than one SMT sibling was busy -
        resulting in many unnecessary task migrations.

     Solution:

        The new code improves the load balancer to recognize SMT cores
        with more than one busy sibling and allows lower-priority CPUs
        to pull tasks, which avoids superfluous migrations and lets
        lower-priority cores inspect all SMT siblings for the busiest
        queue.

   - Implement the 'runnable boosting' feature in the EAS balancer:
     consider CPU contention in frequency, EAS max util & load-balance
     busiest CPU selection.

     This improves CPU utilization for certain workloads, while leaving
     other key workloads unchanged.

  Scheduler infrastructure improvements:

   - Rewrite the scheduler topology setup code by consolidating it into
     the build_sched_topology() helper function and building it
     dynamically on the fly.

   - Resolve the local_clock() vs. noinstr complications by rewriting
     the code: provide separate sched_clock_noinstr() and
     local_clock_noinstr() functions to be used in instrumentation code,
     and make sure it is all instrumentation-safe.

  Fixes:

   - Fix a kthread_park() race with wait_woken()

   - Fix misc wait_task_inactive() bugs unearthed by the -rt merge:
       - Fix UP PREEMPT bug by unifying the SMP and UP implementations
       - Fix task_struct::saved_state handling

   - Fix various rq clock update bugs, unearthed by turning on the rq
     clock debugging code.

   - Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger
     by creating enough cgroups, by removing the warning and restricting
     window size triggers to PSI file write-permission or
     CAP_SYS_RESOURCE.

   - Propagate SMT flags in the topology when removing degenerate domain

   - Fix grub_reclaim() calculation bug in the deadline scheduler code

   - Avoid resetting the min update period when it is unnecessary, in
     psi_trigger_destroy().

   - Don't balance a task to its current running CPU in load_balance(),
     which was possible on certain NUMA topologies with overlapping
     groups.

   - Fix the sched-debug printing of rq->nr_uninterruptible

  Cleanups:

   - Address various -Wmissing-prototypes warnings, as a preparation to
     (maybe) enable this warning in the future.

   - Remove unused code

   - Mark more functions __init

   - Fix shadow-variable warnings"

* tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
  sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()
  sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop()
  sched/core: Fixed missing rq clock update before calling set_rq_offline()
  sched/deadline: Update GRUB description in the documentation
  sched/deadline: Fix bandwidth reclaim equation in GRUB
  sched/wait: Fix a kthread_park race with wait_woken()
  sched/topology: Mark set_sched_topology() __init
  sched/fair: Rename variable cpu_util eff_util
  arm64/arch_timer: Fix MMIO byteswap
  sched/fair, cpufreq: Introduce 'runnable boosting'
  sched/fair: Refactor CPU utilization functions
  cpuidle: Use local_clock_noinstr()
  sched/clock: Provide local_clock_noinstr()
  x86/tsc: Provide sched_clock_noinstr()
  clocksource: hyper-v: Provide noinstr sched_clock()
  clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special casing U64_MAX
  x86/vdso: Fix gettimeofday masking
  math64: Always inline u128 version of mul_u64_u64_shr()
  s390/time: Provide sched_clock_noinstr()
  loongarch: Provide noinstr sched_clock_read()
  ...
This commit is contained in:
Linus Torvalds 2023-06-27 14:03:21 -07:00
commit ed3b7923a8
43 changed files with 776 additions and 567 deletions

View file

@ -203,12 +203,15 @@ Deadline Task Scheduling
- Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
runqueue, including the tasks in Inactive state.
- Maximum usable bandwidth (max_bw): This is the maximum bandwidth usable by
deadline tasks and is currently set to the RT capacity.
The algorithm reclaims the bandwidth of the tasks in Inactive state.
It does so by decrementing the runtime of the executing task Ti at a pace equal
to
dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt
dq = -(max{ Ui, (Umax - Uinact - Uextra) } / Umax) dt
where:

View file

@ -88,13 +88,7 @@ static inline notrace u64 arch_timer_read_cntvct_el0(void)
#define arch_timer_reg_read_stable(reg) \
({ \
u64 _val; \
\
preempt_disable_notrace(); \
_val = erratum_handler(read_ ## reg)(); \
preempt_enable_notrace(); \
\
_val; \
erratum_handler(read_ ## reg)(); \
})
/*

View file

@ -22,13 +22,13 @@
* Generic IO read/write. These perform native-endian accesses.
*/
#define __raw_writeb __raw_writeb
static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr)
{
asm volatile("strb %w0, [%1]" : : "rZ" (val), "r" (addr));
}
#define __raw_writew __raw_writew
static inline void __raw_writew(u16 val, volatile void __iomem *addr)
static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr)
{
asm volatile("strh %w0, [%1]" : : "rZ" (val), "r" (addr));
}
@ -40,13 +40,13 @@ static __always_inline void __raw_writel(u32 val, volatile void __iomem *addr)
}
#define __raw_writeq __raw_writeq
static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
{
asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
}
#define __raw_readb __raw_readb
static inline u8 __raw_readb(const volatile void __iomem *addr)
static __always_inline u8 __raw_readb(const volatile void __iomem *addr)
{
u8 val;
asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
@ -57,7 +57,7 @@ static inline u8 __raw_readb(const volatile void __iomem *addr)
}
#define __raw_readw __raw_readw
static inline u16 __raw_readw(const volatile void __iomem *addr)
static __always_inline u16 __raw_readw(const volatile void __iomem *addr)
{
u16 val;
@ -80,7 +80,7 @@ static __always_inline u32 __raw_readl(const volatile void __iomem *addr)
}
#define __raw_readq __raw_readq
static inline u64 __raw_readq(const volatile void __iomem *addr)
static __always_inline u64 __raw_readq(const volatile void __iomem *addr)
{
u64 val;
asm volatile(ALTERNATIVE("ldr %0, [%1]",

View file

@ -1167,7 +1167,7 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
#ifndef __ASSEMBLY__
static inline u64 drdtime(void)
static __always_inline u64 drdtime(void)
{
int rID = 0;
u64 val = 0;

View file

@ -190,9 +190,9 @@ static u64 read_const_counter(struct clocksource *clk)
return drdtime();
}
static u64 native_sched_clock(void)
static noinstr u64 sched_clock_read(void)
{
return read_const_counter(NULL);
return drdtime();
}
static struct clocksource clocksource_const = {
@ -211,7 +211,7 @@ int __init constant_clocksource_init(void)
res = clocksource_register_hz(&clocksource_const, freq);
sched_clock_register(native_sched_clock, 64, freq);
sched_clock_register(sched_clock_read, 64, freq);
pr_info("Constant clock source device register\n");

View file

@ -63,7 +63,7 @@ static inline int store_tod_clock_ext_cc(union tod_clock *clk)
return cc;
}
static inline void store_tod_clock_ext(union tod_clock *tod)
static __always_inline void store_tod_clock_ext(union tod_clock *tod)
{
asm volatile("stcke %0" : "=Q" (*tod) : : "cc");
}
@ -177,7 +177,7 @@ static inline void local_tick_enable(unsigned long comp)
typedef unsigned long cycles_t;
static inline unsigned long get_tod_clock(void)
static __always_inline unsigned long get_tod_clock(void)
{
union tod_clock clk;
@ -204,6 +204,11 @@ void init_cpu_timer(void);
extern union tod_clock tod_clock_base;
static __always_inline unsigned long __get_tod_clock_monotonic(void)
{
return get_tod_clock() - tod_clock_base.tod;
}
/**
* get_tod_clock_monotonic - returns current time in clock rate units
*
@ -216,7 +221,7 @@ static inline unsigned long get_tod_clock_monotonic(void)
unsigned long tod;
preempt_disable_notrace();
tod = get_tod_clock() - tod_clock_base.tod;
tod = __get_tod_clock_monotonic();
preempt_enable_notrace();
return tod;
}
@ -240,7 +245,7 @@ static inline unsigned long get_tod_clock_monotonic(void)
* -> ns = (th * 125) + ((tl * 125) >> 9);
*
*/
static inline unsigned long tod_to_ns(unsigned long todval)
static __always_inline unsigned long tod_to_ns(unsigned long todval)
{
return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
}

View file

@ -102,6 +102,11 @@ void __init time_early_init(void)
((long) qui.old_leap * 4096000000L);
}
unsigned long long noinstr sched_clock_noinstr(void)
{
return tod_to_ns(__get_tod_clock_monotonic());
}
/*
* Scheduler clock - returns current time in nanosec units.
*/

View file

@ -257,6 +257,11 @@ void hv_set_register(unsigned int reg, u64 value);
u64 hv_get_non_nested_register(unsigned int reg);
void hv_set_non_nested_register(unsigned int reg, u64 value);
static __always_inline u64 hv_raw_get_register(unsigned int reg)
{
return __rdmsr(reg);
}
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}

View file

@ -231,14 +231,19 @@ static u64 vread_pvclock(void)
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
return ret;
return ret & S64_MAX;
}
#endif
#ifdef CONFIG_HYPERV_TIMER
static u64 vread_hvclock(void)
{
return hv_read_tsc_page(&hvclock_page);
u64 tsc, time;
if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
return time & S64_MAX;
return U64_MAX;
}
#endif
@ -246,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode,
const struct vdso_data *vd)
{
if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
return (u64)rdtsc_ordered();
return (u64)rdtsc_ordered() & S64_MAX;
/*
* For any memory-mapped vclock type, we need to make sure that gcc
* doesn't cleverly hoist a load before the mode check. Otherwise we
@ -284,6 +289,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
* which can be invalidated asynchronously and indicate invalidation by
* returning U64_MAX, which can be effectively tested by checking for a
* negative value after casting it to s64.
*
* This effectively forces a S64_MAX mask on the calculations, unlike the
* U64_MAX mask normally used by x86 clocksources.
*/
static inline bool arch_vdso_cycles_ok(u64 cycles)
{
@ -303,18 +311,29 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* @last. If not then use @last, which is the base time of the current
* conversion period.
*
* This variant also removes the masking of the subtraction because the
* clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
* which would result in a pointless operation. The compiler cannot
* optimize it away as the mask comes from the vdso data and is not compile
* time constant.
* This variant also uses a custom mask because while the clocksource mask of
* all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
* U64_MAX as an exception value, additionally arch_vdso_cycles_ok() above
* declares everything with the MSB/Sign-bit set as invalid. Therefore the
* effective mask is S64_MAX.
*/
static __always_inline
u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
if (cycles > last)
return (cycles - last) * mult;
return 0;
/*
* Due to the MSB/Sign-bit being used as invalid marker (see
* arch_vdso_cycles_ok() above), the effective mask is S64_MAX.
*/
u64 delta = (cycles - last) & S64_MAX;
/*
* Due to the above mentioned TSC wobbles, filter out negative motion.
* Per the above masking, the effective sign bit is now bit 62.
*/
if (unlikely(delta & (1ULL << 62)))
return 0;
return delta * mult;
}
#define vdso_calc_delta vdso_calc_delta

View file

@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu)
/**
* sched_set_itmt_core_prio() - Set CPU priority based on ITMT
* @prio: Priority of cpu core
* @core_cpu: The cpu number associated with the core
* @prio: Priority of @cpu
* @cpu: The CPU number
*
* The pstate driver will find out the max boost frequency
* and call this function to set a priority proportional
* to the max boost frequency. CPU with higher boost
* to the max boost frequency. CPUs with higher boost
* frequency will receive higher priority.
*
* No need to rebuild sched domain after updating
* the CPU priorities. The sched domains have no
* dependency on CPU priorities.
*/
void sched_set_itmt_core_prio(int prio, int core_cpu)
void sched_set_itmt_core_prio(int prio, int cpu)
{
int cpu, i = 1;
for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
int smt_prio;
/*
* Ensure that the siblings are moved to the end
* of the priority chain and only used when
* all other high priority cpus are out of capacity.
*/
smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
}
per_cpu(sched_core_priority, cpu) = prio;
}

View file

@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now)
return -ENODEV;
}
static noinstr u64 kvm_clock_read(void)
static u64 kvm_clock_read(void)
{
u64 ret;
@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
static noinstr u64 kvm_sched_clock_read(void)
{
return kvm_clock_read() - kvm_sched_clock_offset;
return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
}
static inline void kvm_sched_clock_init(bool stable)

View file

@ -602,7 +602,7 @@ static int x86_core_flags(void)
#ifdef CONFIG_SCHED_SMT
static int x86_smt_flags(void)
{
return cpu_smt_flags() | x86_sched_itmt_flags();
return cpu_smt_flags();
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
@ -613,44 +613,6 @@ static int x86_cluster_flags(void)
#endif
#endif
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_CLUSTER
{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ NULL, },
};
static struct sched_domain_topology_level x86_hybrid_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_CLUSTER
{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
/*
* Set if a package/die has multiple NUMA nodes inside.
* AMD Magny-Cours, Intel Cluster-on-Die, and Intel
@ -658,6 +620,51 @@ static struct sched_domain_topology_level x86_topology[] = {
*/
static bool x86_has_numa_in_package;
static struct sched_domain_topology_level x86_topology[6];
static void __init build_sched_topology(void)
{
int i = 0;
#ifdef CONFIG_SCHED_SMT
x86_topology[i++] = (struct sched_domain_topology_level){
cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
};
#endif
#ifdef CONFIG_SCHED_CLUSTER
/*
* For now, skip the cluster domain on Hybrid.
*/
if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
x86_topology[i++] = (struct sched_domain_topology_level){
cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
};
}
#endif
#ifdef CONFIG_SCHED_MC
x86_topology[i++] = (struct sched_domain_topology_level){
cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
};
#endif
/*
* When there is NUMA topology inside the package skip the DIE domain
* since the NUMA domains will auto-magically create the right spanning
* domains based on the SLIT.
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
cpu_cpu_mask, SD_INIT_NAME(DIE)
};
}
/*
* There must be one trailing NULL entry left.
*/
BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
set_sched_topology(x86_topology);
}
void set_cpu_sibling_map(int cpu)
{
bool has_smt = smp_num_siblings > 1;
@ -1264,15 +1271,6 @@ void __init smp_prepare_cpus_common(void)
zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}
/*
* Set 'default' x86 topology, this matches default_topology() in that
* it has NUMA nodes as a topology level. See also
* native_smp_cpus_done().
*
* Must be done before set_cpus_sibling_map() is ran.
*/
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
}
@ -1393,13 +1391,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
pr_debug("Boot done\n");
calculate_max_logical_packages();
/* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
set_sched_topology(x86_hybrid_topology);
build_sched_topology();
nmi_selftest();
impress_friends();
cache_aps_init();

View file

@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf)
}
early_param("tsc_early_khz", tsc_early_khz_setup);
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
{
int seq, idx;
preempt_disable_notrace();
do {
seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
idx = seq & 1;
@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
preempt_disable_notrace();
__cyc2ns_read(data);
}
__always_inline void cyc2ns_read_end(void)
{
preempt_enable_notrace();
@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void)
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data data;
unsigned long long ns;
cyc2ns_read_begin(&data);
__cyc2ns_read(&data);
ns = data.cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
cyc2ns_read_end();
return ns;
}
static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
unsigned long long ns;
preempt_disable_notrace();
ns = __cycles_2_ns(cyc);
preempt_enable_notrace();
return ns;
}
@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void)
u64 tsc_now = rdtsc();
/* return the value in ns */
return cycles_2_ns(tsc_now);
return __cycles_2_ns(tsc_now);
}
/*
@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
noinstr u64 sched_clock(void)
noinstr u64 sched_clock_noinstr(void)
{
return paravirt_sched_clock();
}
@ -260,11 +271,20 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
bool using_native_sched_clock(void) { return true; }
#endif
notrace u64 sched_clock(void)
{
u64 now;
preempt_disable_notrace();
now = sched_clock_noinstr();
preempt_enable_notrace();
return now;
}
int check_tsc_unstable(void)
{
return tsc_unstable;

View file

@ -2799,14 +2799,13 @@ static u64 read_tsc(void)
static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
int *mode)
{
long v;
u64 tsc_pg_val;
long v;
switch (clock->vclock_mode) {
case VDSO_CLOCKMODE_HVCLOCK:
tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp);
if (tsc_pg_val != U64_MAX) {
if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp, &tsc_pg_val)) {
/* TSC page valid */
*mode = VDSO_CLOCKMODE_HVCLOCK;
v = (tsc_pg_val - clock->cycle_last) &

View file

@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void)
struct pvclock_vcpu_time_info *src;
u64 ret;
preempt_disable_notrace();
src = &__this_cpu_read(xen_vcpu)->time;
ret = pvclock_clocksource_read_nowd(src);
ret -= xen_sched_clock_offset;
preempt_enable_notrace();
return ret;
}

View file

@ -191,22 +191,40 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
return val;
}
static notrace u64 arch_counter_get_cntpct_stable(void)
static noinstr u64 raw_counter_get_cntpct_stable(void)
{
return __arch_counter_get_cntpct_stable();
}
static notrace u64 arch_counter_get_cntpct(void)
static notrace u64 arch_counter_get_cntpct_stable(void)
{
u64 val;
preempt_disable_notrace();
val = __arch_counter_get_cntpct_stable();
preempt_enable_notrace();
return val;
}
static noinstr u64 arch_counter_get_cntpct(void)
{
return __arch_counter_get_cntpct();
}
static notrace u64 arch_counter_get_cntvct_stable(void)
static noinstr u64 raw_counter_get_cntvct_stable(void)
{
return __arch_counter_get_cntvct_stable();
}
static notrace u64 arch_counter_get_cntvct(void)
static notrace u64 arch_counter_get_cntvct_stable(void)
{
u64 val;
preempt_disable_notrace();
val = __arch_counter_get_cntvct_stable();
preempt_enable_notrace();
return val;
}
static noinstr u64 arch_counter_get_cntvct(void)
{
return __arch_counter_get_cntvct();
}
@ -753,14 +771,14 @@ static int arch_timer_set_next_event_phys(unsigned long evt,
return 0;
}
static u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
{
u32 cnt_lo, cnt_hi, tmp_hi;
do {
cnt_hi = readl_relaxed(t->base + offset_lo + 4);
cnt_lo = readl_relaxed(t->base + offset_lo);
tmp_hi = readl_relaxed(t->base + offset_lo + 4);
cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo));
tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
} while (cnt_hi != tmp_hi);
return ((u64) cnt_hi << 32) | cnt_lo;
@ -1060,7 +1078,7 @@ bool arch_timer_evtstrm_available(void)
return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available);
}
static u64 arch_counter_get_cntvct_mem(void)
static noinstr u64 arch_counter_get_cntvct_mem(void)
{
return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO);
}
@ -1074,6 +1092,7 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
static void __init arch_counter_register(unsigned type)
{
u64 (*scr)(void);
u64 start_count;
int width;
@ -1083,21 +1102,28 @@ static void __init arch_counter_register(unsigned type)
if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) {
if (arch_timer_counter_has_wa())
if (arch_timer_counter_has_wa()) {
rd = arch_counter_get_cntvct_stable;
else
scr = raw_counter_get_cntvct_stable;
} else {
rd = arch_counter_get_cntvct;
scr = arch_counter_get_cntvct;
}
} else {
if (arch_timer_counter_has_wa())
if (arch_timer_counter_has_wa()) {
rd = arch_counter_get_cntpct_stable;
else
scr = raw_counter_get_cntpct_stable;
} else {
rd = arch_counter_get_cntpct;
scr = arch_counter_get_cntpct;
}
}
arch_timer_read_counter = rd;
clocksource_counter.vdso_clock_mode = vdso_default;
} else {
arch_timer_read_counter = arch_counter_get_cntvct_mem;
scr = arch_counter_get_cntvct_mem;
}
width = arch_counter_get_width();
@ -1113,7 +1139,7 @@ static void __init arch_counter_register(unsigned type)
timecounter_init(&arch_timer_kvm_info.timecounter,
&cyclecounter, start_count);
sched_clock_register(arch_timer_read_counter, width, arch_timer_rate);
sched_clock_register(scr, width, arch_timer_rate);
}
static void arch_timer_stop(struct clock_event_device *clk)

View file

@ -365,6 +365,20 @@ void hv_stimer_global_cleanup(void)
}
EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
static __always_inline u64 read_hv_clock_msr(void)
{
/*
* Read the partition counter to get the current tick count. This count
* is set to 0 when the partition is created and is incremented in 100
* nanosecond units.
*
* Use hv_raw_get_register() because this function is used from
* noinstr. Notable; while HV_REGISTER_TIME_REF_COUNT is a synthetic
* register it doesn't need the GHCB path.
*/
return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT);
}
/*
* Code and definitions for the Hyper-V clocksources. Two
* clocksources are defined: one that reads the Hyper-V defined MSR, and
@ -393,14 +407,20 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);
static u64 notrace read_hv_clock_tsc(void)
static __always_inline u64 read_hv_clock_tsc(void)
{
u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
u64 cur_tsc, time;
if (current_tick == U64_MAX)
current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
/*
* The Hyper-V Top-Level Function Spec (TLFS), section Timers,
* subsection Reference Counter, guarantees that the TSC and MSR
* times are in sync and monotonic. Therefore we can fall back
* to the MSR in case the TSC page indicates unavailability.
*/
if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
time = read_hv_clock_msr();
return current_tick;
return time;
}
static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
@ -408,7 +428,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
return read_hv_clock_tsc();
}
static u64 notrace read_hv_sched_clock_tsc(void)
static u64 noinstr read_hv_sched_clock_tsc(void)
{
return (read_hv_clock_tsc() - hv_sched_clock_offset) *
(NSEC_PER_SEC / HV_CLOCK_HZ);
@ -460,16 +480,6 @@ static struct clocksource hyperv_cs_tsc = {
#endif
};
static u64 notrace read_hv_clock_msr(void)
{
/*
* Read the partition counter to get the current tick count. This count
* is set to 0 when the partition is created and is incremented in
* 100 nanosecond units.
*/
return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
}
static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
{
return read_hv_clock_msr();

View file

@ -145,7 +145,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
instrumentation_begin();
time_start = ns_to_ktime(local_clock());
time_start = ns_to_ktime(local_clock_noinstr());
tick_freeze();
/*
@ -169,7 +169,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
tick_unfreeze();
start_critical_timings();
time_end = ns_to_ktime(local_clock());
time_end = ns_to_ktime(local_clock_noinstr());
dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
dev->states_usage[index].s2idle_usage++;
@ -243,7 +243,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
sched_idle_set_state(target_state);
trace_cpu_idle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
time_start = ns_to_ktime(local_clock_noinstr());
stop_critical_timings();
if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
@ -276,7 +276,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
start_critical_timings();
sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
time_end = ns_to_ktime(local_clock_noinstr());
trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
/* The cpu is no longer idle or about to enter idle. */

View file

@ -15,7 +15,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
{
u64 time_start;
time_start = local_clock();
time_start = local_clock_noinstr();
dev->poll_time_limit = false;
@ -32,7 +32,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
continue;
loop_count = 0;
if (local_clock() - time_start > limit) {
if (local_clock_noinstr() - time_start > limit) {
dev->poll_time_limit = true;
break;
}

View file

@ -38,8 +38,9 @@ extern void hv_remap_tsc_clocksource(void);
extern unsigned long hv_get_tsc_pfn(void);
extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
static inline notrace u64
hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
static __always_inline bool
hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
u64 *cur_tsc, u64 *time)
{
u64 scale, offset;
u32 sequence;
@ -63,7 +64,7 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
do {
sequence = READ_ONCE(tsc_pg->tsc_sequence);
if (!sequence)
return U64_MAX;
return false;
/*
* Make sure we read sequence before we read other values from
* TSC page.
@ -82,15 +83,8 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
}
static inline notrace u64
hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
{
u64 cur_tsc;
return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
*time = mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
return true;
}
#else /* CONFIG_HYPERV_TIMER */
@ -104,10 +98,10 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
return NULL;
}
static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
u64 *cur_tsc)
static __always_inline bool
hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time)
{
return U64_MAX;
return false;
}
static inline int hv_stimer_cleanup(unsigned int cpu) { return 0; }

View file

@ -89,6 +89,7 @@ int kthread_stop(struct task_struct *k);
bool kthread_should_stop(void);
bool kthread_should_park(void);
bool __kthread_should_park(struct task_struct *k);
bool kthread_should_stop_or_park(void);
bool kthread_freezable_should_stop(bool *was_frozen);
void *kthread_func(struct task_struct *k);
void *kthread_data(struct task_struct *k);

View file

@ -168,7 +168,7 @@ static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
#endif /* mul_u64_u32_shr */
#ifndef mul_u64_u64_shr
static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
{
return (u64)(((unsigned __int128)a * mul) >> shift);
}

View file

@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root,
do {
seq = raw_read_seqcount_latch(&root->seq);
node = __lt_find(key, root, seq & 1, ops->comp);
} while (read_seqcount_latch_retry(&root->seq, seq));
} while (raw_read_seqcount_latch_retry(&root->seq, seq));
return node;
}

View file

@ -2006,15 +2006,12 @@ static __always_inline void scheduler_ipi(void)
*/
preempt_fold_need_resched();
}
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
#else
static inline void scheduler_ipi(void) { }
static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
return 1;
}
#endif
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
/*
* Set thread flags in other task's structures.
* See asm/thread_info.h for TIF_xxxx flags available:

View file

@ -12,7 +12,16 @@
*
* Please use one of the three interfaces below.
*/
extern unsigned long long notrace sched_clock(void);
extern u64 sched_clock(void);
#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK)
extern u64 sched_clock_noinstr(void);
#else
/*
 * Fallback for configurations where neither CONFIG_ARCH_WANTS_NO_INSTR nor
 * CONFIG_GENERIC_SCHED_CLOCK is defined: no out-of-line sched_clock_noinstr()
 * is declared, so this inline version simply forwards to sched_clock().
 */
static __always_inline u64 sched_clock_noinstr(void)
{
return sched_clock();
}
#endif
/*
* See the comment in kernel/sched/clock.c
@ -45,6 +54,11 @@ static inline u64 cpu_clock(int cpu)
return sched_clock();
}
/*
 * Inline variant of local_clock_noinstr(): returns sched_clock_noinstr()
 * unchanged.
 * NOTE(review): the preprocessor condition that selects this inline variant
 * over the extern declaration is outside the visible hunk - confirm.
 */
static __always_inline u64 local_clock_noinstr(void)
{
return sched_clock_noinstr();
}
static __always_inline u64 local_clock(void)
{
return sched_clock();
@ -79,6 +93,7 @@ static inline u64 cpu_clock(int cpu)
return sched_clock_cpu(cpu);
}
extern u64 local_clock_noinstr(void);
extern u64 local_clock(void);
#endif

View file

@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Place busy tasks earlier in the domain
*
* SHARED_CHILD: Usually set on the SMT level. Technically could be set further
* up, but currently assumed to be set from the base domain
* upwards (see update_top_cache_domain()).
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
/*
* Prefer to place tasks in a sibling domain

View file

@ -203,7 +203,7 @@ struct sched_domain_topology_level {
#endif
};
extern void set_sched_topology(struct sched_domain_topology_level *tl);
extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(type) .name = #type

View file

@ -671,9 +671,9 @@ typedef struct {
*
* Return: sequence counter raw value. Use the lowest bit as an index for
* picking which data copy to read. The full counter must then be checked
* with read_seqcount_latch_retry().
* with raw_read_seqcount_latch_retry().
*/
static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
/*
* Pairs with the first smp_wmb() in raw_write_seqcount_latch().
@ -683,16 +683,17 @@ static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
}
/**
* read_seqcount_latch_retry() - end a seqcount_latch_t read section
* raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
* @s: Pointer to seqcount_latch_t
* @start: count, from raw_read_seqcount_latch()
*
* Return: true if a read section retry is required, else false
*/
static inline int
read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
static __always_inline int
raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
return read_seqcount_retry(&s->seqcount, start);
smp_rmb();
return unlikely(READ_ONCE(s->seqcount.sequence) != start);
}
/**
@ -752,7 +753,7 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
* entry = data_query(latch->data[idx], ...);
*
* // This includes needed smp_rmb()
* } while (read_seqcount_latch_retry(&latch->seq, seq));
* } while (raw_read_seqcount_latch_retry(&latch->seq, seq));
*
* return entry;
* }

View file

@ -3891,6 +3891,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
/*
 * Open handler for the cgroup pressure files.
 *
 * Opening for reading is always allowed. Opening for writing is refused
 * with -EPERM unless the caller has CAP_SYS_RESOURCE.
 *
 * Returns 0 on success, -EPERM otherwise.
 */
static int cgroup_pressure_open(struct kernfs_open_file *of)
{
	/* Not opened for writing: nothing to check. */
	if (!(of->file->f_mode & FMODE_WRITE))
		return 0;

	/* The capability is only consulted for writable opens. */
	if (capable(CAP_SYS_RESOURCE))
		return 0;

	return -EPERM;
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
@ -5290,6 +5298,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.open = cgroup_pressure_open,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@ -5298,6 +5307,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.open = cgroup_pressure_open,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@ -5306,6 +5316,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.open = cgroup_pressure_open,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@ -5315,6 +5326,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
.open = cgroup_pressure_open,
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,

View file

@ -182,6 +182,16 @@ bool kthread_should_park(void)
}
EXPORT_SYMBOL_GPL(kthread_should_park);
/*
 * kthread_should_stop_or_park - test both the stop and the park request
 * of the current task in one go.
 *
 * Returns true when %current maps to a kthread whose flags have the
 * KTHREAD_SHOULD_STOP or the KTHREAD_SHOULD_PARK bit set. Returns false
 * when __to_kthread(current) yields NULL or neither bit is set.
 */
bool kthread_should_stop_or_park(void)
{
	struct kthread *self = __to_kthread(current);

	if (self)
		return self->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));

	return false;
}
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen

View file

@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
seq = raw_read_seqcount_latch(&ls->latch);
idx = seq & 0x1;
val = ls->val[idx];
} while (read_seqcount_latch_retry(&ls->latch, seq));
} while (raw_read_seqcount_latch_retry(&ls->latch, seq));
return val;
}

View file

@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
s64 delta;
again:
now = sched_clock();
now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@ -293,22 +293,29 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
return clock;
}
noinstr u64 local_clock(void)
noinstr u64 local_clock_noinstr(void)
{
u64 clock;
if (static_branch_likely(&__sched_clock_stable))
return sched_clock() + __sched_clock_offset;
return sched_clock_noinstr() + __sched_clock_offset;
if (!static_branch_likely(&sched_clock_running))
return sched_clock();
return sched_clock_noinstr();
preempt_disable_notrace();
clock = sched_clock_local(this_scd());
preempt_enable_notrace();
return clock;
}
/*
 * local_clock - wrapper that calls local_clock_noinstr() with preemption
 * disabled.
 *
 * Preemption is disabled (using the notrace variants) around the
 * local_clock_noinstr() call and re-enabled before the sampled value is
 * returned to the caller.
 */
u64 local_clock(void)
{
u64 now;
preempt_disable_notrace();
now = local_clock_noinstr();
preempt_enable_notrace();
return now;
}
EXPORT_SYMBOL_GPL(local_clock);
static notrace u64 sched_clock_remote(struct sched_clock_data *scd)

View file

@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq);
}
/*
 * Lockless state test for @p.
 *
 * Returns:
 *    1 - p->__state has a bit in common with @state
 *   -1 - (CONFIG_PREEMPT_RT only) p->__state did not match, but
 *        p->saved_state has a bit in common with @state
 *    0 - no match
 *
 * p->saved_state is only read when p->__state did not match.
 */
static __always_inline
int __task_state_match(struct task_struct *p, unsigned int state)
{
	int match = 0;

	if (READ_ONCE(p->__state) & state)
		match = 1;
#ifdef CONFIG_PREEMPT_RT
	else if (READ_ONCE(p->saved_state) & state)
		match = -1;
#endif

	return match;
}
/*
 * Same result as __task_state_match(). On CONFIG_PREEMPT_RT the test is
 * performed with p->pi_lock held (interrupts disabled), otherwise it is
 * done without taking any lock.
 */
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
#ifndef CONFIG_PREEMPT_RT
	return __task_state_match(p, state);
#else
	int result;

	/*
	 * Serialize against current_save_and_set_rtlock_wait_state() and
	 * current_restore_rtlock_saved_state().
	 */
	raw_spin_lock_irq(&p->pi_lock);
	result = __task_state_match(p, state);
	raw_spin_unlock_irq(&p->pi_lock);

	return result;
#endif
}
/*
* wait_task_inactive - wait for a thread to unschedule.
*
* Wait for the thread to block in any of the states set in @match_state.
* If it changes, i.e. @p might have woken up, then return zero. When we
* succeed in waiting for @p to be off its CPU, we return a positive number
* (its total switch count). If a second call a short while later returns the
* same number, the caller can be sure that @p has remained unscheduled the
* whole time.
*
* The value returned on success is p->nvcsw with the most significant bit
* forced on, so it is never zero; zero is reserved for the "state no longer
* matches" case.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
int running, queued, match;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
for (;;) {
/*
* We do the initial early heuristics without holding
* any task-queue locks at all. We'll only try to get
* the runqueue lock when things look like they will
* work out!
*/
rq = task_rq(p);
/*
* If the task is actively running on another CPU
* still, just relax and busy-wait without holding
* any locks.
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
* But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
while (task_on_cpu(rq, p)) {
if (!task_state_match(p, match_state))
return 0;
cpu_relax();
}
/*
* Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
/* ncsw stays 0 unless the state still matches; 0 means "bail out". */
ncsw = 0;
/*
* NOTE(review): the lockless __task_state_match() is used here rather
* than task_state_match(); presumably task_rq_lock() already provides
* the required serialization - confirm.
*/
if ((match = __task_state_match(p, match_state))) {
/*
* When matching on p->saved_state, consider this task
* still queued so it will wait.
*/
if (match < 0)
queued = 1;
/*
* Force the MSB on so the result is non-zero even when
* p->nvcsw is 0.
*/
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
}
task_rq_unlock(rq, p, &rf);
/*
* If it changed from the expected state, bail out now.
*/
if (unlikely(!ncsw))
break;
/*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
* Oops. Go back and try again..
*/
if (unlikely(running)) {
cpu_relax();
continue;
}
/*
* It's not enough that it's not actively running,
* it must be off the runqueue _entirely_, and not
* preempted!
*
* So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
if (unlikely(queued)) {
/* Sleep for 1/HZ of a second (in nanoseconds) before retrying. */
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
/*
* Ahh, all good. It wasn't running, and it wasn't
* runnable, which means that it will never become
* running in the future either. We're all done!
*/
break;
}
return ncsw;
}
#ifdef CONFIG_SMP
static void
@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data)
goto out;
}
if (task_on_rq_queued(p))
if (task_on_rq_queued(p)) {
update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
} else {
p->wake_cpu = arg->dest_cpu;
}
/*
* XXX __migrate_task() can fail, at which point we might end
@ -3341,114 +3490,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
*
* Wait for the thread to block in any of the states set in @match_state.
* If it changes, i.e. @p might have woken up, then return zero. When we
* succeed in waiting for @p to be off its CPU, we return a positive number
* (its total switch count). If a second call a short while later returns the
* same number, the caller can be sure that @p has remained unscheduled the
* whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
int running, queued;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
for (;;) {
/*
* We do the initial early heuristics without holding
* any task-queue locks at all. We'll only try to get
* the runqueue lock when things look like they will
* work out!
*/
rq = task_rq(p);
/*
* If the task is actively running on another CPU
* still, just relax and busy-wait without holding
* any locks.
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
* But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
while (task_on_cpu(rq, p)) {
if (!(READ_ONCE(p->__state) & match_state))
return 0;
cpu_relax();
}
/*
* Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
if (READ_ONCE(p->__state) & match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
/*
* If it changed from the expected state, bail out now.
*/
if (unlikely(!ncsw))
break;
/*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
* Oops. Go back and try again..
*/
if (unlikely(running)) {
cpu_relax();
continue;
}
/*
* It's not enough that it's not actively running,
* it must be off the runqueue _entirely_, and not
* preempted!
*
* So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
if (unlikely(queued)) {
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
/*
* Ahh, all good. It wasn't running, and it wasn't
* runnable, which means that it will never become
* running in the future either. We're all done!
*/
break;
}
return ncsw;
}
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
int match;
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
if (READ_ONCE(p->__state) & state) {
*success = 1;
return true;
}
*success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
/*
@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* p::saved_state to TASK_RUNNING so any further tests will
* not result in false positives vs. @success
*/
if (p->saved_state & state) {
if (match < 0)
p->saved_state = TASK_RUNNING;
*success = 1;
}
#endif
return false;
return match > 0;
}
/*
@ -9548,6 +9586,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@ -9689,7 +9728,6 @@ int sched_cpu_deactivate(unsigned int cpu)
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
update_rq_clock(rq);
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}

View file

@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
FREQUENCY_UTIL, NULL);
}

View file

@ -489,13 +489,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
{
raw_spin_lock_init(&dl_b->dl_runtime_lock);
dl_b->dl_period = period;
dl_b->dl_runtime = runtime;
}
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
@ -1260,43 +1253,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
}
/*
* This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is
* not decreased as "dq = -dt", but as
* "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
* This function implements the GRUB accounting rule. According to the
* GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
* but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
* where u is the utilization of the task, Umax is the maximum reclaimable
* utilization, Uinact is the (per-runqueue) inactive utilization, computed
* as the difference between the "total runqueue utilization" and the
* runqueue active utilization, and Uextra is the (per runqueue) extra
* "runqueue active utilization", and Uextra is the (per runqueue) extra
* reclaimable utilization.
* Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
* multiplied by 2^BW_SHIFT, the result has to be shifted right by
* BW_SHIFT.
* Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
* dl_bw is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
* Since delta is a 64 bit variable, to have an overflow its value
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
* So, overflow is not an issue here.
* Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
* by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
* Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
* is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
* Since delta is a 64 bit variable, to have an overflow its value should be
* larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
* not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
* Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
* we compare u_inact + rq->dl.extra_bw with
* 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
* u_inact + rq->dl.extra_bw can be larger than
* 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
* leading to wrong results)
* Instead of computing max{u, (u_max - u_inact - u_extra)}, we
* compare u_inact + u_extra with u_max - u, because u_inact + u_extra
* can be larger than u_max. So, u_max - u_inact - u_extra would be
* negative leading to wrong results.
*/
if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
u_act = u_act_min;
if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
u_act = dl_se->dl_bw;
else
u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
@ -2795,12 +2784,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
dl_rq->extra_bw = 1 << BW_SHIFT;
dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
dl_rq->extra_bw = to_ratio(global_rt_period(),
global_rt_runtime());
dl_rq->max_bw = dl_rq->extra_bw =
to_ratio(global_rt_period(), global_rt_runtime());
}
}

View file

@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)

View file

@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
/*
 * Report whether every SMT sibling of @cpu, other than @cpu itself, is idle.
 *
 * Without CONFIG_SCHED_SMT no siblings are examined and the result is
 * always true.
 */
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		/* @cpu itself is never tested; one busy sibling is enough. */
		if (sibling != cpu && !idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@ -1700,23 +1717,6 @@ struct numa_stats {
int idle_cpu;
};
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
int sibling;
for_each_cpu(sibling, cpu_smt_mask(cpu)) {
if (cpu == sibling)
continue;
if (!idle_cpu(sibling))
return false;
}
#endif
return true;
}
struct task_numa_env {
struct task_struct *p;
@ -5576,6 +5576,14 @@ static void __cfsb_csd_unthrottle(void *arg)
rq_lock(rq, &rf);
/*
* Iterating over the list can trigger several calls to
* update_rq_clock() in unthrottle_cfs_rq().
* Do it once and skip the potential next ones.
*/
update_rq_clock(rq);
rq_clock_start_loop_update(rq);
/*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg)
rcu_read_unlock();
rq_clock_stop_loop_update(rq);
rq_unlock(rq, &rf);
}
@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
/*
* The rq clock has already been updated in the
* set_rq_offline(), so we should skip updating
* the rq clock again in unthrottle_cfs_rq().
*/
rq_clock_start_loop_update(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
rq_clock_stop_loop_update(rq);
}
#else /* CONFIG_CFS_BANDWIDTH */
@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
/*
* Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
* (@dst_cpu = -1) or migrated to @dst_cpu.
/**
* cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
* @cpu: the CPU to get the utilization for
* @p: task for which the CPU utilization should be predicted or NULL
* @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
* @boost: 1 to enable boosting, otherwise 0
*
* The unit of the return value must be the same as the one of CPU capacity
* so that CPU utilization can be compared with CPU capacity.
*
* CPU utilization is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on that CPU.
* It represents the amount of CPU capacity currently used by CFS tasks in
* the range [0..max CPU capacity] with max CPU capacity being the CPU
* capacity at f_max.
*
* The estimated CPU utilization is defined as the maximum between CPU
* utilization and sum of the estimated utilization of the currently
* runnable tasks on that CPU. It preserves a utilization "snapshot" of
* previously-executed tasks, which helps better deduce how busy a CPU will
* be when a long-sleeping task wakes up. The contribution to CPU utilization
* of such a task would be significantly decayed at this point of time.
*
* Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
* CPU contention for CFS tasks can be detected by CPU runnable > CPU
* utilization. Boosting is implemented in cpu_util() so that internal
* users (e.g. EAS) can use it next to external users (e.g. schedutil),
* latter via cpu_util_cfs_boost().
*
* CPU utilization can be higher than the current CPU capacity
* (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
* of rounding errors as well as task migrations or wakeups of new tasks.
* CPU utilization has to be capped to fit into the [0..max CPU capacity]
* range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
* could be seen as over-utilized even though CPU1 has 20% of spare CPU
* capacity. CPU utilization is allowed to overshoot current CPU capacity
* though since this is useful for predicting the CPU capacity required
* after task migrations (scheduler-driven DVFS).
*
* Return: (Boosted) (estimated) utilization for the specified CPU.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
static unsigned long
cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
unsigned long runnable;
if (boost) {
runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
util = max(util, runnable);
}
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
if (p && task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
@ -7227,6 +7289,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
if (boost)
util_est = max(util_est, runnable);
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@ -7255,7 +7320,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
else if (unlikely(task_on_rq_queued(p) || current == p))
else if (p && unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
@ -7264,6 +7329,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
return min(util, capacity_orig_of(cpu));
}
/*
 * cpu_util_cfs - utilization of @cpu as returned by cpu_util() with no task
 * (@p == NULL), no migration destination (@dst_cpu == -1) and boosting
 * disabled (@boost == 0).
 */
unsigned long cpu_util_cfs(int cpu)
{
return cpu_util(cpu, NULL, -1, 0);
}
/*
 * cpu_util_cfs_boost - same call as cpu_util_cfs() but with boosting enabled
 * (@boost == 1): cpu_util() is invoked with no task (@p == NULL) and no
 * migration destination (@dst_cpu == -1).
 */
unsigned long cpu_util_cfs_boost(int cpu)
{
return cpu_util(cpu, NULL, -1, 1);
}
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@ -7281,9 +7356,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util_cfs(cpu);
p = NULL;
return cpu_util_next(cpu, p, -1);
return cpu_util(cpu, p, -1, 0);
}
/*
@ -7330,7 +7405,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
* cpu_capacity.
*
* The contribution of the task @p for which we want to estimate the
* energy cost is removed (by cpu_util_next()) and must be calculated
* energy cost is removed (by cpu_util()) and must be calculated
* separately (see eenv_task_busy_time). This ensures:
*
* - A stable PD utilization, no matter which CPU of that PD we want to place
@ -7351,7 +7426,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
int cpu;
for_each_cpu(cpu, pd_cpus) {
unsigned long util = cpu_util_next(cpu, p, -1);
unsigned long util = cpu_util(cpu, p, -1, 0);
busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
}
@ -7375,8 +7450,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
unsigned long util = cpu_util_next(cpu, p, dst_cpu);
unsigned long cpu_util;
unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
unsigned long eff_util;
/*
* Performance domain frequency: utilization clamping
@ -7385,8 +7460,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
max_util = max(max_util, cpu_util);
eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
max_util = max(max_util, eff_util);
}
return min(max_util, eenv->cpu_cap);
@ -7521,7 +7596,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
util = cpu_util_next(cpu, p, cpu);
util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
/*
@ -9331,96 +9406,61 @@ group_type group_classify(unsigned int imbalance_pct,
}
/**
* asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
* @dst_cpu: Destination CPU of the load balancing
* @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
* @sg: The candidate busiest group
* sched_use_asym_prio - Check whether asym_packing priority must be used
* @sd: The scheduling domain of the load balancing
* @cpu: A CPU
*
* Check the state of the SMT siblings of both @sds::local and @sg and decide
* if @dst_cpu can pull tasks.
* Always use CPU priority when balancing load between SMT siblings. When
* balancing load between cores, it is not sufficient that @cpu is idle. Only
* use CPU priority if the whole core is idle.
*
* If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
* the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
* only if @dst_cpu has higher priority.
*
* If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
* busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
* Bigger imbalances in the number of busy CPUs will be dealt with in
* update_sd_pick_busiest().
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
*
* Return: true if @dst_cpu can pull tasks, false otherwise.
* Returns: True if the priority of @cpu must be followed. False otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
struct sched_group *sg)
static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{
#ifdef CONFIG_SCHED_SMT
bool local_is_smt, sg_is_smt;
int sg_busy_cpus;
if (!sched_smt_active())
return true;
local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
if (!local_is_smt) {
/*
* If we are here, @dst_cpu is idle and does not have SMT
* siblings. Pull tasks if candidate group has two or more
* busy CPUs.
*/
if (sg_busy_cpus >= 2) /* implies sg_is_smt */
return true;
/*
* @dst_cpu does not have SMT siblings. @sg may have SMT
* siblings and only one is busy. In such case, @dst_cpu
* can help if it has higher priority and is idle (i.e.,
* it has no running tasks).
*/
return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
}
/* @dst_cpu has SMT siblings. */
if (sg_is_smt) {
int local_busy_cpus = sds->local->group_weight -
sds->local_stat.idle_cpus;
int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
if (busy_cpus_delta == 1)
return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
return false;
}
/*
* @sg does not have SMT siblings. Ensure that @sds::local does not end
* up with more than one busy SMT sibling and only pull tasks if there
* are not busy CPUs (i.e., no CPU has running tasks).
*/
if (!sds->local_stat.sum_nr_running)
return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
return false;
#else
/* Always return false so that callers deal with non-SMT cases. */
return false;
#endif
return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
}
/**
* sched_asym - Check if the destination CPU can do asym_packing load balance
* @env: The load balancing environment
* @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
* @group: The candidate busiest group
*
* @env::dst_cpu can do asym_packing if it has higher priority than the
* preferred CPU of @group.
*
* SMT is a special case. If we are balancing load between cores, @env::dst_cpu
* can do asym_packing balance only if all its SMT siblings are idle. Also, it
* can only do it if @group is an SMT group and has exactly one busy CPU. Larger
* imbalances in the number of CPUS are dealt with in find_busiest_group().
*
* If we are balancing load within an SMT core, or at DIE domain level, always
* proceed.
*
* Return: true if @env::dst_cpu can do asym_packing load balance. False
* otherwise.
*/
static inline bool
sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
struct sched_group *group)
{
/* Only do SMT checks if either local or candidate have SMT siblings */
if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
(group->flags & SD_SHARE_CPUCAPACITY))
return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
/* Ensure that the whole local core is idle, if applicable. */
if (!sched_use_asym_prio(env->sd, env->dst_cpu))
return false;
/*
* CPU priorities do not make sense for SMT cores with more than one
* busy sibling.
*/
if (group->flags & SD_SHARE_CPUCAPACITY) {
if (sgs->group_weight - sgs->idle_cpus != 1)
return false;
}
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
@ -9610,10 +9650,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
* select the 1st one.
* select the 1st one, except if @sg is composed of SMT
* siblings.
*/
if (sgs->avg_load <= busiest->avg_load)
if (sgs->avg_load < busiest->avg_load)
return false;
if (sgs->avg_load == busiest->avg_load) {
/*
* SMT sched groups need more help than non-SMT groups.
* If @sg happens to also be SMT, either choice is good.
*/
if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
return false;
}
break;
case group_has_spare:
@ -10088,7 +10140,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@ -10129,8 +10180,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sg = sg->next;
} while (sg != env->sd->groups);
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
/*
* Indicate that the child domain of the busiest group prefers tasks
* go to a child's sibling domains first. NB the flags of a sched group
* are those of the child domain.
*/
if (sds->busiest)
sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
if (env->sd->flags & SD_NUMA)
@ -10440,7 +10496,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
/* Try to move all excess tasks to child's sibling domain */
/*
* Try to move all excess tasks to a sibling domain of the busiest
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
@ -10542,8 +10601,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
/* Make sure we only pull tasks from a CPU of lower priority */
/*
* Make sure we only pull tasks from a CPU of lower priority
* when balancing between SMT siblings.
*
* If balancing between cores, let lower priority CPUs help
* SMT cores with more than one busy sibling.
*/
if ((env->sd->flags & SD_ASYM_PACKING) &&
sched_use_asym_prio(env->sd, i) &&
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@ -10581,7 +10647,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
util = cpu_util_cfs(i);
util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@ -10632,12 +10698,19 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* lower priority CPUs in order to pack all tasks in the
* highest priority CPUs.
* ASYM_PACKING needs to force migrate tasks from busy but lower
* priority CPUs in order to pack all tasks in the highest priority
* CPUs. When done between cores, do it only if the whole core is idle.
*
* If @env::src_cpu is an SMT core with busy siblings, let
* the lower priority @env::dst_cpu help it. Do not follow
* CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
sched_asym_prefer(env->dst_cpu, env->src_cpu);
sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
@ -10744,7 +10817,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups),
.dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
.loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
@ -11371,9 +11444,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
*
* When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
if (sched_use_asym_prio(sd, i) &&
sched_asym_prefer(i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}

View file

@ -160,7 +160,6 @@ __setup("psi=", setup_psi);
#define EXP_300s 2034 /* 1/exp(2s/300s) */
/* PSI trigger definitions */
#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
@ -1305,8 +1304,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
if (window_us < WINDOW_MIN_US ||
window_us > WINDOW_MAX_US)
if (window_us == 0 || window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
/*
@ -1409,11 +1407,16 @@ void psi_trigger_destroy(struct psi_trigger *t)
group->rtpoll_nr_triggers[t->state]--;
if (!group->rtpoll_nr_triggers[t->state])
group->rtpoll_states &= ~(1 << t->state);
/* reset min update period for the remaining triggers */
list_for_each_entry(tmp, &group->rtpoll_triggers, node)
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->rtpoll_min_period = period;
/*
* Reset min update period for the remaining triggers
* iff the destroying trigger had the min window size.
*/
if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
list_for_each_entry(tmp, &group->rtpoll_triggers, node)
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->rtpoll_min_period = period;
}
/* Destroy rtpoll_task when the last trigger is destroyed */
if (group->rtpoll_states == 0) {
group->rtpoll_until = 0;

View file

@ -286,12 +286,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
u64 dl_period;
};
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@ -753,6 +747,12 @@ struct dl_rq {
u64 this_bw;
u64 extra_bw;
/*
* Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
* tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
*/
u64 max_bw;
/*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
/*
* During cpu offlining and rq wide unthrottling, we can trigger
* an update_rq_clock() for several cfs and rt runqueues (Typically
* when using list_for_each_entry_*)
* rq_clock_start_loop_update() can be called after updating the clock
* once and before iterating over the list to prevent multiple updates.
* After the iterative traversal, we need to call rq_clock_stop_loop_update()
* to clear RQCF_ACT_SKIP of rq->clock_update_flags.
*/
static inline void rq_clock_start_loop_update(struct rq *rq)
{
/* The caller is expected to hold the rq lock; lockdep checks this. */
lockdep_assert_rq_held(rq);
/* Warn if RQCF_ACT_SKIP is already set when the loop section starts. */
SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
/* Set the flag; rq_clock_stop_loop_update() clears it again. */
rq->clock_update_flags |= RQCF_ACT_SKIP;
}
/*
 * Counterpart of rq_clock_start_loop_update(): clear RQCF_ACT_SKIP from
 * rq->clock_update_flags once the list traversal is finished.
 */
static inline void rq_clock_stop_loop_update(struct rq *rq)
{
/* The caller is expected to hold the rq lock; lockdep checks this. */
lockdep_assert_rq_held(rq);
/* Unconditional clear: no warning is issued if the flag was not set. */
rq->clock_update_flags &= ~RQCF_ACT_SKIP;
}
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
@ -1772,6 +1794,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
/*
 * A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag.
 *
 * Built with an X-macro: SD_FLAG() is temporarily defined so that each
 * SD_FLAG(name, mflags) entry expands to "name |" when mflags contains
 * SDF_SHARED_CHILD and to "0 |" otherwise. Including sd_flags.h in the
 * middle of the initializer therefore produces a chain of ORed terms, which
 * the trailing "0;" terminates.
 * NOTE(review): assumes sd_flags.h consists solely of SD_FLAG() invocations
 * when SD_FLAG is defined by the includer - confirm against that header.
 */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
static const unsigned int SD_SHARED_CHILD_MASK =
#include <linux/sched/sd_flags.h>
0;
/* Drop the temporary definition so it does not leak to later code. */
#undef SD_FLAG
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@ -1779,16 +1808,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
* Returns the highest sched_domain of a CPU which contains the given flag.
* Returns the highest sched_domain of a CPU which contains @flag. If @flag has
* the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
if (!(sd->flags & flag))
if (sd->flags & flag) {
hsd = sd;
continue;
}
/*
* Stop the search if @flag is known to be shared at lower
* levels. It will not be found further up.
*/
if (flag & SD_SHARED_CHILD_MASK)
break;
hsd = sd;
}
return hsd;
@ -2378,7 +2416,6 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
@ -2946,53 +2983,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
/**
* cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
* @cpu: the CPU to get the utilization for.
*
* The unit of the return value must be the same as the one of CPU capacity
* so that CPU utilization can be compared with CPU capacity.
*
* CPU utilization is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on that CPU.
* It represents the amount of CPU capacity currently used by CFS tasks in
* the range [0..max CPU capacity] with max CPU capacity being the CPU
* capacity at f_max.
*
* The estimated CPU utilization is defined as the maximum between CPU
* utilization and sum of the estimated utilization of the currently
* runnable tasks on that CPU. It preserves a utilization "snapshot" of
* previously-executed tasks, which helps better deduce how busy a CPU will
* be when a long-sleeping task wakes up. The contribution to CPU utilization
* of such a task would be significantly decayed at this point of time.
*
* CPU utilization can be higher than the current CPU capacity
* (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
* of rounding errors as well as task migrations or wakeups of new tasks.
* CPU utilization has to be capped to fit into the [0..max CPU capacity]
* range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
* could be seen as over-utilized even though CPU1 has 20% of spare CPU
* capacity. CPU utilization is allowed to overshoot current CPU capacity
* though since this is useful for predicting the CPU capacity required
* after task migrations (scheduler-driven DVFS).
*
* Return: (Estimated) utilization for the specified CPU.
*/
static inline unsigned long cpu_util_cfs(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned long util;
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
READ_ONCE(cfs_rq->avg.util_est.enqueued));
}
return min(util, capacity_orig_of(cpu));
}
extern unsigned long cpu_util_cfs(int cpu);
extern unsigned long cpu_util_cfs_boost(int cpu);
static inline unsigned long cpu_util_rt(struct rq *rq)
{

View file

@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
unsigned long flags;
struct rq_flags rf;
raw_spin_rq_lock_irqsave(rq, flags);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
old_rd = rq->rd;
@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
if (parent->parent) {
parent->parent->child = tmp;
if (tmp->flags & SD_SHARE_CPUCAPACITY)
parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
}
/*
* Transfer SD_PREFER_SIBLING down in case of a
* degenerate parent; the spans match for this
@ -1676,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
void set_sched_topology(struct sched_domain_topology_level *tl)
void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
if (WARN_ON_ONCE(sched_smp_initialized))
return;

View file

@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
}
EXPORT_SYMBOL(autoremove_wake_function);
static inline bool is_kthread_should_stop(void)
{
return (current->flags & PF_KTHREAD) && kthread_should_stop();
}
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
* or woken_wake_function() sees our store to current->state.
*/
set_current_state(mode); /* A */
if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);

View file

@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = {
.actual_read_sched_clock = jiffy_sched_clock_read,
};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
@ -77,26 +77,36 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
notrace int sched_clock_read_retry(unsigned int seq)
{
return read_seqcount_latch_retry(&cd.seq, seq);
return raw_read_seqcount_latch_retry(&cd.seq, seq);
}
unsigned long long notrace sched_clock(void)
unsigned long long noinstr sched_clock_noinstr(void)
{
u64 cyc, res;
unsigned int seq;
struct clock_read_data *rd;
unsigned int seq;
u64 cyc, res;
do {
rd = sched_clock_read_begin(&seq);
seq = raw_read_seqcount_latch(&cd.seq);
rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
} while (sched_clock_read_retry(seq));
} while (raw_read_seqcount_latch_retry(&cd.seq, seq));
return res;
}
/*
 * sched_clock() - wrapper around sched_clock_noinstr().
 *
 * Reads the clock via sched_clock_noinstr() with preemption disabled for the
 * duration of the read and returns that value unchanged.
 */
unsigned long long notrace sched_clock(void)
{
unsigned long long ns;
/*
 * The _notrace preempt helpers are used here.
 * NOTE(review): presumably so that the preempt toggling itself is not
 * traced when tracers call sched_clock() - confirm.
 */
preempt_disable_notrace();
ns = sched_clock_noinstr();
preempt_enable_notrace();
return ns;
}
/*
* Updating the data required to read the clock.
*

View file

@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
now += fast_tk_get_delta_ns(tkr);
} while (read_seqcount_latch_retry(&tkf->seq, seq));
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
delta = fast_tk_get_delta_ns(tkr);
} while (read_seqcount_latch_retry(&tkf->seq, seq));
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
*mono = basem + delta;