A set of scheduler fixes:

- Address a load balancer regression by making the load balancer use the
    same logic as the wakeup path to spread tasks in the LLC domain.
 
  - Prefer the CPU on which a task run last over the local CPU in the fast
    wakeup path for asymmetric CPU capacity systems to align with the
    symmetric case. This ensures more locality and prevents massive
    migration overhead on those asymetric systems
 
  - Fix a memory corruption bug in the scheduler debug code caused by
    handing a modified buffer pointer to kfree().
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAl+xJIoTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYofyGD/9rUnLlC1h7jEufVa4yPG94DcEqiXT7
 8B/zNRKnOmqQePCYUm+DS8njSFqpF9VjR+5zpos3bgYqwn7DyfV+hpxbbgS9NDh/
 qRg5gxhTrR4uMyZN62Fex5JS4bP8mKO7oc0usgV2Ytsg3e4H+9DqYhuaA5GrJAxC
 J3d1Hv/YBW2Uo+RZpB20aaJr0srN7bswTtPMxeeqo8q3Qh4pFcI+rmA4WphVAgHF
 jQWaNP4YVTgNjqxy7nBp7zFHlSdRbLohldZFtueYmRo1mjmkyQ34Cg7etfBvN1Uf
 iVYZLaInr0YPr0qR4FrQ3yI8ln/HESxshs0ARzMReYVT71mV//o5wftE18uCULQB
 rRu9vYz+LBVhkdgx118jJdNJqyqk6Ca6h9ZLqyBKuckj9a39289bwWiS6D/6W51p
 gurq58YTb2lRzyCnOVEULXehYRJkDI8EToiWppRVm9gy43OFPNox7n6TvNLW6BLS
 I8msTVdqDYXXj4U1o4Mf9K5LBKlda+ARuBu87r7kH1BJLxXHnOHcEkmeN8O9k7eu
 jdWfeDzDDjBjt/TU+X4f4RNjudUZrSPQrrESE5+XhfM4CwqcPXa2M/dGtPekW/ED
 9IqxPvwkau+0Ym6gkuanfnmda+JVR/nLvZV0uFuUGd+2xMcRemZbZE6hTUiYvYPY
 CAHpOhmeakbr6w==
 =wFcU
 -----END PGP SIGNATURE-----

Merge tag 'sched-urgent-2020-11-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:
 "A set of scheduler fixes:

   - Address a load balancer regression by making the load balancer use
     the same logic as the wakeup path to spread tasks in the LLC domain

   - Prefer the CPU on which a task run last over the local CPU in the
     fast wakeup path for asymmetric CPU capacity systems to align with
     the symmetric case. This ensures more locality and prevents massive
     migration overhead on those asymetric systems

   - Fix a memory corruption bug in the scheduler debug code caused by
     handing a modified buffer pointer to kfree()"

* tag 'sched-urgent-2020-11-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/debug: Fix memory corruption caused by multiple small reads of flags
  sched/fair: Prefer prev cpu in asymmetric wakeup path
  sched/fair: Ensure tasks spreading in LLC during LB
This commit is contained in:
Linus Torvalds 2020-11-15 09:39:35 -08:00
commit d0a37fd57f
2 changed files with 51 additions and 31 deletions

View file

@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,
unsigned long flags = *(unsigned long *)table->data;
size_t data_size = 0;
size_t len = 0;
char *tmp;
char *tmp, *buf;
int idx;
if (write)
@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,
return 0;
}
tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
if (!tmp)
buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
char *name = sd_flag_debug[idx].name;
len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
}
tmp += *ppos;
tmp = buf + *ppos;
len -= *ppos;
if (len > *lenp)
@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
kfree(tmp);
kfree(buf);
return 0;
}

View file

@ -6172,21 +6172,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
unsigned long best_cap = 0;
unsigned long task_util, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
sync_entity_load_avg(&p->se);
cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
task_util = uclamp_task_util(p);
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
if (task_fits_capacity(p, cpu_cap))
if (fits_capacity(task_util, cpu_cap))
return cpu;
if (cpu_cap > best_cap) {
@ -6198,44 +6198,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
static inline bool asym_fits_capacity(int task_util, int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
return fits_capacity(task_util, capacity_of(cpu));
return true;
}
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
unsigned long task_util;
int i, recent_used_cpu;
/*
* For asymmetric CPU capacity systems, our domain of interest is
* sd_asym_cpucapacity rather than sd_llc.
* On asymmetric system, update task utilization because we will check
* that the task fits with cpu's capacity.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
* capacity_orig value through the cpuset), the key will be set
* but the CPUs within that cpuset will not have a domain with
* SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
* capacity path.
*/
if (!sd)
goto symmetric;
i = select_idle_capacity(p, sd, target);
return ((unsigned)i < nr_cpumask_bits) ? i : target;
sync_entity_load_avg(&p->se);
task_util = uclamp_task_util(p);
}
symmetric:
if (available_idle_cpu(target) || sched_idle_cpu(target))
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
asym_fits_capacity(task_util, target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)))
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
asym_fits_capacity(task_util, prev))
return prev;
/*
@ -6258,7 +6256,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
asym_fits_capacity(task_util, recent_used_cpu)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@ -6267,6 +6266,26 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return recent_used_cpu;
}
/*
* For asymmetric CPU capacity systems, our domain of interest is
* sd_asym_cpucapacity rather than sd_llc.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
* capacity_orig value through the cpuset), the key will be set
* but the CPUs within that cpuset will not have a domain with
* SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
* capacity path.
*/
if (sd) {
i = select_idle_capacity(p, sd, target);
return ((unsigned)i < nr_cpumask_bits) ? i : target;
}
}
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
@ -9031,7 +9050,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* emptying busiest.
*/
if (local->group_type == group_has_spare) {
if (busiest->group_type > group_fully_busy) {
if ((busiest->group_type > group_fully_busy) &&
!(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity