linux-stable/include/linux/topology.h
Matt Fleming a55c7454a8 sched/topology: Improve load balancing on AMD EPYC systems
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
for any sched domains with a NUMA distance greater than 2 hops
(RECLAIM_DISTANCE). The idea being that it's expensive to balance
across domains that far apart.

However, as is rather unfortunately explained in:

  commit 32e45ff43e ("mm: increase RECLAIM_DISTANCE to 30")

the value for RECLAIM_DISTANCE is based on node distance tables from
2011-era hardware.

Current AMD EPYC machines have the following NUMA node distances:

 node distances:
 node   0   1   2   3   4   5   6   7
   0:  10  16  16  16  32  32  32  32
   1:  16  10  16  16  32  32  32  32
   2:  16  16  10  16  32  32  32  32
   3:  16  16  16  10  32  32  32  32
   4:  32  32  32  32  10  16  16  16
   5:  32  32  32  32  16  10  16  16
   6:  32  32  32  32  16  16  10  16
   7:  32  32  32  32  16  16  16  10

where 2 hops is 32.

The result is that the scheduler fails to load balance properly across
NUMA nodes on different sockets -- 2 hops apart.

For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,

  $ numactl -C 0-7,32-39 ./spinner 16

causes all threads to fork and remain on node 0 until the active
balancer kicks in after a few seconds and forcibly moves some threads
to node 4.

Override node_reclaim_distance for AMD Zen.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suravee.Suthikulpanit@amd.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas.Lendacky@amd.com
Cc: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20190808195301.13222-3-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-09-03 09:17:37 +02:00

230 lines
5.5 KiB
C

/*
* include/linux/topology.h
*
* Written by: Matthew Dobson, IBM Corporation
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to <colpatch@us.ibm.com>
*/
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H
#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>
#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif
#define for_each_node_with_cpus(node) \
for_each_online_node(node) \
if (nr_cpus_node(node))
int arch_update_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
#define REMOTE_DISTANCE 20
#ifndef node_distance
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
* and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
#endif
/*
* The following tunable allows platforms to override the default node
* reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
* sufficiently fast that the default value actually hurts
* performance.
*
* AMD EPYC machines use this because even though the 2-hop distance
* is 32 (3.2x slower than a local memory access) performance actually
* *improves* if allowed to reclaim memory and load balance tasks
* between NUMA nodes 2-hops apart.
*/
extern int __read_mostly node_reclaim_distance;
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);
#ifndef numa_node_id
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
return raw_cpu_read(numa_node);
}
#endif
#ifndef cpu_to_node
static inline int cpu_to_node(int cpu)
{
return per_cpu(numa_node, cpu);
}
#endif
#ifndef set_numa_node
static inline void set_numa_node(int node)
{
this_cpu_write(numa_node, node);
}
#endif
#ifndef set_cpu_numa_node
static inline void set_cpu_numa_node(int cpu, int node)
{
per_cpu(numa_node, cpu) = node;
}
#endif
#else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */
/* Returns the number of the current Node. */
#ifndef numa_node_id
static inline int numa_node_id(void)
{
return cpu_to_node(raw_smp_processor_id());
}
#endif
#endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
*/
DECLARE_PER_CPU(int, _numa_mem_);
extern int _node_numa_mem_[MAX_NUMNODES];
#ifndef set_numa_mem
static inline void set_numa_mem(int node)
{
this_cpu_write(_numa_mem_, node);
_node_numa_mem_[numa_node_id()] = node;
}
#endif
#ifndef node_to_mem_node
static inline int node_to_mem_node(int node)
{
return _node_numa_mem_[node];
}
#endif
#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
return raw_cpu_read(_numa_mem_);
}
#endif
#ifndef cpu_to_mem
static inline int cpu_to_mem(int cpu)
{
return per_cpu(_numa_mem_, cpu);
}
#endif
#ifndef set_cpu_numa_mem
static inline void set_cpu_numa_mem(int cpu, int node)
{
per_cpu(_numa_mem_, cpu) = node;
_node_numa_mem_[cpu_to_node(cpu)] = node;
}
#endif
#else /* !CONFIG_HAVE_MEMORYLESS_NODES */
#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
return numa_node_id();
}
#endif
#ifndef node_to_mem_node
static inline int node_to_mem_node(int node)
{
return node;
}
#endif
#ifndef cpu_to_mem
static inline int cpu_to_mem(int cpu)
{
return cpu_to_node(cpu);
}
#endif
#endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu) ((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu) ((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu) ((void)(cpu), 0)
#endif
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu) cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu) cpumask_of(cpu)
#endif
#ifdef CONFIG_SCHED_SMT
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
return topology_sibling_cpumask(cpu);
}
#endif
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
#endif /* _LINUX_TOPOLOGY_H */