linux-stable/arch/x86/include/asm/numa.h
Mike Rapoport (IBM) a1e2b8b368 x86/mm: Drop the 4 MB restriction on minimal NUMA node memory size
Qi Zheng reported crashes in a production environment and provided a
simplified example as a reproducer:

 |  For example, if we use Qemu to start a two NUMA node kernel,
 |  one of the nodes has 2M memory (less than NODE_MIN_SIZE),
 |  and the other node has 2G, then we will encounter the
 |  following panic:
 |
 |    BUG: kernel NULL pointer dereference, address: 0000000000000000
 |    <...>
 |    RIP: 0010:_raw_spin_lock_irqsave+0x22/0x40
 |    <...>
 |    Call Trace:
 |      <TASK>
 |      deactivate_slab()
 |      bootstrap()
 |      kmem_cache_init()
 |      start_kernel()
 |      secondary_startup_64_no_verify()

The crashes happen because of an inconsistency between the nodemask,
which treats nodes with less than 4MB as memoryless, and the actual
memory fed into the core mm.

The commit:

  9391a3f9c7 ("[PATCH] x86_64: Clear more state when ignoring empty node in SRAT parsing")

... that introduced the minimal size of a NUMA node does not explain
why a node cannot be smaller than 4MB, or which boot failures this
restriction might fix.

Fixes have been submitted to the core MM code to tighten up the
memory topologies it accepts and to not crash on weird input:

  mm: page_alloc: skip memoryless nodes entirely
  mm: memory_hotplug: drop memoryless node from fallback lists

Andrew has accepted them into the -mm tree, but there are no
stable SHA1's yet.

This patch drops the limitation for minimal node size on x86, which:

  - works around the crash without the fixes to the core MM,
  - makes x86 topologies less weird,
  - removes an arbitrary and undocumented limitation on NUMA topologies.

[ mingo: Improved changelog clarity. ]

Reported-by: Qi Zheng <zhengqi.arch@bytedance.com>
Tested-by: Mario Casquero <mcasquer@redhat.com>
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Rik van Riel <riel@surriel.com>
Link: https://lore.kernel.org/r/ZS+2qqjEO5/867br@gmail.com
2023-10-20 10:40:22 +02:00

84 lines
2.1 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_NUMA_H
#define _ASM_X86_NUMA_H
#include <linux/nodemask.h>
#include <linux/errno.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
#ifdef CONFIG_NUMA
/*
 * Evaluates to twice MAX_NUMNODES.
 * NOTE(review): presumably the capacity of the NUMA memory-block table -
 * confirm against the users of this constant.
 */
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
/*
 * Defined outside this header.
 * NOTE(review): presumably non-zero when NUMA handling is disabled - confirm.
 */
extern int numa_off;
/*
 * __apicid_to_node[] stores the raw mapping between physical apicid and
 * node and is used to initialize cpu_to_node mapping.
 *
 * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
 * should be accessed by the accessors - set_apicid_to_node() and
 * numa_cpu_node().
 */
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
/*
 * The declarations below are annotated __initdata / __init.
 * NOTE(review): presumably numa_nodes_parsed collects the nodes found while
 * parsing firmware tables, numa_add_memblk() registers the range
 * [start, end) for @nodeid, and numa_set_distance() records the distance
 * between two nodes - confirm against the definitions.
 */
extern nodemask_t numa_nodes_parsed __initdata;
extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
extern void __init numa_set_distance(int from, int to, int distance);
/*
 * set_apicid_to_node - store the node for a physical APIC id
 * @apicid: index into __apicid_to_node[]
 * @node:   node id written to that slot
 *
 * Plain array store; @apicid is not range-checked here.
 */
static inline void set_apicid_to_node(int apicid, s16 node)
{
	__apicid_to_node[apicid] = node;
}
/*
 * Defined out of line when CONFIG_NUMA is set; the !CONFIG_NUMA variant in
 * this header is an inline stub that returns NUMA_NO_NODE.
 */
extern int numa_cpu_node(int cpu);
#else /* CONFIG_NUMA */
/*
 * !CONFIG_NUMA variant: intentionally empty, both arguments are ignored,
 * so callers need no #ifdef of their own.
 */
static inline void set_apicid_to_node(int apicid, s16 node)
{
	/* nothing to do without NUMA */
}
/*
 * !CONFIG_NUMA variant: @cpu is ignored and NUMA_NO_NODE is returned
 * unconditionally.
 */
static inline int numa_cpu_node(int cpu)
{
	return NUMA_NO_NODE;
}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_X86_32
# include <asm/numa_32.h>
#endif
#ifdef CONFIG_NUMA
/*
 * CONFIG_NUMA: CPU <-> node bookkeeping entry points, defined out of line.
 * The #else branch of this conditional supplies empty inline stubs with the
 * same names so callers compile unchanged without NUMA.
 * init_cpu_to_node() is annotated __init.
 */
extern void numa_set_node(int cpu, int node);
extern void numa_clear_node(int cpu);
extern void __init init_cpu_to_node(void);
extern void numa_add_cpu(int cpu);
extern void numa_remove_cpu(int cpu);
extern void init_gi_nodes(void);
#else /* CONFIG_NUMA */
/*
 * !CONFIG_NUMA: empty stand-ins for the CPU <-> node bookkeeping API.
 * Every function ignores its arguments and does nothing.
 */
static inline void numa_set_node(int cpu, int node)
{
}

static inline void numa_clear_node(int cpu)
{
}

static inline void init_cpu_to_node(void)
{
}

static inline void numa_add_cpu(int cpu)
{
}

static inline void numa_remove_cpu(int cpu)
{
}

static inline void init_gi_nodes(void)
{
}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
/*
 * Only declared when CONFIG_DEBUG_PER_CPU_MAPS is set; defined elsewhere.
 * NOTE(review): presumably sets (@enable true) or clears (@enable false)
 * @cpu in @node's cpumask with extra debug checks - confirm.
 */
void debug_cpumask_set_cpu(int cpu, int node, bool enable);
#endif
#ifdef CONFIG_NUMA_EMU
/* 32 << 20 bytes, i.e. 32 MiB, as a u64. */
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
/*
 * Complement of (FAKE_NODE_MIN_SIZE - 1): AND-ing with this mask clears the
 * low bits, rounding a value down to a FAKE_NODE_MIN_SIZE multiple.
 */
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
/*
 * Defined elsewhere when CONFIG_NUMA_EMU is set; without it an inline stub
 * in this header returns -EINVAL.
 * NOTE(review): presumably parses the NUMA emulation command-line
 * argument - confirm.
 */
int numa_emu_cmdline(char *str);
#else /* CONFIG_NUMA_EMU */
/*
 * !CONFIG_NUMA_EMU variant: @str is never inspected and the call always
 * fails with -EINVAL.
 */
static inline int numa_emu_cmdline(char *str)
{
	return -EINVAL;
}
#endif /* CONFIG_NUMA_EMU */
#endif /* _ASM_X86_NUMA_H */