mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-11-01 17:08:10 +00:00
95cf82ecc1
When parsing SRAT, all memory ranges are added into numa_meminfo. In numa_init(), before entering numa_cleanup_meminfo(), all possible memory ranges are in numa_meminfo. And numa_cleanup_meminfo() removes all ranges over max_pfn or empty. But, this only works if the nodes are continuous. Let's have a look at the following example: We have an SRAT like this: SRAT: Node 0 PXM 0 [mem 0x00000000-0x5fffffff] SRAT: Node 0 PXM 0 [mem 0x100000000-0x1ffffffffff] SRAT: Node 1 PXM 1 [mem 0x20000000000-0x3ffffffffff] SRAT: Node 4 PXM 2 [mem 0x40000000000-0x5ffffffffff] hotplug SRAT: Node 5 PXM 3 [mem 0x60000000000-0x7ffffffffff] hotplug SRAT: Node 2 PXM 4 [mem 0x80000000000-0x9ffffffffff] hotplug SRAT: Node 3 PXM 5 [mem 0xa0000000000-0xbffffffffff] hotplug SRAT: Node 6 PXM 6 [mem 0xc0000000000-0xdffffffffff] hotplug SRAT: Node 7 PXM 7 [mem 0xe0000000000-0xfffffffffff] hotplug On boot, only node 0,1,2,3 exist. And the numa_meminfo will look like this: numa_meminfo.nr_blks = 9 1. on node 0: [0, 60000000] 2. on node 0: [100000000, 20000000000] 3. on node 1: [20000000000, 40000000000] 4. on node 4: [40000000000, 60000000000] 5. on node 5: [60000000000, 80000000000] 6. on node 2: [80000000000, a0000000000] 7. on node 3: [a0000000000, a0800000000] 8. on node 6: [c0000000000, a0800000000] 9. on node 7: [e0000000000, a0800000000] And numa_cleanup_meminfo() will merge 1 and 2, and remove 8,9 because the end address is over max_pfn, which is a0800000000. But 4 and 5 are not removed because their end addresses are less than max_pfn. But in fact, node 4 and 5 don't exist. In a word, numa_cleanup_meminfo() is not able to handle holes between nodes. Since memory ranges in node 4 and 5 are in numa_meminfo, in numa_register_memblks(), node 4 and 5 will be mistakenly set to online. 
If you run lscpu, it will show: NUMA node0 CPU(s): 0-14,128-142 NUMA node1 CPU(s): 15-29,143-157 NUMA node2 CPU(s): NUMA node3 CPU(s): NUMA node4 CPU(s): 62-76,190-204 NUMA node5 CPU(s): 78-92,206-220 In this patch, we use memblock_overlaps_region() to check if ranges in numa_meminfo overlap with ranges in memory_block. Since memory_block contains all available memory at boot time, if they overlap, it means the ranges exist. If not, then remove them from numa_meminfo. After this patch, lscpu will show: NUMA node0 CPU(s): 0-14,128-142 NUMA node1 CPU(s): 15-29,143-157 NUMA node4 CPU(s): 62-76,190-204 NUMA node5 CPU(s): 78-92,206-220 Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com> Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tejun Heo <tj@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Xishi Qiu <qiuxishi@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Vladimir Murzin <vladimir.murzin@arm.com> Cc: Fabian Frederick <fabf@skynet.be> Cc: Alexander Kuleshov <kuleshovmail@gmail.com> Cc: Baoquan He <bhe@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
425 lines
14 KiB
C
425 lines
14 KiB
C
#ifndef _LINUX_MEMBLOCK_H
|
|
#define _LINUX_MEMBLOCK_H
|
|
#ifdef __KERNEL__
|
|
|
|
#ifdef CONFIG_HAVE_MEMBLOCK
|
|
/*
|
|
* Logical memory blocks.
|
|
*
|
|
* Copyright (C) 2001 Peter Bergner, IBM Corp.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
#include <linux/init.h>
|
|
#include <linux/mm.h>
|
|
|
|
#define INIT_MEMBLOCK_REGIONS 128
|
|
#define INIT_PHYSMEM_REGIONS 4
|
|
|
|
/*
 * Attribute bits for a memblock region; they are tested against
 * memblock_region.flags by the memblock_is_*() helpers.
 */
enum {
	MEMBLOCK_NONE		= 0,		/* no special request */
	MEMBLOCK_HOTPLUG	= 1 << 0,	/* hotpluggable region */
	MEMBLOCK_MIRROR		= 1 << 1,	/* mirrored region */
};
|
|
|
|
struct memblock_region {
|
|
phys_addr_t base;
|
|
phys_addr_t size;
|
|
unsigned long flags;
|
|
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
|
int nid;
|
|
#endif
|
|
};
|
|
|
|
struct memblock_type {
|
|
unsigned long cnt; /* number of regions */
|
|
unsigned long max; /* size of the allocated array */
|
|
phys_addr_t total_size; /* size of all regions */
|
|
struct memblock_region *regions;
|
|
};
|
|
|
|
struct memblock {
|
|
bool bottom_up; /* is bottom up direction? */
|
|
phys_addr_t current_limit;
|
|
struct memblock_type memory;
|
|
struct memblock_type reserved;
|
|
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
|
|
struct memblock_type physmem;
|
|
#endif
|
|
};
|
|
|
|
extern struct memblock memblock;
|
|
extern int memblock_debug;
|
|
#ifdef CONFIG_MOVABLE_NODE
|
|
/* If movable_node boot option specified */
|
|
extern bool movable_node_enabled;
|
|
#endif /* CONFIG_MOVABLE_NODE */
|
|
|
|
/*
 * memblock_dbg - print a KERN_INFO message when memblock_debug is set.
 * @fmt: printk-style format string (passed through pr_fmt())
 *
 * The body is wrapped in do { } while (0) so that the expansion is one
 * complete statement.  A bare "if" would swallow the "else" branch of an
 * enclosing if/else that is written without braces.
 */
#define memblock_dbg(fmt, ...)						\
	do {								\
		if (memblock_debug)					\
			printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__);	\
	} while (0)
|
|
|
|
phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
|
|
phys_addr_t start, phys_addr_t end,
|
|
int nid, ulong flags);
|
|
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
|
|
phys_addr_t size, phys_addr_t align);
|
|
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
|
|
phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
|
|
void memblock_allow_resize(void);
|
|
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
|
|
int memblock_add(phys_addr_t base, phys_addr_t size);
|
|
int memblock_remove(phys_addr_t base, phys_addr_t size);
|
|
int memblock_free(phys_addr_t base, phys_addr_t size);
|
|
int memblock_reserve(phys_addr_t base, phys_addr_t size);
|
|
void memblock_trim_memory(phys_addr_t align);
|
|
bool memblock_overlaps_region(struct memblock_type *type,
|
|
phys_addr_t base, phys_addr_t size);
|
|
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
|
|
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
|
|
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
|
|
ulong choose_memblock_flags(void);
|
|
|
|
/* Low level functions */
|
|
int memblock_add_range(struct memblock_type *type,
|
|
phys_addr_t base, phys_addr_t size,
|
|
int nid, unsigned long flags);
|
|
|
|
int memblock_remove_range(struct memblock_type *type,
|
|
phys_addr_t base,
|
|
phys_addr_t size);
|
|
|
|
void __next_mem_range(u64 *idx, int nid, ulong flags,
|
|
struct memblock_type *type_a,
|
|
struct memblock_type *type_b, phys_addr_t *out_start,
|
|
phys_addr_t *out_end, int *out_nid);
|
|
|
|
void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
|
|
struct memblock_type *type_a,
|
|
struct memblock_type *type_b, phys_addr_t *out_start,
|
|
phys_addr_t *out_end, int *out_nid);
|
|
|
|
void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
|
|
phys_addr_t *out_end);
|
|
|
|
/**
 * for_each_mem_range - walk the areas of type_a that are not part of type_b
 * @i: u64 used as loop variable
 * @type_a: ptr to memblock_type to iterate
 * @type_b: ptr to memblock_type whose areas are excluded from the walk;
 *	    if %NULL, all of @type_a is iterated
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * @i starts at 0 and is advanced by __next_mem_range(); the loop stops
 * once @i equals (u64)ULLONG_MAX.
 */
#define for_each_mem_range(i, type_a, type_b, nid, flags,		\
			   p_start, p_end, p_nid)			\
	for (i = 0,							\
	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
			      p_start, p_end, p_nid);			\
	     i != (u64)ULLONG_MAX;					\
	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
			      p_start, p_end, p_nid))
|
|
|
|
/**
 * for_each_mem_range_rev - reverse walk of the areas of type_a that are
 *			    not part of type_b
 * @i: u64 used as loop variable
 * @type_a: ptr to memblock_type to iterate
 * @type_b: ptr to memblock_type whose areas are excluded from the walk;
 *	    if %NULL, all of @type_a is iterated
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * @i starts at (u64)ULLONG_MAX and is advanced by __next_mem_range_rev();
 * the loop stops once @i is (u64)ULLONG_MAX again after an advance.
 */
#define for_each_mem_range_rev(i, type_a, type_b, nid, flags,		\
			       p_start, p_end, p_nid)			\
	for (i = (u64)ULLONG_MAX,					\
	     __next_mem_range_rev(&i, nid, flags, type_a, type_b,	\
				  p_start, p_end, p_nid);		\
	     i != (u64)ULLONG_MAX;					\
	     __next_mem_range_rev(&i, nid, flags, type_a, type_b,	\
				  p_start, p_end, p_nid))
|
|
|
|
/**
 * for_each_reserved_mem_region - iterate over all reserved memblock areas
 * @i: u64 used as loop variable
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 *
 * Walks over reserved areas of memblock. Available as soon as memblock
 * is initialized.  Iteration is driven by __next_reserved_mem_region()
 * and ends when @i equals (u64)ULLONG_MAX.
 */
#define for_each_reserved_mem_region(i, p_start, p_end)		\
	for (i = 0UL,							\
	     __next_reserved_mem_region(&i, p_start, p_end);		\
	     i != (u64)ULLONG_MAX;					\
	     __next_reserved_mem_region(&i, p_start, p_end))
|
|
|
|
#ifdef CONFIG_MOVABLE_NODE
/* True when region @m has the MEMBLOCK_HOTPLUG bit set in its flags. */
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
	return (m->flags & MEMBLOCK_HOTPLUG) != 0;
}

/* Report the current value of movable_node_enabled. */
static inline bool movable_node_is_enabled(void)
{
	return movable_node_enabled;
}
#else
/* Without CONFIG_MOVABLE_NODE both queries are constant false. */
static inline bool memblock_is_hotpluggable(struct memblock_region *m) { return false; }
static inline bool movable_node_is_enabled(void) { return false; }
#endif /* CONFIG_MOVABLE_NODE */
|
|
|
|
static inline bool memblock_is_mirror(struct memblock_region *m)
|
|
{
|
|
return m->flags & MEMBLOCK_MIRROR;
|
|
}
|
|
|
|
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
|
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
|
|
unsigned long *end_pfn);
|
|
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
|
|
unsigned long *out_end_pfn, int *out_nid);
|
|
|
|
/**
 * for_each_mem_pfn_range - early memory pfn range iterator
 * @i: an integer used as loop variable
 * @nid: node selector, %MAX_NUMNODES for all nodes
 * @p_start: ptr to ulong for start pfn of the range, can be %NULL
 * @p_end: ptr to ulong for end pfn of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * Walks over configured memory ranges.  @i starts at -1 and is advanced
 * by __next_mem_pfn_range(); the walk continues while @i is >= 0.
 */
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)		\
	for (i = -1,							\
	     __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid);	\
	     i >= 0;							\
	     __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
|
|
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
|
|
|
/**
 * for_each_free_mem_range - iterate through free memblock areas
 * @i: u64 used as loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * Walks over free (memory && !reserved) areas of memblock, i.e. runs
 * for_each_mem_range() over memblock.memory with memblock.reserved
 * excluded.  Available as soon as memblock is initialized.
 */
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)	\
	for_each_mem_range(i, &memblock.memory, &memblock.reserved,	\
			   nid, flags, p_start, p_end, p_nid)
|
|
|
|
/**
 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
 * @i: u64 used as loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * Walks over free (memory && !reserved) areas of memblock in reverse
 * order, i.e. runs for_each_mem_range_rev() over memblock.memory with
 * memblock.reserved excluded.  Available as soon as memblock is
 * initialized.
 */
#define for_each_free_mem_range_reverse(i, nid, flags,			\
					p_start, p_end, p_nid)		\
	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,\
			       nid, flags, p_start, p_end, p_nid)
|
|
|
|
static inline void memblock_set_region_flags(struct memblock_region *r,
|
|
unsigned long flags)
|
|
{
|
|
r->flags |= flags;
|
|
}
|
|
|
|
static inline void memblock_clear_region_flags(struct memblock_region *r,
|
|
unsigned long flags)
|
|
{
|
|
r->flags &= ~flags;
|
|
}
|
|
|
|
#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
 * Regions carry no node id in this configuration: storing one is a
 * no-op and reading one always yields 0.
 */
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
}

static inline int memblock_get_region_node(const struct memblock_region *r)
{
	return 0;
}
#else
int memblock_set_node(phys_addr_t base, phys_addr_t size,
		      struct memblock_type *type, int nid);

/* Store @nid as the node id of region @r. */
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
	r->nid = nid;
}

/* Return the node id stored in region @r. */
static inline int memblock_get_region_node(const struct memblock_region *r)
{
	return r->nid;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
|
|
|
phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
|
|
phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
|
|
|
|
phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
|
|
|
|
#ifdef CONFIG_MOVABLE_NODE
|
|
/*
|
|
* Set the allocation direction to bottom-up or top-down.
|
|
*/
|
|
static inline void __init memblock_set_bottom_up(bool enable)
|
|
{
|
|
memblock.bottom_up = enable;
|
|
}
|
|
|
|
/*
|
|
* Check if the allocation direction is bottom-up or not.
|
|
* if this is true, that said, memblock will allocate memory
|
|
* in bottom-up direction.
|
|
*/
|
|
static inline bool memblock_bottom_up(void)
|
|
{
|
|
return memblock.bottom_up;
|
|
}
|
|
#else
|
|
static inline void __init memblock_set_bottom_up(bool enable) {}
|
|
static inline bool memblock_bottom_up(void) { return false; }
|
|
#endif
|
|
|
|
/* Flags for memblock_alloc_base() and __memblock_alloc_base() */
|
|
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
|
|
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
|
|
|
|
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
|
|
phys_addr_t start, phys_addr_t end,
|
|
ulong flags);
|
|
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
|
|
phys_addr_t max_addr);
|
|
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
|
|
phys_addr_t max_addr);
|
|
phys_addr_t memblock_phys_mem_size(void);
|
|
phys_addr_t memblock_mem_size(unsigned long limit_pfn);
|
|
phys_addr_t memblock_start_of_DRAM(void);
|
|
phys_addr_t memblock_end_of_DRAM(void);
|
|
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
|
|
int memblock_is_memory(phys_addr_t addr);
|
|
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
|
|
int memblock_is_reserved(phys_addr_t addr);
|
|
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
|
|
|
|
extern void __memblock_dump_all(void);
|
|
|
|
static inline void memblock_dump_all(void)
|
|
{
|
|
if (memblock_debug)
|
|
__memblock_dump_all();
|
|
}
|
|
|
|
/**
|
|
* memblock_set_current_limit - Set the current allocation limit to allow
|
|
* limiting allocations to what is currently
|
|
* accessible during boot
|
|
* @limit: New limit value (physical address)
|
|
*/
|
|
void memblock_set_current_limit(phys_addr_t limit);
|
|
|
|
|
|
phys_addr_t memblock_get_current_limit(void);
|
|
|
|
/*
|
|
* pfn conversion functions
|
|
*
|
|
* While the memory MEMBLOCKs should always be page aligned, the reserved
|
|
* MEMBLOCKs may not be. These accessors attempt to provide a very clear
|
|
* idea of what they return for such non aligned MEMBLOCKs.
|
|
*/
|
|
|
|
/**
|
|
* memblock_region_memory_base_pfn - Return the lowest pfn intersecting with the memory region
|
|
* @reg: memblock_region structure
|
|
*/
|
|
static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
|
|
{
|
|
return PFN_UP(reg->base);
|
|
}
|
|
|
|
/**
|
|
* memblock_region_memory_end_pfn - Return the end_pfn this region
|
|
* @reg: memblock_region structure
|
|
*/
|
|
static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
|
|
{
|
|
return PFN_DOWN(reg->base + reg->size);
|
|
}
|
|
|
|
/**
|
|
* memblock_region_reserved_base_pfn - Return the lowest pfn intersecting with the reserved region
|
|
* @reg: memblock_region structure
|
|
*/
|
|
static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
|
|
{
|
|
return PFN_DOWN(reg->base);
|
|
}
|
|
|
|
/**
|
|
* memblock_region_reserved_end_pfn - Return the end_pfn this region
|
|
* @reg: memblock_region structure
|
|
*/
|
|
static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
|
|
{
|
|
return PFN_UP(reg->base + reg->size);
|
|
}
|
|
|
|
/*
 * for_each_memblock - visit every region of one memblock type
 * @memblock_type: name of a struct memblock member (e.g. memory, reserved)
 * @region: pointer used as the loop cursor
 *
 * Walks memblock.<memblock_type>.regions from the first entry up to,
 * but not including, entry number memblock.<memblock_type>.cnt.
 */
#define for_each_memblock(memblock_type, region)			\
	for (region = memblock.memblock_type.regions;			\
	     region < (memblock.memblock_type.regions +			\
		       memblock.memblock_type.cnt);			\
	     region++)
|
|
|
|
|
|
/*
 * Annotations for memblock code and data: they expand to
 * __meminit/__meminitdata when CONFIG_ARCH_DISCARD_MEMBLOCK is set and
 * to nothing otherwise.
 */
#ifndef CONFIG_ARCH_DISCARD_MEMBLOCK
#define __init_memblock
#define __initdata_memblock
#else
#define __init_memblock		__meminit
#define __initdata_memblock	__meminitdata
#endif
|
|
|
|
#ifdef CONFIG_MEMTEST
|
|
extern void early_memtest(phys_addr_t start, phys_addr_t end);
|
|
#else
|
|
static inline void early_memtest(phys_addr_t start, phys_addr_t end)
|
|
{
|
|
}
|
|
#endif
|
|
|
|
#else
|
|
/* Stub used when CONFIG_HAVE_MEMBLOCK is not set: always returns 0. */
static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
{
	return (phys_addr_t)0;
}
|
|
|
|
#endif /* CONFIG_HAVE_MEMBLOCK */
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _LINUX_MEMBLOCK_H */
|