linux-stable/include/linux/memblock.h
Tang Chen 95cf82ecc1 mem-hotplug: handle node hole when initializing numa_meminfo.
When parsing SRAT, all memory ranges are added into numa_meminfo.  In
numa_init(), before entering numa_cleanup_meminfo(), all possible memory
ranges are in numa_meminfo.  And numa_cleanup_meminfo() removes all
ranges over max_pfn or empty.

But, this only works if the nodes are continuous.  Let's have a look at
the following example:

We have an SRAT like this:
SRAT: Node 0 PXM 0 [mem 0x00000000-0x5fffffff]
SRAT: Node 0 PXM 0 [mem 0x100000000-0x1ffffffffff]
SRAT: Node 1 PXM 1 [mem 0x20000000000-0x3ffffffffff]
SRAT: Node 4 PXM 2 [mem 0x40000000000-0x5ffffffffff] hotplug
SRAT: Node 5 PXM 3 [mem 0x60000000000-0x7ffffffffff] hotplug
SRAT: Node 2 PXM 4 [mem 0x80000000000-0x9ffffffffff] hotplug
SRAT: Node 3 PXM 5 [mem 0xa0000000000-0xbffffffffff] hotplug
SRAT: Node 6 PXM 6 [mem 0xc0000000000-0xdffffffffff] hotplug
SRAT: Node 7 PXM 7 [mem 0xe0000000000-0xfffffffffff] hotplug

On boot, only node 0,1,2,3 exist.

And the numa_meminfo will look like this:
numa_meminfo.nr_blks = 9
1. on node 0: [0, 60000000]
2. on node 0: [100000000, 20000000000]
3. on node 1: [20000000000, 40000000000]
4. on node 4: [40000000000, 60000000000]
5. on node 5: [60000000000, 80000000000]
6. on node 2: [80000000000, a0000000000]
7. on node 3: [a0000000000, a0800000000]
8. on node 6: [c0000000000, a0800000000]
9. on node 7: [e0000000000, a0800000000]

And numa_cleanup_meminfo() will merge 1 and 2, and remove 8,9 because the
end address is over max_pfn, which is a0800000000.  But 4 and 5 are not
removed because their end addresses are less then max_pfn.  But in fact,
node 4 and 5 don't exist.

In a word, numa_cleanup_meminfo() is not able to handle holes between nodes.

Since memory ranges in node 4 and 5 are in numa_meminfo, in
numa_register_memblks(), node 4 and 5 will be mistakenly set to online.

If you run lscpu, it will show:
NUMA node0 CPU(s):     0-14,128-142
NUMA node1 CPU(s):     15-29,143-157
NUMA node2 CPU(s):
NUMA node3 CPU(s):
NUMA node4 CPU(s):     62-76,190-204
NUMA node5 CPU(s):     78-92,206-220

In this patch, we use memblock_overlaps_region() to check if ranges in
numa_meminfo overlap with ranges in memory_block.  Since memory_block
contains all available memory at boot time, if they overlap, it means the
ranges exist.  If not, then remove them from numa_meminfo.

After this patch, lscpu will show:
NUMA node0 CPU(s):     0-14,128-142
NUMA node1 CPU(s):     15-29,143-157
NUMA node4 CPU(s):     62-76,190-204
NUMA node5 CPU(s):     78-92,206-220

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-08 15:35:28 -07:00

425 lines
14 KiB
C

#ifndef _LINUX_MEMBLOCK_H
#define _LINUX_MEMBLOCK_H
#ifdef __KERNEL__
#ifdef CONFIG_HAVE_MEMBLOCK
/*
* Logical memory blocks.
*
* Copyright (C) 2001 Peter Bergner, IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/init.h>
#include <linux/mm.h>
#define INIT_MEMBLOCK_REGIONS 128
#define INIT_PHYSMEM_REGIONS 4
/* Definition of memblock flags. */
enum {
MEMBLOCK_NONE = 0x0, /* No special request */
MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
MEMBLOCK_MIRROR = 0x2, /* mirrored region */
};
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
unsigned long flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
};
struct memblock_type {
unsigned long cnt; /* number of regions */
unsigned long max; /* size of the allocated array */
phys_addr_t total_size; /* size of all regions */
struct memblock_region *regions;
};
struct memblock {
bool bottom_up; /* is bottom up direction? */
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
struct memblock_type physmem;
#endif
};
extern struct memblock memblock;
extern int memblock_debug;
#ifdef CONFIG_MOVABLE_NODE
/* If movable_node boot option specified */
extern bool movable_node_enabled;
#endif /* CONFIG_MOVABLE_NODE */
#define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
int nid, ulong flags);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
bool memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
ulong choose_memblock_flags(void);
/* Low level functions */
int memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, unsigned long flags);
int memblock_remove_range(struct memblock_type *type,
phys_addr_t base,
phys_addr_t size);
void __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
/**
* for_each_mem_range - iterate through memblock areas from type_a and not
* included in type_b. Or just type_a if type_b is NULL.
* @i: u64 used as loop variable
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
#define for_each_mem_range(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
* for_each_mem_range_rev - reverse iterate through memblock areas from
* type_a and not included in type_b. Or just type_a if type_b is NULL.
* @i: u64 used as loop variable
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
__next_mem_range_rev(&i, nid, flags, type_a, type_b,\
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range_rev(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
* for_each_reserved_mem_region - iterate over all reserved memblock areas
* @i: u64 used as loop variable
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
*
* Walks over reserved areas of memblock. Available as soon as memblock
* is initialized.
*/
#define for_each_reserved_mem_region(i, p_start, p_end) \
for (i = 0UL, \
__next_reserved_mem_region(&i, p_start, p_end); \
i != (u64)ULLONG_MAX; \
__next_reserved_mem_region(&i, p_start, p_end))
#ifdef CONFIG_MOVABLE_NODE
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
return m->flags & MEMBLOCK_HOTPLUG;
}
static inline bool movable_node_is_enabled(void)
{
return movable_node_enabled;
}
#else
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
return false;
}
static inline bool movable_node_is_enabled(void)
{
return false;
}
#endif
static inline bool memblock_is_mirror(struct memblock_region *m)
{
return m->flags & MEMBLOCK_MIRROR;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
unsigned long *out_end_pfn, int *out_nid);
/**
* for_each_mem_pfn_range - early memory pfn range iterator
* @i: an integer used as loop variable
* @nid: node selector, %MAX_NUMNODES for all nodes
* @p_start: ptr to ulong for start pfn of the range, can be %NULL
* @p_end: ptr to ulong for end pfn of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over configured memory ranges.
*/
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
* @flags: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*/
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
* @flags: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock in reverse
* order. Available as soon as memblock is initialized.
*/
#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
p_nid) \
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
static inline void memblock_set_region_flags(struct memblock_region *r,
unsigned long flags)
{
r->flags |= flags;
}
static inline void memblock_clear_region_flags(struct memblock_region *r,
unsigned long flags)
{
r->flags &= ~flags;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid);
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
r->nid = nid;
}
static inline int memblock_get_region_node(const struct memblock_region *r)
{
return r->nid;
}
#else
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
}
static inline int memblock_get_region_node(const struct memblock_region *r)
{
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
#ifdef CONFIG_MOVABLE_NODE
/*
* Set the allocation direction to bottom-up or top-down.
*/
static inline void __init memblock_set_bottom_up(bool enable)
{
memblock.bottom_up = enable;
}
/*
* Check if the allocation direction is bottom-up or not.
* if this is true, that said, memblock will allocate memory
* in bottom-up direction.
*/
static inline bool memblock_bottom_up(void)
{
return memblock.bottom_up;
}
#else
static inline void __init memblock_set_bottom_up(bool enable) {}
static inline bool memblock_bottom_up(void) { return false; }
#endif
/* Flags for memblock_alloc_base() amd __memblock_alloc_base() */
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
ulong flags);
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t memblock_phys_mem_size(void);
phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
int memblock_is_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
int memblock_is_reserved(phys_addr_t addr);
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
static inline void memblock_dump_all(void)
{
if (memblock_debug)
__memblock_dump_all();
}
/**
* memblock_set_current_limit - Set the current allocation limit to allow
* limiting allocations to what is currently
* accessible during boot
* @limit: New limit value (physical address)
*/
void memblock_set_current_limit(phys_addr_t limit);
phys_addr_t memblock_get_current_limit(void);
/*
* pfn conversion functions
*
* While the memory MEMBLOCKs should always be page aligned, the reserved
* MEMBLOCKs may not be. This accessor attempt to provide a very clear
* idea of what they return for such non aligned MEMBLOCKs.
*/
/**
* memblock_region_memory_base_pfn - Return the lowest pfn intersecting with the memory region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
{
return PFN_UP(reg->base);
}
/**
* memblock_region_memory_end_pfn - Return the end_pfn this region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
{
return PFN_DOWN(reg->base + reg->size);
}
/**
* memblock_region_reserved_base_pfn - Return the lowest pfn intersecting with the reserved region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
{
return PFN_DOWN(reg->base);
}
/**
* memblock_region_reserved_end_pfn - Return the end_pfn this region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
{
return PFN_UP(reg->base + reg->size);
}
#define for_each_memblock(memblock_type, region) \
for (region = memblock.memblock_type.regions; \
region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
region++)
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
#define __init_memblock __meminit
#define __initdata_memblock __meminitdata
#else
#define __init_memblock
#define __initdata_memblock
#endif
#ifdef CONFIG_MEMTEST
extern void early_memtest(phys_addr_t start, phys_addr_t end);
#else
static inline void early_memtest(phys_addr_t start, phys_addr_t end)
{
}
#endif
#else
static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
{
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK */
#endif /* __KERNEL__ */
#endif /* _LINUX_MEMBLOCK_H */