diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ffa037f28d39..55728e121473 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -34,15 +34,15 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
  * 64Gb / 4096bytes/page = 16777216 pages
  */
 #define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 1024
-#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+#define MAX_SECTIONS 1024
+#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)
 
 extern s8 physnode_map[];
 
 static inline int pfn_to_nid(unsigned long pfn)
 {
 #ifdef CONFIG_NUMA
-	return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+	return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
 #else
 	return 0;
 #endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f5510d889a22..fbeaaf416610 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -496,6 +496,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
 
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
+	unsigned long uninitialized_var(pfn_align);
 	int i, nid;
 
 	/* Account for nodes with cpus and no memory */
@@ -511,6 +512,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 	/* for out of order entries */
 	sort_node_map();
+
+	/*
+	 * If sections array is gonna be used for pfn -> nid mapping, check
+	 * whether its granularity is fine enough.
+	 */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	pfn_align = node_map_pfn_alignment();
+	if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+		printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+		       PFN_PHYS(pfn_align) >> 20,
+		       PFN_PHYS(PAGES_PER_SECTION) >> 20);
+		return -EINVAL;
+	}
+#endif
 	if (!numa_meminfo_cover_memory(mi))
 		return -EINVAL;
 
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 849a975d3fa0..3adebe7e536a 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -41,7 +41,7 @@
  *     physnode_map[16-31] = 1;
  *     physnode_map[32- ] = -1;
  */
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
 
 void memory_present(int nid, unsigned long start, unsigned long end)
@@ -52,8 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 			nid, start, end);
 	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
 	printk(KERN_DEBUG "  ");
-	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
-		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+		physnode_map[pfn / PAGES_PER_SECTION] = nid;
 		printk(KERN_CONT "%lx ", pfn);
 	}
 	printk(KERN_CONT "\n");
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71d7be9..c70a326b8f26 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1313,6 +1313,7 @@ extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 void sort_node_map(void);
+unsigned long node_map_pfn_alignment(void);
 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..9119faae6e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,60 @@ void __init sort_node_map(void)
 			cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	int last_nid = -1;
+	int i;
+
+	for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+		int nid = early_node_map[i].nid;
+		unsigned long start = early_node_map[i].start_pfn;
+		unsigned long end = early_node_map[i].end_pfn;
+		unsigned long mask;
+
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 */
+		mask = ~((1 << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
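
For illustration, below is a small stand-alone user-space sketch of the mask walk that node_map_pfn_alignment() performs, run on a hypothetical two-node layout.  It is not part of the patch: the example_range table, the 4KiB PAGE_SHIFT, and the use of __builtin_ctzl() in place of the kernel's __ffs() are assumptions made only for the sketch.  Two 1GiB nodes whose shared boundary sits 256MiB past a 1GiB boundary should report a 256MiB (0x10000 pfn) alignment, matching the "shifted by 256MiB" case described in the comment above.

/*
 * User-space sketch only: mirrors the loop in node_map_pfn_alignment()
 * above on a made-up layout.  Build with: cc -o pfn_align pfn_align.c
 */
#include <stdio.h>

#define PAGE_SHIFT	12		/* assume 4KiB pages */

struct example_range {			/* hypothetical stand-in for early_node_map[] */
	int nid;
	unsigned long start_pfn;
	unsigned long end_pfn;
};

/* Two 1GiB nodes, both shifted 256MiB past a 1GiB boundary. */
static const struct example_range ranges[] = {
	{ 0, 0x10000, 0x50000 },	/* node 0: 256MiB  - 1.25GiB */
	{ 1, 0x50000, 0x90000 },	/* node 1: 1.25GiB - 2.25GiB */
};

static unsigned long example_pfn_alignment(void)
{
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1;
	unsigned int i;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		unsigned long start = ranges[i].start_pfn;
		unsigned long end = ranges[i].end_pfn;
		unsigned long mask;
		int nid = ranges[i].nid;

		if (!start || last_nid < 0 || last_nid == nid) {
			last_nid = nid;
			last_end = end;
			continue;
		}

		/*
		 * Same walk as the patch: start from the finest mask that
		 * pin-points start, then coarsen it while the previous
		 * node's end still rounds below the current node's start.
		 * __builtin_ctzl() substitutes for the kernel's __ffs().
		 */
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	/* convert accumulated mask to number of pages */
	return ~accl_mask + 1;
}

int main(void)
{
	unsigned long align = example_pfn_alignment();

	printf("alignment: %lu pfns (%lu MiB)\n",
	       align, (align << PAGE_SHIFT) >> 20);
	return 0;
}

For the layout above this prints "alignment: 65536 pfns (256 MiB)".  With the 32-bit physnode_map in this patch, PAGES_PER_SECTION is MAX_NR_PAGES/MAX_SECTIONS = 16777216/1024 = 16384 pfns, i.e. 64MiB per array entry, so a 256MiB internode alignment would pass the new pfn_align < PAGES_PER_SECTION check, while nodes interleaved on sub-64MiB boundaries would now be rejected with -EINVAL instead of silently mapping some pfns to the wrong node.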