diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 38c3957e0b40..cc5664286a1c 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -47,6 +47,14 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK
 config SYS_SUPPORTS_HUGETLBFS
         def_bool y
 
+# Support for additional huge page sizes besides HPAGE_SIZE.
+# The software support is currently only present in the TILE-Gx
+# hypervisor. TILEPro in any case does not support page sizes
+# larger than the default HPAGE_SIZE.
+config HUGETLB_SUPER_PAGES
+        depends on HUGETLB_PAGE && TILEGX
+        def_bool y
+
 config GENERIC_CLOCKEVENTS
         def_bool y
 
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index d396d1805163..b2042380a5aa 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -106,4 +106,25 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+                                       struct page *page, int writable)
+{
+        size_t pagesize = huge_page_size(hstate_vma(vma));
+        if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
+                entry = pte_mksuper(entry);
+        return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
+/* Sizes to scale up page size for PTEs with HV_PTE_SUPER bit. */
+enum {
+        HUGE_SHIFT_PGDIR = 0,
+        HUGE_SHIFT_PMD = 1,
+        HUGE_SHIFT_PAGE = 2,
+        HUGE_SHIFT_ENTRIES
+};
+extern int huge_shift[HUGE_SHIFT_ENTRIES];
+#endif
+
 #endif /* _ASM_TILE_HUGETLB_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index c750943f961e..9d9131e5c552 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -87,8 +87,7 @@ typedef HV_PTE pgprot_t;
 /*
  * User L2 page tables are managed as one L2 page table per page,
  * because we use the page allocator for them.  This keeps the allocation
- * simple and makes it potentially useful to implement HIGHPTE at some point.
- * However, it's also inefficient, since L2 page tables are much smaller
+ * simple, but it's also inefficient, since L2 page tables are much smaller
  * than pages (currently 2KB vs 64KB).  So we should revisit this.
  */
 typedef struct page *pgtable_t;
@@ -137,7 +136,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #define HUGETLB_PAGE_ORDER      (HPAGE_SHIFT - PAGE_SHIFT)
 
-#define HUGE_MAX_HSTATE         2
+#define HUGE_MAX_HSTATE         6
 
 #ifdef CONFIG_HUGETLB_PAGE
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
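For a concrete sense of arch_make_huge_pte() above: the SUPER bit is set for
exactly those hstate sizes that are neither PMD_SIZE nor PUD_SIZE.  The
host-side sketch below is illustration only, not part of the patch; it assumes
the usual TILE-Gx geometry of 64 KB base pages, a 16 MB PMD span, and a 4 GB
PUD span (the kernel derives the real values from PAGE_SHIFT/PMD_SHIFT/PUD_SHIFT):

    /* Which huge page sizes get HV_PTE_SUPER, under the assumed geometry. */
    #include <stdio.h>

    #define PMD_SIZE (16UL << 20)   /* assumed: 16 MB */
    #define PUD_SIZE (4UL << 30)    /* assumed: 4 GB */

    int main(void)
    {
            unsigned long sizes[] = {
                    256UL << 10, 1UL << 20, 4UL << 20,  /* sub-PMD super sizes */
                    16UL << 20,                         /* default HPAGE_SIZE */
                    64UL << 20, 256UL << 20, 1UL << 30, /* PMD-level super */
                    4UL << 30, 16UL << 30,              /* jumbo, L0 super */
            };
            size_t i;

            for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                    printf("%10lu KB: %s\n", sizes[i] >> 10,
                           (sizes[i] == PMD_SIZE || sizes[i] == PUD_SIZE) ?
                           "plain huge PTE" : "huge PTE | HV_PTE_SUPER");
            return 0;
    }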
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 319f4826d972..73b1a4c9ad03 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -71,6 +71,7 @@ extern void set_page_homes(void);
 
 #define _PAGE_PRESENT           HV_PTE_PRESENT
 #define _PAGE_HUGE_PAGE         HV_PTE_PAGE
+#define _PAGE_SUPER_PAGE        HV_PTE_SUPER
 #define _PAGE_READABLE          HV_PTE_READABLE
 #define _PAGE_WRITABLE          HV_PTE_WRITABLE
 #define _PAGE_EXECUTABLE        HV_PTE_EXECUTABLE
@@ -87,6 +88,7 @@ extern void set_page_homes(void);
 #define _PAGE_ALL (\
         _PAGE_PRESENT | \
         _PAGE_HUGE_PAGE | \
+        _PAGE_SUPER_PAGE | \
         _PAGE_READABLE | \
         _PAGE_WRITABLE | \
         _PAGE_EXECUTABLE | \
@@ -197,6 +199,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_write hv_pte_get_writable
 #define pte_exec hv_pte_get_executable
 #define pte_huge hv_pte_get_page
+#define pte_super hv_pte_get_super
 #define pte_rdprotect hv_pte_clear_readable
 #define pte_exprotect hv_pte_clear_executable
 #define pte_mkclean hv_pte_clear_dirty
@@ -209,6 +212,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_mkyoung hv_pte_set_accessed
 #define pte_mkwrite hv_pte_set_writable
 #define pte_mkhuge hv_pte_set_page
+#define pte_mksuper hv_pte_set_super
 
 #define pte_special(pte) 0
 #define pte_mkspecial(pte) (pte)
@@ -338,13 +342,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
-#if defined(CONFIG_HIGHPTE)
-extern pte_t *pte_offset_map(pmd_t *, unsigned long address);
-#define pte_unmap(pte) kunmap_atomic(pte)
-#else
 #define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
 #define pte_unmap(pte) do { } while (0)
-#endif
 
 /* Clear a non-executable kernel PTE and flush it from the TLB. */
 #define kpte_clear_flush(ptep, vaddr)                                   \
@@ -537,7 +536,8 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 
 /* Support /proc/NN/pgtable API. */
 struct seq_file;
 int arch_proc_pgtable_show(struct seq_file *m, struct mm_struct *mm,
-                           unsigned long vaddr, pte_t *ptep, void **datap);
+                           unsigned long vaddr, unsigned long pagesize,
+                           pte_t *ptep, void **datap);
 
 #endif /* !__ASSEMBLY__ */
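A note on why _PAGE_SUPER_PAGE is added to _PAGE_ALL: pte_modify() (whose
context appears in the hunk above) preserves exactly the bits in _PAGE_ALL
when a mapping's protections change, so without this line an mprotect() on a
super-page mapping would silently drop the SUPER bit.  A reduced sketch of
that masking (names simplified, not the kernel's exact code):

    #include <assert.h>

    typedef unsigned long pteval_t;

    #define _PAGE_SUPER_PAGE (1UL << 15)  /* HV_PTE_INDEX_SUPER == 15 */
    #define _PAGE_ALL _PAGE_SUPER_PAGE    /* stand-in for the full bit list */

    static pteval_t pte_modify_val(pteval_t val, pteval_t newprot)
    {
            /* keep the arch bits, take the rest from the new protections */
            return (val & _PAGE_ALL) | (newprot & ~_PAGE_ALL);
    }

    int main(void)
    {
            pteval_t pte = _PAGE_SUPER_PAGE | 0x7;  /* super page + prot bits */

            assert(pte_modify_val(pte, 0x5) & _PAGE_SUPER_PAGE);
            return 0;
    }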
diff --git a/arch/tile/include/asm/tlbflush.h b/arch/tile/include/asm/tlbflush.h
index 96199d214fb8..dcf91b25a1e5 100644
--- a/arch/tile/include/asm/tlbflush.h
+++ b/arch/tile/include/asm/tlbflush.h
@@ -38,16 +38,11 @@ DECLARE_PER_CPU(int, current_asid);
 /* The hypervisor tells us what ASIDs are available to us. */
 extern int min_asid, max_asid;
 
-static inline unsigned long hv_page_size(const struct vm_area_struct *vma)
-{
-        return (vma->vm_flags & VM_HUGETLB) ? HPAGE_SIZE : PAGE_SIZE;
-}
-
 /* Pass as vma pointer for non-executable mapping, if no vma available. */
-#define FLUSH_NONEXEC ((const struct vm_area_struct *)-1UL)
+#define FLUSH_NONEXEC ((struct vm_area_struct *)-1UL)
 
 /* Flush a single user page on this cpu. */
-static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
                                         unsigned long addr,
                                         unsigned long page_size)
 {
@@ -60,7 +55,7 @@ static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
 }
 
 /* Flush range of user pages on this cpu. */
-static inline void local_flush_tlb_pages(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_pages(struct vm_area_struct *vma,
                                          unsigned long addr,
                                          unsigned long page_size,
                                          unsigned long len)
@@ -117,10 +112,10 @@ extern void flush_tlb_all(void);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 extern void flush_tlb_current_task(void);
 extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(const struct vm_area_struct *, unsigned long);
-extern void flush_tlb_page_mm(const struct vm_area_struct *,
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_page_mm(struct vm_area_struct *,
                               struct mm_struct *, unsigned long);
-extern void flush_tlb_range(const struct vm_area_struct *,
+extern void flush_tlb_range(struct vm_area_struct *,
                             unsigned long start, unsigned long end);
 
 #define flush_tlb()     flush_tlb_current_task()
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f27871775b7a..85e5cab4c2f0 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -66,6 +66,22 @@
 #define HV_DEFAULT_PAGE_SIZE_LARGE \
   (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_LARGE)
 
+#if CHIP_VA_WIDTH() > 32
+
+/** The log2 of the initial size of jumbo pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_JUMBO.
+ */
+#define HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO  32
+
+/** The initial size of jumbo pages, in bytes.  This value should
+ * be verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO).
+ * It may also be modified when installing a new context.
+ */
+#define HV_DEFAULT_PAGE_SIZE_JUMBO \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO)
+
+#endif
+
 /** The log2 of the granularity at which page tables must be aligned;
  *  in other words, the CPA for a page table must have this many zero
  *  bits at the bottom of the address.
@@ -284,8 +300,11 @@
 #define HV_DISPATCH_GET_IPI_PTE                   56
 
 #endif
 
+/** hv_set_pte_super_shift */
+#define HV_DISPATCH_SET_PTE_SUPER_SHIFT           57
+
 /** One more than the largest dispatch value */
-#define _HV_DISPATCH_END                          57
+#define _HV_DISPATCH_END                          58
 
 #ifndef __ASSEMBLER__
@@ -413,6 +432,11 @@ typedef enum {
    */
   HV_SYSCONF_VALID_PAGE_SIZES     = 7,
 
+  /** The size of jumbo pages, in bytes.
+   * If no jumbo pages are available, zero will be returned.
+   */
+  HV_SYSCONF_PAGE_SIZE_JUMBO      = 8,
+
 } HV_SysconfQuery;
 
 /** Offset to subtract from returned Kelvin temperature to get degrees
@@ -695,6 +719,29 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
 
 #ifndef __ASSEMBLER__
 
+
+/** Set the number of pages ganged together by HV_PTE_SUPER at a
+ * particular level of the page table.
+ *
+ * The current TILE-Gx hardware only supports powers of four
+ * (i.e. log2_count must be a multiple of two), and the requested
+ * "super" page size must be less than the span of the next level in
+ * the page table.  The largest size that can be requested is 64GB.
+ *
+ * The shift value is initially "0" for all page table levels,
+ * indicating that the HV_PTE_SUPER bit is effectively ignored.
+ *
+ * If you change the count from one non-zero value to another, the
+ * hypervisor will flush the entire TLB and TSB to avoid confusion.
+ *
+ * @param level Page table level (0, 1, or 2)
+ * @param log2_count Base-2 log of the number of pages to gang together,
+ * i.e. how much to shift left the base page size for the super page size.
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_set_pte_super_shift(int level, int log2_count);
+
+
 /** Value returned from hv_inquire_context(). */
 typedef struct
 {
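Putting the constraints in the comment above into numbers (illustration only,
not part of the patch; assumes 64 KB base pages, a 16 MB PMD span, and a 4 GB
PUD span): log2_count must be even, base_size << log2_count must stay below
the next level's span, and nothing above 64 GB is accepted.  A kernel-context
sketch of legal settings:

    int hv_set_pte_super_shift(int level, int log2_count);  /* declared above */

    void example_configure_super_shifts(void)
    {
            hv_set_pte_super_shift(2, 4);  /* 64 KB << 4 = 1 MB  L2 super pages */
            hv_set_pte_super_shift(1, 2);  /* 16 MB << 2 = 64 MB L1 super pages */
            hv_set_pte_super_shift(0, 2);  /* 4 GB  << 2 = 16 GB L0 super pages */
    }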
@@ -1891,8 +1938,9 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 #define HV_PTE_INDEX_USER            10  /**< Page is user-accessible */
 #define HV_PTE_INDEX_ACCESSED        11  /**< Page has been accessed */
 #define HV_PTE_INDEX_DIRTY           12  /**< Page has been written */
-                                         /* Bits 13-15 are reserved for
+                                         /* Bits 13-14 are reserved for
                                             future use. */
+#define HV_PTE_INDEX_SUPER           15  /**< Pages ganged together for TLB */
 #define HV_PTE_INDEX_MODE            16  /**< Page mode; see HV_PTE_MODE_xxx */
 #define HV_PTE_MODE_BITS              3  /**< Number of bits in mode */
 #define HV_PTE_INDEX_CLIENT2         19  /**< Page client state 2 */
@@ -1987,7 +2035,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 
 /** Does this PTE map a page?
  *
- * If this bit is set in the level-1 page table, the entry should be
+ * If this bit is set in a level-0 page table, the entry should be
+ * interpreted as a level-2 page table entry mapping a jumbo page.
+ *
+ * If this bit is set in a level-1 page table, the entry should be
  * interpreted as a level-2 page table entry mapping a large page.
  *
  * This bit should not be modified by the client while PRESENT is set, as
@@ -1997,6 +2048,18 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  */
 #define HV_PTE_PAGE                  (__HV_PTE_ONE << HV_PTE_INDEX_PAGE)
 
+/** Does this PTE implicitly reference multiple pages?
+ *
+ * If this bit is set in the page table (either in the level-2 page table,
+ * or in a higher level page table in conjunction with the PAGE bit)
+ * then the PTE specifies a range of contiguous pages, not a single page.
+ * hv_set_pte_super_shift() allows you to specify the count for
+ * each level of the page table.
+ *
+ * Note: this bit is not supported on TILEPro systems.
+ */
+#define HV_PTE_SUPER                 (__HV_PTE_ONE << HV_PTE_INDEX_SUPER)
+
 /** Is this a global (non-ASID) mapping?
  *
  * If this bit is set, the translations established by this PTE will
@@ -2215,6 +2278,7 @@ hv_pte_clear_##name(HV_PTE pte)                         \
  */
 _HV_BIT(present,         PRESENT)
 _HV_BIT(page,            PAGE)
+_HV_BIT(super,           SUPER)
 _HV_BIT(client0,         CLIENT0)
 _HV_BIT(client1,         CLIENT1)
 _HV_BIT(client2,         CLIENT2)
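For reference, the accessors generated by the new _HV_BIT(super, SUPER) line
look roughly like this -- a sketch assuming HV_PTE carries its 64-bit value in
a .val field, as the other accessors in this header do; see the real _HV_BIT
macro above for the exact definitions:

    static inline int hv_pte_get_super(HV_PTE pte)
    {
            return (pte.val >> HV_PTE_INDEX_SUPER) & 1;
    }

    static inline HV_PTE hv_pte_set_super(HV_PTE pte)
    {
            pte.val |= HV_PTE_SUPER;
            return pte;
    }

    static inline HV_PTE hv_pte_clear_super(HV_PTE pte)
    {
            pte.val &= ~HV_PTE_SUPER;
            return pte;
    }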
diff --git a/arch/tile/kernel/hvglue.lds b/arch/tile/kernel/hvglue.lds
index 2b7cd0a659a9..d44c5a67a1ed 100644
--- a/arch/tile/kernel/hvglue.lds
+++ b/arch/tile/kernel/hvglue.lds
@@ -55,4 +55,5 @@ hv_store_mapping = TEXT_OFFSET + 0x106a0;
 hv_inquire_realpa = TEXT_OFFSET + 0x106c0;
 hv_flush_all = TEXT_OFFSET + 0x106e0;
 hv_get_ipi_pte = TEXT_OFFSET + 0x10700;
-hv_glue_internals = TEXT_OFFSET + 0x10720;
+hv_set_pte_super_shift = TEXT_OFFSET + 0x10720;
+hv_glue_internals = TEXT_OFFSET + 0x10740;
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index 446a7f52cc11..dafc447b5125 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 32948e21113a..445c220eae51 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -49,9 +50,6 @@ char chip_model[64] __write_once;
 struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-/* We only create bootmem data on node 0. */
-static bootmem_data_t __initdata node0_bdata;
-
 /* Information on the NUMA nodes that we compute early */
 unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES];
 unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES];
@@ -518,37 +516,96 @@ static void __init setup_memory(void)
 #endif
 }
 
-static void __init setup_bootmem_allocator(void)
+/*
+ * On 32-bit machines, we only put bootmem on the low controller,
+ * since PAs > 4GB can't be used in bootmem.  In principle one could
+ * imagine, e.g., multiple 1 GB controllers all of which could support
+ * bootmem, but in practice using controllers this small isn't a
+ * particularly interesting scenario, so we just keep it simple and
+ * use only the first controller for bootmem on 32-bit machines.
+ */
+static inline int node_has_bootmem(int nid)
 {
-        unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn;
-
-        /* Provide a node 0 bdata. */
-        NODE_DATA(0)->bdata = &node0_bdata;
-
-#ifdef CONFIG_PCI
-        /* Don't let boot memory alias the PCI region. */
-        last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn);
+#ifdef CONFIG_64BIT
+        return 1;
 #else
-        last_alloc_pfn = max_low_pfn;
+        return nid == 0;
+#endif
+}
+
+static inline unsigned long alloc_bootmem_pfn(int nid,
+                                              unsigned long size,
+                                              unsigned long goal)
+{
+        void *kva = __alloc_bootmem_node(NODE_DATA(nid), size,
+                                         PAGE_SIZE, goal);
+        unsigned long pfn = kaddr_to_pfn(kva);
+        BUG_ON(goal && PFN_PHYS(pfn) != goal);
+        return pfn;
+}
+
+static void __init setup_bootmem_allocator_node(int i)
+{
+        unsigned long start, end, mapsize, mapstart;
+
+        if (node_has_bootmem(i)) {
+                NODE_DATA(i)->bdata = &bootmem_node_data[i];
+        } else {
+                /* Share controller zero's bdata for now. */
+                NODE_DATA(i)->bdata = &bootmem_node_data[0];
+                return;
+        }
+
+        /* Skip up to after the bss in node 0. */
+        start = (i == 0) ? min_low_pfn : node_start_pfn[i];
+
+        /* Only lowmem, if we're a HIGHMEM build. */
+#ifdef CONFIG_HIGHMEM
+        end = node_lowmem_end_pfn[i];
+#else
+        end = node_end_pfn[i];
 #endif
 
-        /*
-         * Initialize the boot-time allocator (with low memory only):
-         * The first argument says where to put the bitmap, and the
-         * second says where the end of allocatable memory is.
-         */
-        bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn);
+        /* No memory here. */
+        if (end == start)
+                return;
 
-        /*
-         * Let the bootmem allocator use all the space we've given it
-         * except for its own bitmap.
-         */
-        first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size);
-        if (first_alloc_pfn >= last_alloc_pfn)
-                early_panic("Not enough memory on controller 0 for bootmem\n");
+        /* Figure out where the bootmem bitmap is located. */
+        mapsize = bootmem_bootmap_pages(end - start);
+        if (i == 0) {
+                /* Use some space right before the heap on node 0. */
+                mapstart = start;
+                start += mapsize;
+        } else {
+                /* Allocate bitmap on node 0 to avoid page table issues. */
+                mapstart = alloc_bootmem_pfn(0, PFN_PHYS(mapsize), 0);
+        }
 
-        free_bootmem(PFN_PHYS(first_alloc_pfn),
-                     PFN_PHYS(last_alloc_pfn - first_alloc_pfn));
+        /* Initialize a node. */
+        init_bootmem_node(NODE_DATA(i), mapstart, start, end);
+
+        /* Free all the space back into the allocator. */
+        free_bootmem(PFN_PHYS(start), PFN_PHYS(end - start));
+
+#if defined(CONFIG_PCI)
+        /*
+         * Throw away any memory aliased by the PCI region.  FIXME: this
+         * is a temporary hack to work around bug 10502, and needs to be
+         * fixed properly.
+         */
+        if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
+                reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
+                                PFN_PHYS(pci_reserve_end_pfn -
+                                         pci_reserve_start_pfn),
+                                BOOTMEM_EXCLUSIVE);
+#endif
+}
+
+static void __init setup_bootmem_allocator(void)
+{
+        int i;
+        for (i = 0; i < MAX_NUMNODES; ++i)
+                setup_bootmem_allocator_node(i);
 
 #ifdef CONFIG_KEXEC
         if (crashk_res.start != crashk_res.end)
@@ -579,14 +636,6 @@ static int __init percpu_size(void)
         return size;
 }
 
-static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
-{
-        void *kva = __alloc_bootmem(size, PAGE_SIZE, goal);
-        unsigned long pfn = kaddr_to_pfn(kva);
-        BUG_ON(goal && PFN_PHYS(pfn) != goal);
-        return pfn;
-}
-
 static void __init zone_sizes_init(void)
 {
         unsigned long zones_size[MAX_NR_ZONES] = { 0 };
@@ -624,21 +673,22 @@ static void __init zone_sizes_init(void)
                  * though, there'll be no lowmem, so we just alloc_bootmem
                  * the memmap.  There will be no percpu memory either.
                  */
-                if (__pfn_to_highbits(start) == 0) {
-                        /* In low PAs, allocate via bootmem. */
+                if (i != 0 && cpu_isset(i, isolnodes)) {
+                        node_memmap_pfn[i] =
+                                alloc_bootmem_pfn(0, memmap_size, 0);
+                        BUG_ON(node_percpu[i] != 0);
+                } else if (node_has_bootmem(start)) {
                         unsigned long goal = 0;
                         node_memmap_pfn[i] =
-                                alloc_bootmem_pfn(memmap_size, goal);
+                                alloc_bootmem_pfn(i, memmap_size, 0);
                         if (kdata_huge)
                                 goal = PFN_PHYS(lowmem_end) - node_percpu[i];
                         if (node_percpu[i])
                                 node_percpu_pfn[i] =
-                                        alloc_bootmem_pfn(node_percpu[i], goal);
-                } else if (cpu_isset(i, isolnodes)) {
-                        node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0);
-                        BUG_ON(node_percpu[i] != 0);
+                                        alloc_bootmem_pfn(i, node_percpu[i],
+                                                          goal);
                 } else {
-                        /* In high PAs, just reserve some pages. */
+                        /* In non-bootmem zones, just reserve some pages. */
                         node_memmap_pfn[i] = node_free_pfn[i];
                         node_free_pfn[i] += PFN_UP(memmap_size);
                         if (!kdata_huge) {
@@ -662,16 +712,9 @@ static void __init zone_sizes_init(void)
                 zones_size[ZONE_NORMAL] = end - start;
 #endif
 
-                /*
-                 * Everyone shares node 0's bootmem allocator, but
-                 * we use alloc_remap(), above, to put the actual
-                 * struct page array on the individual controllers,
-                 * which is most of the data that we actually care about.
-                 * We can't place bootmem allocators on the other
-                 * controllers since the bootmem allocator can only
-                 * operate on 32-bit physical addresses.
-                 */
-                NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
+                /* Take zone metadata from controller 0 if we're isolnode. */
+                if (node_isset(i, isolnodes))
+                        NODE_DATA(i)->bdata = &bootmem_node_data[0];
 
                 free_area_init_node(i, zones_size, start, NULL);
                 printk(KERN_DEBUG "  Normal zone: %ld per-cpu pages\n",
@@ -854,6 +897,22 @@ subsys_initcall(topology_init);
 
 #endif /* CONFIG_NUMA */
 
+/*
+ * Initialize hugepage support on this cpu.  We do this on all cores
+ * early in boot: before argument parsing for the boot cpu, and after
+ * argument parsing but before the init functions run on the secondaries.
+ * So the values we set up here in the hypervisor may be overridden on
+ * the boot cpu as arguments are parsed.
+ */
+static __cpuinit void init_super_pages(void)
+{
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+        int i;
+        for (i = 0; i < HUGE_SHIFT_ENTRIES; ++i)
+                hv_set_pte_super_shift(i, huge_shift[i]);
+#endif
+}
+
 /**
  * setup_cpu() - Do all necessary per-cpu, tile-specific initialization.
  * @boot: Is this the boot cpu?
@@ -908,6 +967,8 @@ void __cpuinit setup_cpu(int boot)
         /* Reset the network state on this cpu. */
         reset_network_state();
 #endif
+
+        init_super_pages();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
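With init_super_pages() run on every core, an additional page size only has to
be requested once on the boot command line, for instance
"hugepagesz=1m hugepages=512 hugepagesz=64m hugepages=4" (sizes here assume
the 64 KB-page TILE-Gx geometry; hugepagesz=/hugepages= are the standard
hugetlb boot parameters).  Each accepted size then appears as its own hstate
under /sys/kernel/mm/hugepages/.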
diff --git a/arch/tile/kernel/tlb.c b/arch/tile/kernel/tlb.c
index a5f241c24cac..3fd54d5bbd4c 100644
--- a/arch/tile/kernel/tlb.c
+++ b/arch/tile/kernel/tlb.c
@@ -15,6 +15,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -49,25 +50,25 @@ void flush_tlb_current_task(void)
         flush_tlb_mm(current->mm);
 }
 
-void flush_tlb_page_mm(const struct vm_area_struct *vma, struct mm_struct *mm,
+void flush_tlb_page_mm(struct vm_area_struct *vma, struct mm_struct *mm,
                        unsigned long va)
 {
-        unsigned long size = hv_page_size(vma);
+        unsigned long size = vma_kernel_pagesize(vma);
         int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
         flush_remote(0, cache, mm_cpumask(mm),
                      va, size, size, mm_cpumask(mm), NULL, 0);
 }
 
-void flush_tlb_page(const struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
         flush_tlb_page_mm(vma, vma->vm_mm, va);
 }
 EXPORT_SYMBOL(flush_tlb_page);
 
-void flush_tlb_range(const struct vm_area_struct *vma,
+void flush_tlb_range(struct vm_area_struct *vma,
                      unsigned long start, unsigned long end)
 {
-        unsigned long size = hv_page_size(vma);
+        unsigned long size = vma_kernel_pagesize(vma);
         struct mm_struct *mm = vma->vm_mm;
         int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
         flush_remote(0, cache, mm_cpumask(mm), start, end - start, size,
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 22e58f51ed23..54f18fc25ed0 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -187,7 +187,7 @@ static pgd_t *get_current_pgd(void)
         HV_Context ctx = hv_inquire_context();
         unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
         struct page *pgd_page = pfn_to_page(pgd_pfn);
-        BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */
+        BUG_ON(PageHighMem(pgd_page));
         return (pgd_t *) __va(ctx.page_table);
 }
 
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 499f73770b05..dbcbdf7b8aa8 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 42cfcba4e1ef..812e2d037972 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -27,85 +27,161 @@
 #include
 #include
 #include
+#include
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+
+/*
+ * Provide an additional huge page size (in addition to the regular default
+ * huge page size) if no "hugepagesz" arguments are specified.
+ * Note that it must be smaller than the default huge page size so
+ * that it's possible to allocate them on demand from the buddy allocator.
+ * You can change this to 64K (on a 16K build), 256K, 1M, or 4M,
+ * or not define it at all.
+ */
+#define ADDITIONAL_HUGE_SIZE (1024 * 1024UL)
+
+/* "Extra" page-size multipliers, one per level of the page table. */
+int huge_shift[HUGE_SHIFT_ENTRIES] = {
+#ifdef ADDITIONAL_HUGE_SIZE
+#define ADDITIONAL_HUGE_SHIFT __builtin_ctzl(ADDITIONAL_HUGE_SIZE / PAGE_SIZE)
+        [HUGE_SHIFT_PAGE] = ADDITIONAL_HUGE_SHIFT
+#endif
+};
+
+/*
+ * This routine is a hybrid of pte_alloc_map() and pte_alloc_kernel().
+ * It assumes that L2 PTEs are never in HIGHMEM (we don't support that).
+ * It locks the user pagetable, and bumps up the mm->nr_ptes field,
+ * but otherwise allocates the page table using the kernel versions.
+ */
+static pte_t *pte_alloc_hugetlb(struct mm_struct *mm, pmd_t *pmd,
+                                unsigned long address)
+{
+        pte_t *new;
+
+        if (pmd_none(*pmd)) {
+                new = pte_alloc_one_kernel(mm, address);
+                if (!new)
+                        return NULL;
+
+                smp_wmb(); /* See comment in __pte_alloc */
+
+                spin_lock(&mm->page_table_lock);
+                if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+                        mm->nr_ptes++;
+                        pmd_populate_kernel(mm, pmd, new);
+                        new = NULL;
+                } else
+                        VM_BUG_ON(pmd_trans_splitting(*pmd));
+                spin_unlock(&mm->page_table_lock);
+                if (new)
+                        pte_free_kernel(mm, new);
+        }
+
+        return pte_offset_kernel(pmd, address);
+}
+#endif
 
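A quick check of the default's arithmetic (assuming 64 KB base pages):
ADDITIONAL_HUGE_SIZE of 1 MB spans 1 MB / 64 KB = 16 base pages, so the
initializer computes __builtin_ctzl(16) == 4 and level-2 PTEs are ganged in
groups of 16.  A host-side check, not part of the patch:

    #include <assert.h>

    int main(void)
    {
            unsigned long page_size = 64 * 1024;     /* assumed PAGE_SIZE */
            unsigned long additional = 1024 * 1024;  /* ADDITIONAL_HUGE_SIZE */

            assert(__builtin_ctzl(additional / page_size) == 4);
            return 0;
    }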
 pte_t *huge_pte_alloc(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz)
 {
         pgd_t *pgd;
         pud_t *pud;
-        pte_t *pte = NULL;
 
-        /* We do not yet support multiple huge page sizes. */
-        BUG_ON(sz != PMD_SIZE);
+        addr &= -sz;   /* Mask off any low bits in the address. */
 
         pgd = pgd_offset(mm, addr);
         pud = pud_alloc(mm, pgd, addr);
-        if (pud)
-                pte = (pte_t *) pmd_alloc(mm, pud, addr);
-        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
-        return pte;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+        if (sz >= PGDIR_SIZE) {
+                BUG_ON(sz != PGDIR_SIZE &&
+                       sz != PGDIR_SIZE << huge_shift[HUGE_SHIFT_PGDIR]);
+                return (pte_t *)pud;
+        } else {
+                pmd_t *pmd = pmd_alloc(mm, pud, addr);
+                if (sz >= PMD_SIZE) {
+                        BUG_ON(sz != PMD_SIZE &&
+                               sz != (PMD_SIZE << huge_shift[HUGE_SHIFT_PMD]));
+                        return (pte_t *)pmd;
+                } else {
+                        if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
+                                panic("Unexpected page size %#lx\n", sz);
+                        return pte_alloc_hugetlb(mm, pmd, addr);
+                }
+        }
+#else
+        BUG_ON(sz != PMD_SIZE);
+        return (pte_t *) pmd_alloc(mm, pud, addr);
+#endif
 }
 
+static pte_t *get_pte(pte_t *base, int index, int level)
+{
+        pte_t *ptep = base + index;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+        if (!pte_present(*ptep) && huge_shift[level] != 0) {
+                unsigned long mask = -1UL << huge_shift[level];
+                pte_t *super_ptep = base + (index & mask);
+                pte_t pte = *super_ptep;
+                if (pte_present(pte) && pte_super(pte))
+                        ptep = super_ptep;
+        }
+#endif
+        return ptep;
+}
+
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
         pgd_t *pgd;
         pud_t *pud;
-        pmd_t *pmd = NULL;
+        pmd_t *pmd;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+        pte_t *pte;
+#endif
 
-        pgd = pgd_offset(mm, addr);
-        if (pgd_present(*pgd)) {
-                pud = pud_offset(pgd, addr);
-                if (pud_present(*pud))
-                        pmd = pmd_offset(pud, addr);
-        }
-        return (pte_t *) pmd;
-}
+        /* Get the top-level page table entry. */
+        pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0);
+        if (!pgd_present(*pgd))
+                return NULL;
 
-#ifdef HUGETLB_TEST
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-                              int write)
-{
-        unsigned long start = address;
-        int length = 1;
-        int nr;
-        struct page *page;
-        struct vm_area_struct *vma;
+        /* We don't have four levels. */
+        pud = pud_offset(pgd, addr);
+#ifndef __PAGETABLE_PUD_FOLDED
+# error support fourth page table level
+#endif
 
-        vma = find_vma(mm, addr);
-        if (!vma || !is_vm_hugetlb_page(vma))
-                return ERR_PTR(-EINVAL);
+        /* Check for an L0 huge PTE, if we have three levels. */
+#ifndef __PAGETABLE_PMD_FOLDED
+        if (pud_huge(*pud))
+                return (pte_t *)pud;
 
-        pte = huge_pte_offset(mm, address);
+        pmd = (pmd_t *)get_pte((pte_t *)pud_page_vaddr(*pud),
+                               pmd_index(addr), 1);
+        if (!pmd_present(*pmd))
+                return NULL;
+#else
+        pmd = pmd_offset(pud, addr);
+#endif
 
-        /* hugetlb should be locked, and hence, prefaulted */
-        WARN_ON(!pte || pte_none(*pte));
+        /* Check for an L1 huge PTE. */
+        if (pmd_huge(*pmd))
+                return (pte_t *)pmd;
 
-        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+        /* Check for an L2 huge PTE. */
+        pte = get_pte((pte_t *)pmd_page_vaddr(*pmd), pte_index(addr), 2);
+        if (!pte_present(*pte))
+                return NULL;
+        if (pte_super(*pte))
+                return pte;
+#endif
 
-        WARN_ON(!PageHead(page));
-
-        return page;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-        return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-        return 0;
-}
-
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-                             pmd_t *pmd, int write)
-{
         return NULL;
 }
 
-#else
-
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                               int write)
 {
@@ -149,8 +225,6 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
         return 0;
 }
 
-#endif
-
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                 unsigned long addr, unsigned long len,
@@ -322,21 +396,102 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                 return hugetlb_get_unmapped_area_topdown(file, addr, len,
                                 pgoff, flags);
 }
+#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static __init int __setup_hugepagesz(unsigned long ps)
+{
+        int log_ps = __builtin_ctzl(ps);
+        int level, base_shift;
+
+        if ((1UL << log_ps) != ps || (log_ps & 1) != 0) {
+                pr_warn("Not enabling %ld byte huge pages;"
+                        " must be a power of four.\n", ps);
+                return -EINVAL;
+        }
+
+        if (ps > 64*1024*1024*1024UL) {
+                pr_warn("Not enabling %ld MB huge pages;"
+                        " largest legal value is 64 GB.\n", ps >> 20);
+                return -EINVAL;
+        } else if (ps >= PUD_SIZE) {
+                static long hv_jpage_size;
+                if (hv_jpage_size == 0)
+                        hv_jpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO);
+                if (hv_jpage_size != PUD_SIZE) {
+                        pr_warn("Not enabling >= %ld MB huge pages:"
+                                " hypervisor reports size %ld\n",
+                                PUD_SIZE >> 20, hv_jpage_size);
+                        return -EINVAL;
+                }
+                level = 0;
+                base_shift = PUD_SHIFT;
+        } else if (ps >= PMD_SIZE) {
+                level = 1;
+                base_shift = PMD_SHIFT;
+        } else if (ps > PAGE_SIZE) {
+                level = 2;
+                base_shift = PAGE_SHIFT;
+        } else {
+                pr_err("hugepagesz: huge page size %ld too small\n", ps);
+                return -EINVAL;
+        }
+
+        if (log_ps != base_shift) {
+                int shift_val = log_ps - base_shift;
+                if (huge_shift[level] != 0) {
+                        int old_shift = base_shift + huge_shift[level];
+                        pr_warn("Not enabling %ld MB huge pages;"
+                                " already have size %ld MB.\n",
+                                ps >> 20, (1UL << old_shift) >> 20);
+                        return -EINVAL;
+                }
+                if (hv_set_pte_super_shift(level, shift_val) != 0) {
+                        pr_warn("Not enabling %ld MB huge pages;"
+                                " no hypervisor support.\n", ps >> 20);
+                        return -EINVAL;
+                }
+                printk(KERN_DEBUG "Enabled %ld MB huge pages\n", ps >> 20);
+                huge_shift[level] = shift_val;
+        }
+
+        hugetlb_add_hstate(log_ps - PAGE_SHIFT);
+
+        return 0;
+}
+
+static bool saw_hugepagesz;
 
 static __init int setup_hugepagesz(char *opt)
 {
-        unsigned long ps = memparse(opt, &opt);
-        if (ps == PMD_SIZE) {
-                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
-        } else if (ps == PUD_SIZE) {
-                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
-        } else {
-                pr_err("hugepagesz: Unsupported page size %lu M\n",
-                       ps >> 20);
-                return 0;
+        if (!saw_hugepagesz) {
+                saw_hugepagesz = true;
+                memset(huge_shift, 0, sizeof(huge_shift));
         }
-        return 1;
+        return __setup_hugepagesz(memparse(opt, NULL));
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
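Worked example for __setup_hugepagesz(), again assuming 64 KB base pages, a
16 MB PMD span, and a 4 GB PUD span: booting with "hugepagesz=64m" gives
ps = 2^26, so log_ps = 26 (even, and well under 64 GB).  64 MB is below
PUD_SIZE but at least PMD_SIZE, so level = 1 and base_shift = 24; log_ps
differs from base_shift, so shift_val = 2 and the hypervisor is asked to gang
4 PMD entries via hv_set_pte_super_shift(1, 2).  Finally
hugetlb_add_hstate(26 - 16) registers the 64 MB hstate.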
-#endif  /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#ifdef ADDITIONAL_HUGE_SIZE
+/*
+ * Provide an additional huge page size if no "hugepagesz" args are given.
+ * In that case, all the cores have properly set up their hv super_shift
+ * already, but we need to notify the hugetlb code to enable the
+ * new huge page size from the Linux point of view.
+ */
+static __init int add_default_hugepagesz(void)
+{
+        if (!saw_hugepagesz) {
+                BUILD_BUG_ON(ADDITIONAL_HUGE_SIZE >= PMD_SIZE ||
+                             ADDITIONAL_HUGE_SIZE <= PAGE_SIZE);
+                BUILD_BUG_ON((PAGE_SIZE << ADDITIONAL_HUGE_SHIFT) !=
+                             ADDITIONAL_HUGE_SIZE);
+                BUILD_BUG_ON(ADDITIONAL_HUGE_SHIFT & 1);
+                hugetlb_add_hstate(ADDITIONAL_HUGE_SHIFT);
+        }
+        return 0;
+}
+arch_initcall(add_default_hugepagesz);
+#endif
+
+#endif /* CONFIG_HUGETLB_SUPER_PAGES */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index c04fbfd93fc5..630dd2ce2afe 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -698,6 +698,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 
 #endif /* CONFIG_HIGHMEM */
 
+#ifndef CONFIG_64BIT
 static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 {
         unsigned long pfn;
@@ -770,6 +771,7 @@ static void __init set_non_bootmem_pages_init(void)
                         init_free_pfn_range(start, end);
         }
 }
+#endif
 
 /*
  * paging_init() sets up the page tables - note that all of lowmem is
@@ -858,8 +860,10 @@ void __init mem_init(void)
         /* this will put all bootmem onto the freelists */
         totalram_pages += free_all_bootmem();
 
+#ifndef CONFIG_64BIT
         /* count all remaining LOWMEM and give all HIGHMEM to page allocator */
         set_non_bootmem_pages_init();
+#endif
 
         codesize =  (unsigned long)&_etext - (unsigned long)&_text;
         datasize =  (unsigned long)&_end - (unsigned long)&_sdata;
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3d7074347e6d..345edfed9fcd 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -132,15 +132,6 @@ void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
         set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
 }
 
-#if defined(CONFIG_HIGHPTE)
-pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
-{
-        pte_t *pte = kmap_atomic(pmd_page(*dir)) +
-                (pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
-        return &pte[pte_index(address)];
-}
-#endif
-
 /**
  * shatter_huge_page() - ensure a given address is mapped by a small page.
  *
@@ -296,10 +287,6 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
         struct page *p;
         int i;
 
-#ifdef CONFIG_HIGHPTE
-        flags |= __GFP_HIGHMEM;
-#endif
-
         p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
         if (p == NULL)
                 return NULL;
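Once an extra size is enabled, user space consumes it through the standard
hugetlbfs interfaces; nothing tile-specific is needed.  For example, after
reserving pages with "echo 64 > /sys/kernel/mm/hugepages/hugepages-1024kB/nr_hugepages"
and mounting with "mount -t hugetlbfs -o pagesize=1M none /mnt/huge" (mount
point hypothetical), a 1 MB page can be mapped like this:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 1024 * 1024;  /* one 1 MB huge page */
            int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
            char *p;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            p[0] = 1;  /* touching it faults in a single 1 MB page */
            munmap(p, len);
            close(fd);
            unlink("/mnt/huge/example");
            return 0;
    }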