diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 0dc8632aa01e..d9cb65cf5cfd 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -113,9 +113,26 @@ guaranteed, but it may be more likely in case the allocation is for a MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo defer >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag +"always" means that an application requesting THP will stall on allocation +failure and directly reclaim pages and compact memory in an effort to +allocate a THP immediately. This may be desirable for virtual machines +that benefit heavily from THP use and are willing to delay the VM start +to utilise them. + +"defer" means that an application will wake kswapd in the background +to reclaim pages and wake kcompact to compact memory so that THP is +available in the near future. It's the responsibility of khugepaged +to then install the THP pages later. + +"madvise" will enter direct reclaim like "always" but only for regions +that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. + +"never" should be self-explanatory. + By default kernel tries to use huge zero page on read page fault. It's possible to disable huge zero page by writing 0 or enable it back by writing 1: diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c083d0820a87..11d56c6e7ef2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -257,7 +257,7 @@ struct vm_area_struct; #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ - ~__GFP_KSWAPD_RECLAIM) + ~__GFP_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 459fd25b378e..a4cecb4801ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -41,7 +41,8 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, @@ -71,12 +72,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); ((__vma)->vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ !is_vma_temporary_stack(__vma)) -#define transparent_hugepage_defrag(__vma) \ - ((transparent_hugepage_flags & \ - (1<vm_flags & VM_HUGEPAGE)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) +{ + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. */ @@ -919,7 +950,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1279,7 +1310,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -2249,11 +2280,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2296,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2335,8 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d156310aedeb..096a00d98a45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3119,14 +3119,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); diff --git a/mm/slab.c b/mm/slab.c index 56dd0df2a8ce..e1f6c27c3db5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif diff --git a/mm/slub.c b/mm/slub.c index 2f2f04d39104..64ed5f3a3046 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1426,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) {