diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 8aed9103e48a..5c0552e78c58 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -326,6 +326,12 @@ maps this page at its virtual address. dirty. Again, see sparc64 for examples of how to deal with this. + ``void flush_dcache_folio(struct folio *folio)`` + This function is called under the same circumstances as + flush_dcache_page(). It allows the architecture to + optimise for flushing the entire folio of pages instead + of flushing one page at a time. + ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len)`` ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page, diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index a42f9baddfbf..395835f9289f 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -95,6 +95,11 @@ More Memory Management Functions .. kernel-doc:: mm/mempolicy.c .. kernel-doc:: include/linux/mm_types.h :internal: +.. kernel-doc:: include/linux/mm_inline.h +.. kernel-doc:: include/linux/page-flags.h .. kernel-doc:: include/linux/mm.h :internal: +.. kernel-doc:: include/linux/page_ref.h .. kernel-doc:: include/linux/mmzone.h +.. kernel-doc:: mm/util.c + :functions: folio_mapping diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 57a641847818..bb68d39f03b7 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -524,3 +524,5 @@ Note that these methods are passed a pointer to the cache resource structure, not the read request structure as they could be used in other situations where there isn't a read request structure as well, such as writing dirty data to the cache. + +.. kernel-doc:: include/linux/netfs.h diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index e201b4b1655a..e8c2c7469e10 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -36,6 +36,7 @@ void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); void dma_cache_wback_inv(phys_addr_t start, unsigned long sz); void dma_cache_inv(phys_addr_t start, unsigned long sz); diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 5e56288e343b..e68fb879e4f9 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -290,6 +290,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *folio); #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1ac55e7b47f0..8ab46625ddd3 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -250,6 +250,7 @@ static inline void __flush_page_to_ram(void *vaddr) #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) __flush_page_to_ram(page_address(page)) +void flush_dcache_folio(struct folio *folio); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index b3dc9c589442..f207388541d5 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -61,6 +61,8 @@ static inline void flush_dcache_page(struct page *page) SetPageDcacheDirty(page); } +void flush_dcache_folio(struct folio *folio); + #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index c2a222ebfa2a..3fc0bb7d6487 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -27,6 +27,7 @@ void flush_cache_vunmap(unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); void copy_from_user_page(struct vm_area_struct *vma, struct page *page, diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 18eb9f69f806..1999561b22aa 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -28,7 +28,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, extern void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); extern void flush_icache_range(unsigned long start, unsigned long end); extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index eef0096db5f8..da0cd4b3a28f 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -49,7 +49,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define flush_cache_vunmap(start, end) flush_cache_all() #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 372afa82fee6..c7a97f32432f 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -42,7 +42,8 @@ extern void flush_cache_page(struct vm_area_struct *vma, extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_user_range flush_icache_range extern void flush_icache_page(struct vm_area_struct *vma, diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index cf907e5bf2f2..a8a041609c5d 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -120,7 +120,8 @@ void flush_cache_page(struct vm_area_struct*, #define flush_cache_vunmap(start,end) flush_cache_all() #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page*); +void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); void local_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -137,7 +138,9 @@ void local_flush_cache_page(struct vm_area_struct *vma, #define flush_cache_vunmap(start,end) do { } while (0) #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO #define flush_dcache_page(page) do { } while (0) +static inline void flush_dcache_folio(struct folio *folio) { } #define flush_icache_range local_flush_icache_range #define flush_cache_page(vma, addr, pfn) do { } while (0) diff --git a/fs/afs/write.c b/fs/afs/write.c index f24370f5c774..8b1d9c2f6bec 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = thp_head(vmf->page); + struct folio *folio = page_folio(vmf->page); + struct page *page = &folio->page; struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); @@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) goto out; #endif - if (wait_on_page_writeback_killable(page)) + if (folio_wait_writeback_killable(folio)) goto out; if (lock_page_killable(page) < 0) @@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ - if (wait_on_page_writeback_killable(page) < 0) { - unlock_page(page); + if (folio_wait_writeback_killable(folio) < 0) { + folio_unlock(folio); goto out; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8ffc40e84a59..fcf4f3b72923 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, struct cachefiles_object *object; struct fscache_retrieval *op = monitor->op; struct wait_page_key *key = _key; - struct page *page = wait->private; + struct folio *folio = wait->private; ASSERT(key); _enter("{%lu},%u,%d,{%p,%u}", monitor->netfs_page->index, mode, sync, - key->page, key->bit_nr); + key->folio, key->bit_nr); - if (key->page != page || key->bit_nr != PG_locked) + if (key->folio != folio || key->bit_nr != PG_locked) return 0; - _debug("--- monitor %p %lx ---", page, page->flags); + _debug("--- monitor %p %lx ---", folio, folio->flags); - if (!PageUptodate(page) && !PageError(page)) { + if (!folio_test_uptodate(folio) && !folio_test_error(folio)) { /* unlocked, not uptodate and not erronous? */ _debug("page probably truncated"); } @@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, put_page(backpage2); INIT_LIST_HEAD(&monitor->op_link); - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); if (trylock_page(backpage)) { ret = -EIO; @@ -294,7 +294,7 @@ monitor_backing_page: get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was installed, so @@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was diff --git a/fs/io_uring.c b/fs/io_uring.c index bc18af5e0a93..3a098b473401 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3356,7 +3356,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } /* - * This is our waitqueue callback handler, registered through lock_page_async() + * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. * This gets called when the page is unlocked, and we generally expect that to * happen when the page IO is completed and the page is now uptodate. This will diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 176580f54af9..104ae698443e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 4a674db4e1fa..fedc0dfa4877 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -49,9 +49,15 @@ static inline void flush_cache_page(struct vm_area_struct *vma, static inline void flush_dcache_page(struct page *page) { } + +static inline void flush_dcache_folio(struct folio *folio) { } #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO #endif +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio); +#endif #ifndef flush_dcache_mmap_lock static inline void flush_dcache_mmap_lock(struct address_space *mapping) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ac7f231b8825..a62e72dd829f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -64,7 +64,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) return atomic_long_read(&bdi->tot_write_bandwidth); } -static inline void __add_wb_stat(struct bdi_writeback *wb, +static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); @@ -72,12 +72,12 @@ static inline void __add_wb_stat(struct bdi_writeback *wb, static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - __add_wb_stat(wb, item, 1); + wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - __add_wb_stat(wb, item, -1); + wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index c12df59d3f5f..3e378b1fb0bc 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -83,9 +83,10 @@ struct fprop_local_percpu { int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); -void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); -void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, - int max_frac); +void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, + long nr); +void __fprop_add_percpu_max(struct fprop_global *p, + struct fprop_local_percpu *pl, int max_frac, long nr); void fprop_fraction_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, unsigned long *numerator, unsigned long *denominator); @@ -96,7 +97,7 @@ void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) unsigned long flags; local_irq_save(flags); - __fprop_inc_percpu(p, pl); + __fprop_add_percpu(p, pl, 1); local_irq_restore(flags); } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..3745efd21cf6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -520,15 +520,11 @@ static inline void arch_free_page(struct page *page, int order) { } #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; -} -#endif struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, @@ -570,6 +566,15 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) return __alloc_pages(gfp_mask, order, nid, NULL); } +static inline +struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid)); + + return __folio_alloc(gfp, order, nid, NULL); +} + /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and @@ -586,6 +591,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); +struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); @@ -596,6 +602,10 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) +{ + return __folio_alloc_node(gfp, order, numa_node_id()); +} #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 4aa1031d3e4c..0a0b2b09b1b8 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -73,6 +73,12 @@ static inline void *kmap_local_page(struct page *page) return __kmap_local_page_prot(page, kmap_prot); } +static inline void *kmap_local_folio(struct folio *folio, size_t offset) +{ + struct page *page = folio_page(folio, offset / PAGE_SIZE); + return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; +} + static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); @@ -171,6 +177,11 @@ static inline void *kmap_local_page(struct page *page) return page_address(page); } +static inline void *kmap_local_folio(struct folio *folio, size_t offset) +{ + return page_address(&folio->page) + offset; +} + static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379..27cdd715c5f9 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -96,6 +96,43 @@ static inline void kmap_flush_unused(void); */ static inline void *kmap_local_page(struct page *page); +/** + * kmap_local_folio - Map a page in this folio for temporary usage + * @folio: The folio containing the page. + * @offset: The byte offset within the folio which identifies the page. + * + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation:: + * + * addr1 = kmap_local_folio(folio1, offset1); + * addr2 = kmap_local_folio(folio2, offset2); + * ... + * kunmap_local(addr2); + * kunmap_local(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * + * Contrary to kmap() mappings the mapping is only valid in the context of + * the caller and cannot be handed to other contexts. + * + * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * + * While it is significantly faster than kmap() for the higmem case it + * comes with restrictions about the pointer validity. Only use when really + * necessary. + * + * On HIGHMEM enabled systems mapping a highmem page has the side effect of + * disabling migration in order to keep the virtual address stable across + * preemption. No caller of kmap_local_folio() can rely on this side effect. + * + * Context: Can be invoked from any context. + * Return: The virtual address of @offset. + */ +static inline void *kmap_local_folio(struct folio *folio, size_t offset); + /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f123e15d966e..f280f33ff223 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -250,15 +250,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, return NULL; } -/** - * thp_head - Head page of a transparent huge page. - * @page: Any page (tail, head or regular) found in the page cache. - */ -static inline struct page *thp_head(struct page *page) -{ - return compound_head(page); -} - /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. @@ -336,12 +327,6 @@ static inline struct list_head *page_deferred_list(struct page *page) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) -static inline struct page *thp_head(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return page; -} - static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 161e8164abcf..a38a5bca1ba5 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -52,7 +52,7 @@ struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); -void ksm_migrate_page(struct page *newpage, struct page *oldpage); +void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -83,7 +83,7 @@ static inline void rmap_walk_ksm(struct page *page, { } -static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) +static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..e34bf0cbdf55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -369,7 +369,7 @@ enum page_memcg_data_flags { #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) -static inline bool PageMemcgKmem(struct page *page); +static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at @@ -384,89 +384,95 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __page_memcg - get the memory cgroup associated with a non-kmem page - * @page: a pointer to the page struct + * __folio_memcg - Get the memory cgroup associated with a non-kmem folio + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages or - * kmem pages. + * against some type of folios, e.g. slab folios or ex-slab folios or + * kmem folios. */ -static inline struct mem_cgroup *__page_memcg(struct page *page) +static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { - unsigned long memcg_data = page->memcg_data; + unsigned long memcg_data = folio->memcg_data; - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* - * __page_objcg - get the object cgroup associated with a kmem page - * @page: a pointer to the page struct + * __folio_objcg - get the object cgroup associated with a kmem folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the object cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the object cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages or - * LRU pages. + * against some type of folios, e.g. slab folios or ex-slab folios or + * LRU folios. */ -static inline struct obj_cgroup *__page_objcg(struct page *page) +static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { - unsigned long memcg_data = page->memcg_data; + unsigned long memcg_data = folio->memcg_data; - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* - * page_memcg - get the memory cgroup associated with a page - * @page: a pointer to the page struct + * folio_memcg - Get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages. + * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem page any of the following ensures page and memcg binding + * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * - * - the page lock + * - the folio lock * - LRU isolation * - lock_page_memcg() * - exclusive reference * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. */ -static inline struct mem_cgroup *page_memcg(struct page *page) +static inline struct mem_cgroup *folio_memcg(struct folio *folio) { - if (PageMemcgKmem(page)) - return obj_cgroup_memcg(__page_objcg(page)); - else - return __page_memcg(page); + if (folio_memcg_kmem(folio)) + return obj_cgroup_memcg(__folio_objcg(folio)); + return __folio_memcg(folio); } -/* - * page_memcg_rcu - locklessly get the memory cgroup associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages. - */ -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +static inline struct mem_cgroup *page_memcg(struct page *page) { - unsigned long memcg_data = READ_ONCE(page->memcg_data); + return folio_memcg(page_folio(page)); +} - VM_BUG_ON_PAGE(PageSlab(page), page); +/** + * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. + * + * This function assumes that the folio is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of folios, e.g. slab folios or ex-slab folios. + * + * Return: A pointer to the memory cgroup associated with the folio, + * or NULL. + */ +static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) +{ + unsigned long memcg_data = READ_ONCE(folio->memcg_data); + + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { @@ -523,17 +529,18 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) #ifdef CONFIG_MEMCG_KMEM /* - * PageMemcgKmem - check if the page has MemcgKmem flag set - * @page: a pointer to the page struct + * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. + * @folio: Pointer to the folio. * - * Checks if the page has MemcgKmem flag set. The caller must ensure that - * the page has an associated memory cgroup. It's not safe to call this function - * against some types of pages, e.g. slab pages. + * Checks if the folio has MemcgKmem flag set. The caller must ensure + * that the folio has an associated memory cgroup. It's not safe to call + * this function against some types of folios, e.g. slab folios. */ -static inline bool PageMemcgKmem(struct page *page) +static inline bool folio_memcg_kmem(struct folio *folio) { - VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); - return page->memcg_data & MEMCG_DATA_KMEM; + VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); + VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); + return folio->memcg_data & MEMCG_DATA_KMEM; } /* @@ -577,7 +584,7 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #else -static inline bool PageMemcgKmem(struct page *page) +static inline bool folio_memcg_kmem(struct folio *folio) { return false; } @@ -593,6 +600,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif +static inline bool PageMemcgKmem(struct page *page) +{ + return folio_memcg_kmem(page_folio(page)); +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -684,26 +696,47 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); + +/** + * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. + * @folio: Folio to charge. + * @mm: mm context of the allocating task. + * @gfp: Reclaim mode. + * + * Try to charge @folio to the memcg that @mm belongs to, reclaiming + * pages according to @gfp if necessary. If @mm is NULL, try to + * charge to the active memcg. + * + * Do not use this for folios allocated for swapin. + * + * Return: 0 on success. Otherwise, an error code is returned. + */ +static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, + gfp_t gfp) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_charge(page, mm, gfp_mask); + return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void __mem_cgroup_uncharge(struct page *page); -static inline void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct folio *folio); + +/** + * mem_cgroup_uncharge - Uncharge a folio. + * @folio: Folio to uncharge. + * + * Uncharge a folio previously charged with mem_cgroup_charge(). + */ +static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge(page); + __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_list(struct list_head *page_list); @@ -714,7 +747,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); +void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node @@ -753,33 +786,33 @@ out: } /** - * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page - * @page: the page + * folio_lruvec - return lruvec for isolating/putting an LRU folio + * @folio: Pointer to the folio. * - * This function relies on page->mem_cgroup being stable. + * This function relies on folio->mem_cgroup being stable. */ -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec(struct folio *folio) { - pg_data_t *pgdat = page_pgdat(page); - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); - VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); - return mem_cgroup_lruvec(memcg, pgdat); + VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); + return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); -struct lruvec *lock_page_lruvec(struct page *page); -struct lruvec *lock_page_lruvec_irq(struct page *page); -struct lruvec *lock_page_lruvec_irqsave(struct page *page, +struct lruvec *folio_lruvec_lock(struct folio *folio); +struct lruvec *folio_lruvec_lock_irq(struct folio *folio); +struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif @@ -947,6 +980,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); extern bool cgroup_memory_noswap; #endif +void folio_memcg_lock(struct folio *folio); +void folio_memcg_unlock(struct folio *folio); void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); @@ -1115,12 +1150,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 +static inline struct mem_cgroup *folio_memcg(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); return NULL; @@ -1131,6 +1171,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline bool folio_memcg_kmem(struct folio *folio) +{ + return false; +} + static inline bool PageMemcgKmem(struct page *page) { return false; @@ -1179,8 +1224,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) return false; } -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +static inline int mem_cgroup_charge(struct folio *folio, + struct mm_struct *mm, gfp_t gfp) { return 0; } @@ -1195,7 +1240,7 @@ static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) { } -static inline void mem_cgroup_uncharge(struct page *page) +static inline void mem_cgroup_uncharge(struct folio *folio) { } @@ -1203,7 +1248,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_migrate(struct page *old, struct page *new) +static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } @@ -1213,14 +1258,14 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, return &pgdat->__lruvec; } -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec(struct folio *folio) { - pg_data_t *pgdat = page_pgdat(page); - + struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } @@ -1250,26 +1295,26 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } -static inline struct lruvec *lock_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } -static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } -static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, +static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; @@ -1356,6 +1401,14 @@ static inline void unlock_page_memcg(struct page *page) { } +static inline void folio_memcg_lock(struct folio *folio) +{ +} + +static inline void folio_memcg_unlock(struct folio *folio) +{ +} + static inline void mem_cgroup_handle_over_high(void) { } @@ -1517,38 +1570,39 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, } /* Test requires a stable page->memcg binding, see page_memcg() */ -static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec) +static inline bool folio_matches_lruvec(struct folio *folio, + struct lruvec *lruvec) { - return lruvec_pgdat(lruvec) == page_pgdat(page) && - lruvec_memcg(lruvec) == page_memcg(page); + return lruvec_pgdat(lruvec) == folio_pgdat(folio) && + lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *relock_page_lruvec_irq(struct page *page, +static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { - if (page_matches_lruvec(page, locked_lruvec)) + if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } - return lock_page_lruvec_irq(page); + return folio_lruvec_lock_irq(folio); } /* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, +static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec *locked_lruvec, unsigned long *flags) { if (locked_lruvec) { - if (page_matches_lruvec(page, locked_lruvec)) + if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irqrestore(locked_lruvec, *flags); } - return lock_page_lruvec_irqsave(page, flags); + return folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -1558,17 +1612,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); -void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); -static inline void mem_cgroup_track_foreign_dirty(struct page *page, +static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { if (mem_cgroup_disabled()) return; - if (unlikely(&page_memcg(page)->css != wb->memcg_css)) - mem_cgroup_track_foreign_dirty_slowpath(page, wb); + if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); @@ -1588,7 +1642,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, { } -static inline void mem_cgroup_track_foreign_dirty(struct page *page, +static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index c8077e936691..0d2aeb9b0f66 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -57,6 +57,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +void folio_migrate_flags(struct folio *newfolio, struct folio *folio); +void folio_migrate_copy(struct folio *newfolio, struct folio *folio); +int folio_migrate_mapping(struct address_space *mapping, + struct folio *newfolio, struct folio *folio, int extra_count); #else static inline void putback_movable_pages(struct list_head *l) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..40ff114aaf9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,10 +36,7 @@ struct mempolicy; struct anon_vma; struct anon_vma_chain; -struct file_ra_state; struct user_struct; -struct writeback_control; -struct bdi_writeback; struct pt_regs; extern int sysctl_page_lock_unfairness; @@ -216,13 +213,6 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -/* - * Any attempt to mark this function as static leads to build failure - * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() - * is referred to by BPF code. This must be visible for error injection. - */ -int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void **shadowp); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -748,13 +738,18 @@ static inline int put_page_testzero(struct page *page) return page_ref_dec_and_test(page); } +static inline int folio_put_testzero(struct folio *folio) +{ + return put_page_testzero(&folio->page); +} + /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ -static inline int get_page_unless_zero(struct page *page) +static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } @@ -907,7 +902,7 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); -void copy_huge_page(struct page *dst, struct page *src); +void folio_copy(struct folio *dst, struct folio *src); /* * Compound pages have a destructor function. Provide a @@ -950,6 +945,20 @@ static inline unsigned int compound_order(struct page *page) return page[1].compound_order; } +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + static inline bool hpage_pincount_available(struct page *page) { /* @@ -1131,6 +1140,11 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +static inline enum zone_type folio_zonenum(const struct folio *folio) +{ + return page_zonenum(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { @@ -1200,18 +1214,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } /* 127: arbitrary random number, small enough to assemble well */ -#define page_ref_zero_or_close_to_overflow(page) \ - ((unsigned int) page_ref_count(page) + 127u <= 127u) +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) folio_ref_count(folio) + 127u <= 127u) + +/** + * folio_get - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * folio_try_get() may be the right interface for you to use. + */ +static inline void folio_get(struct folio *folio) +{ + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + folio_ref_inc(folio); +} static inline void get_page(struct page *page) { - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_refcount. - */ - VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); - page_ref_inc(page); + folio_get(page_folio(page)); } bool __must_check try_grab_page(struct page *page, unsigned int flags); @@ -1228,9 +1250,28 @@ static inline __must_check bool try_get_page(struct page *page) return true; } +/** + * folio_put - Decrement the reference count on a folio. + * @folio: The folio. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put(struct folio *folio) +{ + if (folio_put_testzero(folio)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { - page = compound_head(page); + struct folio *folio = page_folio(page); /* * For devmap managed pages we need to catch refcount transition from @@ -1238,13 +1279,12 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (page_is_devmap_managed(&folio->page)) { + put_devmap_managed_page(&folio->page); return; } - if (put_page_testzero(page)) - __put_page(page); + folio_put(folio); } /* @@ -1379,6 +1419,11 @@ static inline int page_to_nid(const struct page *page) } #endif +static inline int folio_nid(const struct folio *folio) +{ + return page_to_nid(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING static inline int cpu_pid_to_cpupid(int cpu, int pid) { @@ -1546,6 +1591,16 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } +static inline struct zone *folio_zone(const struct folio *folio) +{ + return page_zone(&folio->page); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return page_pgdat(&folio->page); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { @@ -1559,6 +1614,20 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/** + * folio_pfn - Return the Page Frame Number of a folio. + * @folio: The folio. + * + * A folio may contain multiple pages. The pages have consecutive + * Page Frame Numbers. + * + * Return: The Page Frame Number of the first page in the folio. + */ +static inline unsigned long folio_pfn(struct folio *folio) +{ + return page_to_pfn(&folio->page); +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) @@ -1595,6 +1664,89 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A positive power of two. + */ +static inline long folio_nr_pages(struct folio *folio) +{ + return compound_nr(&folio->page); +} + +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. + * + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. This is the folio equivalent to writing ``page++``. + * + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. + */ +static inline struct folio *folio_next(struct folio *folio) +{ + return (struct folio *)folio_page(folio, folio_nr_pages(folio)); +} + +/** + * folio_shift - The size of the memory described by this folio. + * @folio: The folio. + * + * A folio represents a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. See also + * folio_size() and folio_order(). + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(struct folio *folio) +{ + return PAGE_SHIFT + folio_order(folio); +} + +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. + */ +static inline size_t folio_size(struct folio *folio) +{ + return PAGE_SIZE << folio_order(folio); +} + +#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +static inline int arch_make_page_accessible(struct page *page) +{ + return 0; +} +#endif + +#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE +static inline int arch_make_folio_accessible(struct folio *folio) +{ + int ret; + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + ret = arch_make_page_accessible(folio_page(folio, i)); + if (ret) + break; + } + + return ret; +} +#endif + /* * Some inline functions in vmstat.h depend on page_zone() */ @@ -1635,19 +1787,6 @@ void page_address_init(void); extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); -extern struct address_space *page_mapping(struct page *page); - -extern struct address_space *__page_file_mapping(struct page *); - -static inline -struct address_space *page_file_mapping(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_mapping(page); - - return page->mapping; -} - extern pgoff_t __page_file_index(struct page *page); /* @@ -1662,7 +1801,7 @@ static inline pgoff_t page_index(struct page *page) } bool page_mapped(struct page *page); -struct address_space *page_mapping(struct page *page); +bool folio_mapped(struct folio *folio); /* * Return true only if the page has been allocated with @@ -1700,6 +1839,7 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in @@ -1854,20 +1994,9 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); -void account_page_cleaned(struct page *page, struct address_space *mapping, - struct bdi_writeback *wb); -int set_page_dirty(struct page *page); +bool folio_mark_dirty(struct folio *folio); +bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void __cancel_dirty_page(struct page *page); -static inline void cancel_dirty_page(struct page *page) -{ - /* Avoid atomic ops, locking, etc. when not actually needed. */ - if (PageDirty(page)) - __cancel_dirty_page(page); -} -int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); @@ -2659,10 +2788,6 @@ extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -/* mm/page-writeback.c */ -int __must_check write_one_page(struct page *page); -void task_dirty_inc(struct task_struct *tsk); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355ea1ee32bd..e2ec68b0515c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,27 +6,33 @@ #include /** - * page_is_file_lru - should the page be on a file LRU or anon LRU? - * @page: the page to test - * - * Returns 1 if @page is a regular filesystem backed page cache page or a lazily - * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal - * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by - * functions that manipulate the LRU lists, to sort a page onto the right LRU - * list. + * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? + * @folio: The folio to test. * * We would like to get this info without a page flag, but the state - * needs to survive until the page is last deleted from the LRU, which + * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. + * + * Return: An integer (not a boolean!) used to sort a folio onto the + * right LRU list and to account folios correctly. + * 1 if @folio is a regular filesystem backed page cache folio + * or a lazily freed anonymous folio (e.g. via MADV_FREE). + * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise + * ram or swap backed folio. */ +static inline int folio_is_file_lru(struct folio *folio) +{ + return !folio_test_swapbacked(folio); +} + static inline int page_is_file_lru(struct page *page) { - return !PageSwapBacked(page); + return folio_is_file_lru(page_folio(page)); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, - int nr_pages) + long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -39,69 +45,94 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, } /** - * __clear_page_lru_flags - clear page lru flags before releasing a page - * @page: the page that was on lru and now has a zero reference + * __folio_clear_lru_flags - Clear page lru flags before releasing a page. + * @folio: The folio that was on lru and now has a zero reference. */ -static __always_inline void __clear_page_lru_flags(struct page *page) +static __always_inline void __folio_clear_lru_flags(struct folio *folio) { - VM_BUG_ON_PAGE(!PageLRU(page), page); + VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); - __ClearPageLRU(page); + __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ - if (PageActive(page) && PageUnevictable(page)) + if (folio_test_active(folio) && folio_test_unevictable(folio)) return; - __ClearPageActive(page); - __ClearPageUnevictable(page); + __folio_clear_active(folio); + __folio_clear_unevictable(folio); +} + +static __always_inline void __clear_page_lru_flags(struct page *page) +{ + __folio_clear_lru_flags(page_folio(page)); } /** - * page_lru - which LRU list should a page be on? - * @page: the page to test + * folio_lru_list - Which LRU list should a folio be on? + * @folio: The folio to test. * - * Returns the LRU list a page should be on, as an index + * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list page_lru(struct page *page) +static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; - lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; - if (PageActive(page)) + lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; + if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } +static __always_inline +void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + list_add(&folio->lru, &lruvec->lists[lru]); +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec) { - enum lru_list lru = page_lru(page); + lruvec_add_folio(lruvec, page_folio(page)); +} - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); +static __always_inline +void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list_tail(struct page *page, struct lruvec *lruvec) { - enum lru_list lru = page_lru(page); + lruvec_add_folio_tail(lruvec, page_folio(page)); +} - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); +static __always_inline +void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) +{ + list_del(&folio->lru); + update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + -folio_nr_pages(folio)); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec) { - list_del(&page->lru); - update_lru_size(lruvec, page_lru(page), page_zonenum(page), - -thp_nr_pages(page)); + lruvec_del_folio(lruvec, page_folio(page)); } #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..82dab23205c3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -239,6 +239,72 @@ struct page { #endif } _struct_page_alignment; +/** + * struct folio - Represents a contiguous set of bytes. + * @flags: Identical to the page flags. + * @lru: Least Recently Used list; tracks how recently this folio was used. + * @mapping: The file this page belongs to, or refers to the anon_vma for + * anonymous memory. + * @index: Offset within the file, in units of pages. For anonymous memory, + * this is the index from the beginning of the mmap. + * @private: Filesystem per-folio data (see folio_attach_private()). + * Used for swp_entry_t if folio_test_swapcache(). + * @_mapcount: Do not access this member directly. Use folio_mapcount() to + * find out how many times this folio is mapped by userspace. + * @_refcount: Do not access this member directly. Use folio_ref_count() + * to find how many references there are to this folio. + * @memcg_data: Memory Control Group data. + * + * A folio is a physically, virtually and logically contiguous set + * of bytes. It is a power-of-two in size, and it is aligned to that + * same power-of-two. It is at least as large as %PAGE_SIZE. If it is + * in the page cache, it is at a file offset which is a multiple of that + * power-of-two. It may be mapped into userspace at an address which is + * at an arbitrary page offset, but its kernel virtual address is aligned + * to its size. + */ +struct folio { + /* private: don't document the anon union */ + union { + struct { + /* public: */ + unsigned long flags; + struct list_head lru; + struct address_space *mapping; + pgoff_t index; + void *private; + atomic_t _mapcount; + atomic_t _refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif + /* private: the union with struct page is transitional */ + }; + struct page page; + }; +}; + +static_assert(sizeof(struct page) == sizeof(struct folio)); +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) +FOLIO_MATCH(flags, flags); +FOLIO_MATCH(lru, lru); +FOLIO_MATCH(compound_head, lru); +FOLIO_MATCH(index, index); +FOLIO_MATCH(private, private); +FOLIO_MATCH(_mapcount, _mapcount); +FOLIO_MATCH(_refcount, _refcount); +#ifdef CONFIG_MEMCG +FOLIO_MATCH(memcg_data, memcg_data); +#endif +#undef FOLIO_MATCH + +static inline atomic_t *folio_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->compound_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; @@ -257,6 +323,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +/* + * page_private can be used on tail pages. However, PagePrivate is only + * checked by the VM on the head page. So page_private on the tail pages + * should be used for data that's ancillary to the head page (eg attaching + * buffer heads to tail pages after attaching buffer heads to the head page) + */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) @@ -264,6 +336,11 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } +static inline void *folio_get_private(struct folio *folio) +{ + return folio->private; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 1935d4c72d10..d7285f8148a3 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -22,6 +22,13 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_BUG_ON_FOLIO(cond, folio) \ + do { \ + if (unlikely(cond)) { \ + dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\ + BUG(); \ + } \ + } while (0) #define VM_BUG_ON_VMA(cond, vma) \ do { \ if (unlikely(cond)) { \ @@ -47,6 +54,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -55,11 +73,13 @@ void dump_mm(const struct mm_struct *mm); #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond) #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond) #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d6a4158a9a6..12c4177f7703 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -22,6 +22,7 @@ * Overload PG_private_2 to give us PG_fscache - this is used to indicate that * a page is currently backed by a local disk cache */ +#define folio_test_fscache(folio) folio_test_private_2(folio) #define PageFsCache(page) PagePrivate2((page)) #define SetPageFsCache(page) SetPagePrivate2((page)) #define ClearPageFsCache(page) ClearPagePrivate2((page)) @@ -29,60 +30,80 @@ #define TestClearPageFsCache(page) TestClearPagePrivate2((page)) /** - * set_page_fscache - Set PG_fscache on a page and take a ref - * @page: The page. + * folio_start_fscache - Start an fscache write on a folio. + * @folio: The folio. * - * Set the PG_fscache (PG_private_2) flag on a page and take the reference - * needed for the VM to handle its lifetime correctly. This sets the flag and - * takes the reference unconditionally, so care must be taken not to set the - * flag again if it's already set. + * Call this function before writing a folio to a local cache. Starting a + * second write before the first one finishes is not allowed. */ -static inline void set_page_fscache(struct page *page) +static inline void folio_start_fscache(struct folio *folio) { - set_page_private_2(page); + VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio); + folio_get(folio); + folio_set_private_2(folio); } /** - * end_page_fscache - Clear PG_fscache and release any waiters - * @page: The page + * folio_end_fscache - End an fscache write on a folio. + * @folio: The folio. * - * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers - * waiting for this. The page ref held for PG_private_2 being set is released. - * - * This is, for example, used when a netfs page is being written to a local - * disk cache, thereby allowing writes to the cache for the same page to be - * serialised. + * Call this function after the folio has been written to the local cache. + * This will wake any sleepers waiting on this folio. */ -static inline void end_page_fscache(struct page *page) +static inline void folio_end_fscache(struct folio *folio) { - end_page_private_2(page); + folio_end_private_2(folio); } /** - * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page - * @page: The page to wait on + * folio_wait_fscache - Wait for an fscache write on this folio to end. + * @folio: The folio. * - * Wait for PG_fscache (aka PG_private_2) to be cleared on a page. + * If this folio is currently being written to a local cache, wait for + * the write to finish. Another write may start after this one finishes, + * unless the caller holds the folio lock. */ -static inline void wait_on_page_fscache(struct page *page) +static inline void folio_wait_fscache(struct folio *folio) { - wait_on_page_private_2(page); + folio_wait_private_2(folio); } /** - * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page - * @page: The page to wait on + * folio_wait_fscache_killable - Wait for an fscache write on this folio to end. + * @folio: The folio. * - * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a - * fatal signal is received by the calling task. + * If this folio is currently being written to a local cache, wait + * for the write to finish or for a fatal signal to be received. + * Another write may start after this one finishes, unless the caller + * holds the folio lock. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ +static inline int folio_wait_fscache_killable(struct folio *folio) +{ + return folio_wait_private_2_killable(folio); +} + +static inline void set_page_fscache(struct page *page) +{ + folio_start_fscache(page_folio(page)); +} + +static inline void end_page_fscache(struct page *page) +{ + folio_end_private_2(page_folio(page)); +} + +static inline void wait_on_page_fscache(struct page *page) +{ + folio_wait_private_2(page_folio(page)); +} + static inline int wait_on_page_fscache_killable(struct page *page) { - return wait_on_page_private_2_killable(page); + return folio_wait_private_2_killable(page_folio(page)); } enum netfs_read_source { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fbfd3fad48f2..d8623d6e1141 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -143,6 +143,8 @@ enum pageflags { #endif __NR_PAGEFLAGS, + PG_readahead = PG_reclaim, + /* Filesystems */ PG_checked = PG_owner_priv_1, @@ -202,6 +204,34 @@ static inline unsigned long _compound_head(const struct page *page) #define compound_head(page) ((typeof(page))_compound_head(page)) +/** + * page_folio - Converts from page to folio. + * @p: The page. + * + * Every page is part of a folio. This function cannot be called on a + * NULL pointer. + * + * Context: No reference, nor lock is required on @page. If the caller + * does not hold a reference, this call may race with a folio split, so + * it should re-check the folio still contains this page after gaining + * a reference on the folio. + * Return: The folio which contains this page. + */ +#define page_folio(p) (_Generic((p), \ + const struct page *: (const struct folio *)_compound_head(p), \ + struct page *: (struct folio *)_compound_head(p))) + +/** + * folio_page - Return a page from a folio. + * @folio: The folio. + * @n: The page number to return. + * + * @n is relative to the start of the folio. This function does not + * check that the page number lies within @folio; the caller is presumed + * to have a reference to the page. + */ +#define folio_page(folio, n) nth_page(&(folio)->page, n) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; @@ -226,6 +256,15 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static unsigned long *folio_flags(struct folio *folio, unsigned n) +{ + struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + /* * Page flags policies wrt compound pages * @@ -270,36 +309,64 @@ static inline void page_init_poison(struct page *page, size_t size) VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) +/* Which page is the flag stored in */ +#define FOLIO_PF_ANY 0 +#define FOLIO_PF_HEAD 0 +#define FOLIO_PF_ONLY_HEAD 0 +#define FOLIO_PF_NO_TAIL 0 +#define FOLIO_PF_NO_COMPOUND 0 +#define FOLIO_PF_SECOND 1 + /* * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ +static __always_inline bool folio_test_##lname(struct folio *folio) \ +{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int Page##uname(struct page *page) \ - { return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void folio_set_##lname(struct folio *folio) \ +{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void folio_clear_##lname(struct folio *folio) \ +{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void __folio_set_##lname(struct folio *folio) \ +{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void __folio_clear_##lname(struct folio *folio) \ +{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ +static __always_inline \ +bool folio_test_set_##lname(struct folio *folio) \ +{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ +static __always_inline \ +bool folio_test_clear_##lname(struct folio *folio) \ +{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestClearPage##uname(struct page *page) \ - { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -315,29 +382,37 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) -#define TESTPAGEFLAG_FALSE(uname) \ +#define TESTPAGEFLAG_FALSE(uname, lname) \ +static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ static inline int Page##uname(const struct page *page) { return 0; } -#define SETPAGEFLAG_NOOP(uname) \ +#define SETPAGEFLAG_NOOP(uname, lname) \ +static inline void folio_set_##lname(struct folio *folio) { } \ static inline void SetPage##uname(struct page *page) { } -#define CLEARPAGEFLAG_NOOP(uname) \ +#define CLEARPAGEFLAG_NOOP(uname, lname) \ +static inline void folio_clear_##lname(struct folio *folio) { } \ static inline void ClearPage##uname(struct page *page) { } -#define __CLEARPAGEFLAG_NOOP(uname) \ +#define __CLEARPAGEFLAG_NOOP(uname, lname) \ +static inline void __folio_clear_##lname(struct folio *folio) { } \ static inline void __ClearPage##uname(struct page *page) { } -#define TESTSETFLAG_FALSE(uname) \ +#define TESTSETFLAG_FALSE(uname, lname) \ +static inline bool folio_test_set_##lname(struct folio *folio) \ +{ return 0; } \ static inline int TestSetPage##uname(struct page *page) { return 0; } -#define TESTCLEARFLAG_FALSE(uname) \ +#define TESTCLEARFLAG_FALSE(uname, lname) \ +static inline bool folio_test_clear_##lname(struct folio *folio) \ +{ return 0; } \ static inline int TestClearPage##uname(struct page *page) { return 0; } -#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ - SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) +#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ + SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) -#define TESTSCFLAG_FALSE(uname) \ - TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) +#define TESTSCFLAG_FALSE(uname, lname) \ + TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) @@ -393,8 +468,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) -PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) - TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -403,22 +478,25 @@ PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #else -PAGEFLAG_FALSE(HighMem) +PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP -static __always_inline int PageSwapCache(struct page *page) +static __always_inline bool folio_test_swapcache(struct folio *folio) { -#ifdef CONFIG_THP_SWAP - page = compound_head(page); -#endif - return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags); - + return folio_test_swapbacked(folio) && + test_bit(PG_swapcache, folio_flags(folio, 0)); } + +static __always_inline bool PageSwapCache(struct page *page) +{ + return folio_test_swapcache(page_folio(page)); +} + SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else -PAGEFLAG_FALSE(SwapCache) +PAGEFLAG_FALSE(SwapCache, swapcache) #endif PAGEFLAG(Unevictable, unevictable, PF_HEAD) @@ -430,14 +508,14 @@ PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) #else -PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) - TESTSCFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) + TESTSCFLAG_FALSE(Mlocked, mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else -PAGEFLAG_FALSE(Uncached) +PAGEFLAG_FALSE(Uncached, uncached) #endif #ifdef CONFIG_MEMORY_FAILURE @@ -446,7 +524,7 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool take_page_off_buddy(struct page *page); #else -PAGEFLAG_FALSE(HWPoison) +PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif @@ -460,7 +538,7 @@ PAGEFLAG(Idle, idle, PF_ANY) #ifdef CONFIG_KASAN_HW_TAGS PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD) #else -PAGEFLAG_FALSE(SkipKASanPoison) +PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) #endif /* @@ -498,10 +576,14 @@ static __always_inline int PageMappingFlags(struct page *page) return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline int PageAnon(struct page *page) +static __always_inline bool folio_test_anon(struct folio *folio) { - page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; +} + +static __always_inline bool PageAnon(struct page *page) +{ + return folio_test_anon(page_folio(page)); } static __always_inline int __PageMovable(struct page *page) @@ -517,30 +599,32 @@ static __always_inline int __PageMovable(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static __always_inline int PageKsm(struct page *page) +static __always_inline bool folio_test_ksm(struct folio *folio) { - page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } + +static __always_inline bool PageKsm(struct page *page) +{ + return folio_test_ksm(page_folio(page)); +} #else -TESTPAGEFLAG_FALSE(Ksm) +TESTPAGEFLAG_FALSE(Ksm, ksm) #endif u64 stable_page_flags(struct page *page); -static inline int PageUptodate(struct page *page) +static inline bool folio_test_uptodate(struct folio *folio) { - int ret; - page = compound_head(page); - ret = test_bit(PG_uptodate, &(page)->flags); + bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); /* - * Must ensure that the data we read out of the page is loaded - * _after_ we've loaded page->flags to check for PageUptodate. - * We can skip the barrier if the page is not uptodate, because + * Must ensure that the data we read out of the folio is loaded + * _after_ we've loaded folio->flags to check the uptodate bit. + * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * - * See SetPageUptodate() for the other side of the story. + * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); @@ -548,47 +632,71 @@ static inline int PageUptodate(struct page *page) return ret; } +static inline int PageUptodate(struct page *page) +{ + return folio_test_uptodate(page_folio(page)); +} + +static __always_inline void __folio_mark_uptodate(struct folio *folio) +{ + smp_wmb(); + __set_bit(PG_uptodate, folio_flags(folio, 0)); +} + +static __always_inline void folio_mark_uptodate(struct folio *folio) +{ + /* + * Memory barrier must be issued before setting the PG_uptodate bit, + * so that all previous stores issued in order to bring the folio + * uptodate are actually visible before folio_test_uptodate becomes true. + */ + smp_wmb(); + set_bit(PG_uptodate, folio_flags(folio, 0)); +} + static __always_inline void __SetPageUptodate(struct page *page) { - VM_BUG_ON_PAGE(PageTail(page), page); - smp_wmb(); - __set_bit(PG_uptodate, &page->flags); + __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { - VM_BUG_ON_PAGE(PageTail(page), page); - /* - * Memory barrier must be issued before setting the PG_uptodate bit, - * so that all previous stores issued in order to bring the page - * uptodate are actually visible before PageUptodate becomes true. - */ - smp_wmb(); - set_bit(PG_uptodate, &page->flags); + folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -int test_clear_page_writeback(struct page *page); -int __test_set_page_writeback(struct page *page, bool keep_write); +bool __folio_start_writeback(struct folio *folio, bool keep_write); +bool set_page_writeback(struct page *page); -#define test_set_page_writeback(page) \ - __test_set_page_writeback(page, false) -#define test_set_page_writeback_keepwrite(page) \ - __test_set_page_writeback(page, true) - -static inline void set_page_writeback(struct page *page) -{ - test_set_page_writeback(page); -} +#define folio_start_writeback(folio) \ + __folio_start_writeback(folio, false) +#define folio_start_writeback_keepwrite(folio) \ + __folio_start_writeback(folio, true) static inline void set_page_writeback_keepwrite(struct page *page) { - test_set_page_writeback_keepwrite(page); + folio_start_writeback_keepwrite(page_folio(page)); +} + +static inline bool test_set_page_writeback(struct page *page) +{ + return set_page_writeback(page); } __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +/* Whether there are one or multiple pages in a folio */ +static inline bool folio_test_single(struct folio *folio) +{ + return !folio_test_head(folio); +} + +static inline bool folio_test_multi(struct folio *folio) +{ + return folio_test_head(folio); +} + static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); @@ -612,12 +720,15 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); +static inline bool folio_test_hugetlb(struct folio *folio) +{ + return PageHeadHuge(&folio->page); +} #else -TESTPAGEFLAG_FALSE(Huge) -TESTPAGEFLAG_FALSE(HeadHuge) +TESTPAGEFLAG_FALSE(Huge, hugetlb) +TESTPAGEFLAG_FALSE(HeadHuge, headhuge) #endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -633,6 +744,11 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } +static inline bool folio_test_transhuge(struct folio *folio) +{ + return folio_test_head(folio); +} + /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known @@ -669,12 +785,12 @@ static inline int PageTransTail(struct page *page) PAGEFLAG(DoubleMap, double_map, PF_SECOND) TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else -TESTPAGEFLAG_FALSE(TransHuge) -TESTPAGEFLAG_FALSE(TransCompound) -TESTPAGEFLAG_FALSE(TransCompoundMap) -TESTPAGEFLAG_FALSE(TransTail) -PAGEFLAG_FALSE(DoubleMap) - TESTSCFLAG_FALSE(DoubleMap) +TESTPAGEFLAG_FALSE(TransHuge, transhuge) +TESTPAGEFLAG_FALSE(TransCompound, transcompound) +TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) +TESTPAGEFLAG_FALSE(TransTail, transtail) +PAGEFLAG_FALSE(DoubleMap, double_map) + TESTSCFLAG_FALSE(DoubleMap, double_map) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) @@ -872,6 +988,11 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_has_private(struct folio *folio) +{ + return page_has_private(&folio->page); +} + #undef PF_ANY #undef PF_HEAD #undef PF_ONLY_HEAD diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index d8a6aecf99cb..83abf95e9fa7 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -8,46 +8,16 @@ #ifdef CONFIG_PAGE_IDLE_FLAG -#ifdef CONFIG_64BIT -static inline bool page_is_young(struct page *page) -{ - return PageYoung(page); -} - -static inline void set_page_young(struct page *page) -{ - SetPageYoung(page); -} - -static inline bool test_and_clear_page_young(struct page *page) -{ - return TestClearPageYoung(page); -} - -static inline bool page_is_idle(struct page *page) -{ - return PageIdle(page); -} - -static inline void set_page_idle(struct page *page) -{ - SetPageIdle(page); -} - -static inline void clear_page_idle(struct page *page) -{ - ClearPageIdle(page); -} -#else /* !CONFIG_64BIT */ +#ifndef CONFIG_64BIT /* * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ extern struct page_ext_operations page_idle_ops; -static inline bool page_is_young(struct page *page) +static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -55,9 +25,9 @@ static inline bool page_is_young(struct page *page) return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline void set_page_young(struct page *page) +static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; @@ -65,9 +35,9 @@ static inline void set_page_young(struct page *page) set_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline bool test_and_clear_page_young(struct page *page) +static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -75,9 +45,9 @@ static inline bool test_and_clear_page_young(struct page *page) return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline bool page_is_idle(struct page *page) +static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -85,9 +55,9 @@ static inline bool page_is_idle(struct page *page) return test_bit(PAGE_EXT_IDLE, &page_ext->flags); } -static inline void set_page_idle(struct page *page) +static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; @@ -95,46 +65,75 @@ static inline void set_page_idle(struct page *page) set_bit(PAGE_EXT_IDLE, &page_ext->flags); } -static inline void clear_page_idle(struct page *page) +static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); } -#endif /* CONFIG_64BIT */ +#endif /* !CONFIG_64BIT */ #else /* !CONFIG_PAGE_IDLE_FLAG */ -static inline bool page_is_young(struct page *page) +static inline bool folio_test_young(struct folio *folio) { return false; } -static inline void set_page_young(struct page *page) +static inline void folio_set_young(struct folio *folio) { } -static inline bool test_and_clear_page_young(struct page *page) +static inline bool folio_test_clear_young(struct folio *folio) { return false; } -static inline bool page_is_idle(struct page *page) +static inline bool folio_test_idle(struct folio *folio) { return false; } -static inline void set_page_idle(struct page *page) +static inline void folio_set_idle(struct folio *folio) { } -static inline void clear_page_idle(struct page *page) +static inline void folio_clear_idle(struct folio *folio) { } #endif /* CONFIG_PAGE_IDLE_FLAG */ +static inline bool page_is_young(struct page *page) +{ + return folio_test_young(page_folio(page)); +} + +static inline void set_page_young(struct page *page) +{ + folio_set_young(page_folio(page)); +} + +static inline bool test_and_clear_page_young(struct page *page) +{ + return folio_test_clear_young(page_folio(page)); +} + +static inline bool page_is_idle(struct page *page) +{ + return folio_test_idle(page_folio(page)); +} + +static inline void set_page_idle(struct page *page) +{ + folio_set_idle(page_folio(page)); +} + +static inline void clear_page_idle(struct page *page) +{ + folio_clear_idle(page_folio(page)); +} #endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 719bfe5108c5..43c638c51c1f 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -12,7 +12,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); extern void __split_page_owner(struct page *page, unsigned int nr); -extern void __copy_page_owner(struct page *oldpage, struct page *newpage); +extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -36,10 +36,10 @@ static inline void split_page_owner(struct page *page, unsigned int nr) if (static_branch_unlikely(&page_owner_inited)) __split_page_owner(page, nr); } -static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { if (static_branch_unlikely(&page_owner_inited)) - __copy_page_owner(oldpage, newpage); + __folio_copy_owner(newfolio, old); } static inline void set_page_owner_migrate_reason(struct page *page, int reason) { @@ -63,7 +63,7 @@ static inline void split_page_owner(struct page *page, unsigned int order) { } -static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) { } static inline void set_page_owner_migrate_reason(struct page *page, int reason) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 7ad46f45df39..2e677e6ad09f 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -67,9 +67,31 @@ static inline int page_ref_count(const struct page *page) return atomic_read(&page->_refcount); } +/** + * folio_ref_count - The reference count on this folio. + * @folio: The folio. + * + * The refcount is usually incremented by calls to folio_get() and + * decremented by calls to folio_put(). Some typical users of the + * folio refcount: + * + * - Each reference from a page table + * - The page cache + * - Filesystem private data + * - The LRU list + * - Pipes + * - Direct IO which references this page in the process address space + * + * Return: The number of references to this folio. + */ +static inline int folio_ref_count(const struct folio *folio) +{ + return page_ref_count(&folio->page); +} + static inline int page_count(const struct page *page) { - return atomic_read(&compound_head(page)->_refcount); + return folio_ref_count(page_folio(page)); } static inline void set_page_count(struct page *page, int v) @@ -79,6 +101,11 @@ static inline void set_page_count(struct page *page, int v) __page_ref_set(page, v); } +static inline void folio_set_count(struct folio *folio, int v) +{ + set_page_count(&folio->page, v); +} + /* * Setup the page count before being freed into the page allocator for * the first time (boot or memory hotplug) @@ -95,6 +122,11 @@ static inline void page_ref_add(struct page *page, int nr) __page_ref_mod(page, nr); } +static inline void folio_ref_add(struct folio *folio, int nr) +{ + page_ref_add(&folio->page, nr); +} + static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_refcount); @@ -102,6 +134,11 @@ static inline void page_ref_sub(struct page *page, int nr) __page_ref_mod(page, -nr); } +static inline void folio_ref_sub(struct folio *folio, int nr) +{ + page_ref_sub(&folio->page, nr); +} + static inline int page_ref_sub_return(struct page *page, int nr) { int ret = atomic_sub_return(nr, &page->_refcount); @@ -111,6 +148,11 @@ static inline int page_ref_sub_return(struct page *page, int nr) return ret; } +static inline int folio_ref_sub_return(struct folio *folio, int nr) +{ + return page_ref_sub_return(&folio->page, nr); +} + static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); @@ -118,6 +160,11 @@ static inline void page_ref_inc(struct page *page) __page_ref_mod(page, 1); } +static inline void folio_ref_inc(struct folio *folio) +{ + page_ref_inc(&folio->page); +} + static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_refcount); @@ -125,6 +172,11 @@ static inline void page_ref_dec(struct page *page) __page_ref_mod(page, -1); } +static inline void folio_ref_dec(struct folio *folio) +{ + page_ref_dec(&folio->page); +} + static inline int page_ref_sub_and_test(struct page *page, int nr) { int ret = atomic_sub_and_test(nr, &page->_refcount); @@ -134,6 +186,11 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int folio_ref_sub_and_test(struct folio *folio, int nr) +{ + return page_ref_sub_and_test(&folio->page, nr); +} + static inline int page_ref_inc_return(struct page *page) { int ret = atomic_inc_return(&page->_refcount); @@ -143,6 +200,11 @@ static inline int page_ref_inc_return(struct page *page) return ret; } +static inline int folio_ref_inc_return(struct folio *folio) +{ + return page_ref_inc_return(&folio->page); +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); @@ -152,6 +214,11 @@ static inline int page_ref_dec_and_test(struct page *page) return ret; } +static inline int folio_ref_dec_and_test(struct folio *folio) +{ + return page_ref_dec_and_test(&folio->page); +} + static inline int page_ref_dec_return(struct page *page) { int ret = atomic_dec_return(&page->_refcount); @@ -161,15 +228,91 @@ static inline int page_ref_dec_return(struct page *page) return ret; } -static inline int page_ref_add_unless(struct page *page, int nr, int u) +static inline int folio_ref_dec_return(struct folio *folio) { - int ret = atomic_add_unless(&page->_refcount, nr, u); + return page_ref_dec_return(&folio->page); +} + +static inline bool page_ref_add_unless(struct page *page, int nr, int u) +{ + bool ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } +static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +{ + return page_ref_add_unless(&folio->page, nr, u); +} + +/** + * folio_try_get - Attempt to increase the refcount on a folio. + * @folio: The folio. + * + * If you do not already have a reference to a folio, you can attempt to + * get one using this function. It may fail if, for example, the folio + * has been freed since you found a pointer to it, or it is frozen for + * the purposes of splitting or migration. + * + * Return: True if the reference count was successfully incremented. + */ +static inline bool folio_try_get(struct folio *folio) +{ + return folio_ref_add_unless(folio, 1, 0); +} + +static inline bool folio_ref_try_add_rcu(struct folio *folio, int count) +{ +#ifdef CONFIG_TINY_RCU + /* + * The caller guarantees the folio will not be freed from interrupt + * context, so (on !SMP) we only need preemption to be disabled + * and TINY_RCU does that for us. + */ +# ifdef CONFIG_PREEMPT_COUNT + VM_BUG_ON(!in_atomic() && !irqs_disabled()); +# endif + VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio); + folio_ref_add(folio, count); +#else + if (unlikely(!folio_ref_add_unless(folio, count, 0))) { + /* Either the folio has been freed, or will be freed. */ + return false; + } +#endif + return true; +} + +/** + * folio_try_get_rcu - Attempt to increase the refcount on a folio. + * @folio: The folio. + * + * This is a version of folio_try_get() optimised for non-SMP kernels. + * If you are still holding the rcu_read_lock() after looking up the + * page and know that the page cannot have its refcount decreased to + * zero in interrupt context, you can use this instead of folio_try_get(). + * + * Example users include get_user_pages_fast() (as pages are not unmapped + * from interrupt context) and the page cache lookups (as pages are not + * truncated from interrupt context). We also know that pages are not + * frozen in interrupt context for the purposes of splitting or migration. + * + * You can also use this function if you're holding a lock that prevents + * pages being frozen & removed; eg the i_pages lock for the page cache + * or the mmap_sem or page table lock for page tables. In this case, + * it will always succeed, and you could have used a plain folio_get(), + * but it's sometimes more convenient to have a common function called + * from both locked and RCU-protected contexts. + * + * Return: True if the reference count was successfully incremented. + */ +static inline bool folio_try_get_rcu(struct folio *folio) +{ + return folio_ref_try_add_rcu(folio, 1); +} + static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); @@ -179,6 +322,11 @@ static inline int page_ref_freeze(struct page *page, int count) return ret; } +static inline int folio_ref_freeze(struct folio *folio, int count) +{ + return page_ref_freeze(&folio->page, count); +} + static inline void page_ref_unfreeze(struct page *page, int count) { VM_BUG_ON_PAGE(page_count(page) != 0, page); @@ -189,4 +337,8 @@ static inline void page_ref_unfreeze(struct page *page, int count) __page_ref_unfreeze(page, count); } +static inline void folio_ref_unfreeze(struct folio *folio, int count) +{ + page_ref_unfreeze(&folio->page, count); +} #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..013cdc90f5fd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -162,149 +162,119 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) void release_pages(struct page **pages, int nr); +struct address_space *page_mapping(struct page *); +struct address_space *folio_mapping(struct folio *); +struct address_space *swapcache_mapping(struct folio *); + +/** + * folio_file_mapping - Find the mapping this folio belongs to. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the mapping of the + * swap file or swap device where the data is stored. This is different + * from the mapping returned by folio_mapping(). The only reason to + * use it is if, like NFS, you return 0 from ->activate_swapfile. + * + * Do not call this for folios which aren't in the page cache or swap cache. + */ +static inline struct address_space *folio_file_mapping(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swapcache_mapping(folio); + + return folio->mapping; +} + +static inline struct address_space *page_file_mapping(struct page *page) +{ + return folio_file_mapping(page_folio(page)); +} + /* * For file cache pages, return the address_space, otherwise return NULL */ static inline struct address_space *page_mapping_file(struct page *page) { - if (unlikely(PageSwapCache(page))) + struct folio *folio = page_folio(page); + + if (unlikely(folio_test_swapcache(folio))) return NULL; - return page_mapping(page); + return folio_mapping(folio); } -/* - * speculatively take a reference to a page. - * If the page is free (_refcount == 0), then _refcount is untouched, and 0 - * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. - * - * This function must be called inside the same rcu_read_lock() section as has - * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _refcount. - * - * Unless an RCU grace period has passed, the count of all pages coming out - * of the allocator must be considered unstable. page_count may return higher - * than expected, and put_page must be able to do the right thing when the - * page has been finished with, no matter what it is subsequently allocated - * for (because put_page is what is used here to drop an invalid speculative - * reference). - * - * This is the interesting part of the lockless pagecache (and lockless - * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page) - * has the following pattern: - * 1. find page in radix tree - * 2. conditionally increment refcount - * 3. check the page is still in pagecache (if no, goto 1) - * - * Remove-side that cares about stability of _refcount (eg. reclaim) has the - * following (with the i_pages lock held): - * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) - * B. remove page from pagecache - * C. free the page - * - * There are 2 critical interleavings that matter: - * - 2 runs before A: in this case, A sees elevated refcount and bails out - * - A runs before 2: in this case, 2 sees zero refcount and retries; - * subsequently, B will complete and 1 will find no page, causing the - * lookup to return NULL. - * - * It is possible that between 1 and 2, the page is removed then the exact same - * page is inserted into the same position in pagecache. That's OK: the - * old find_get_page using a lock could equally have run before or after - * such a re-insertion, depending on order that locks are granted. - * - * Lookups racing against pagecache insertion isn't a big problem: either 1 - * will find the page or it will not. Likewise, the old find_get_page could run - * either before the insertion or afterwards, depending on timing. - */ -static inline int __page_cache_add_speculative(struct page *page, int count) +static inline bool page_cache_add_speculative(struct page *page, int count) { -#ifdef CONFIG_TINY_RCU -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - /* - * Preempt must be disabled here - we rely on rcu_read_lock doing - * this for us. - * - * Pagecache won't be truncated from interrupt context, so if we have - * found a page in the radix tree here, we have pinned its refcount by - * disabling preempt, and hence no need for the "speculative get" that - * SMP requires. - */ - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, count); - -#else - if (unlikely(!page_ref_add_unless(page, count, 0))) { - /* - * Either the page has been freed, or will be freed. - * In either case, retry here and the caller should - * do the right thing (see comments above). - */ - return 0; - } -#endif VM_BUG_ON_PAGE(PageTail(page), page); - - return 1; + return folio_ref_try_add_rcu((struct folio *)page, count); } -static inline int page_cache_get_speculative(struct page *page) +static inline bool page_cache_get_speculative(struct page *page) { - return __page_cache_add_speculative(page, 1); -} - -static inline int page_cache_add_speculative(struct page *page, int count) -{ - return __page_cache_add_speculative(page, count); + return page_cache_add_speculative(page, 1); } /** - * attach_page_private - Attach private data to a page. - * @page: Page to attach data to. - * @data: Data to attach to page. + * folio_attach_private - Attach private data to a folio. + * @folio: Folio to attach data to. + * @data: Data to attach to folio. * - * Attaching private data to a page increments the page's reference count. - * The data must be detached before the page will be freed. + * Attaching private data to a folio increments the page's reference count. + * The data must be detached before the folio will be freed. */ -static inline void attach_page_private(struct page *page, void *data) +static inline void folio_attach_private(struct folio *folio, void *data) { - get_page(page); - set_page_private(page, (unsigned long)data); - SetPagePrivate(page); + folio_get(folio); + folio->private = data; + folio_set_private(folio); } /** - * detach_page_private - Detach private data from a page. - * @page: Page to detach data from. + * folio_detach_private - Detach private data from a folio. + * @folio: Folio to detach data from. * - * Removes the data that was previously attached to the page and decrements + * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * - * Return: Data that was attached to the page. + * Return: Data that was attached to the folio. */ -static inline void *detach_page_private(struct page *page) +static inline void *folio_detach_private(struct folio *folio) { - void *data = (void *)page_private(page); + void *data = folio_get_private(folio); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + folio_clear_private(folio); + folio->private = NULL; + folio_put(folio); return data; } -#ifdef CONFIG_NUMA -extern struct page *__page_cache_alloc(gfp_t gfp); -#else -static inline struct page *__page_cache_alloc(gfp_t gfp) +static inline void attach_page_private(struct page *page, void *data) { - return alloc_pages(gfp, 0); + folio_attach_private(page_folio(page), data); +} + +static inline void *detach_page_private(struct page *page) +{ + return folio_detach_private(page_folio(page)); +} + +#ifdef CONFIG_NUMA +struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); +#else +static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) +{ + return folio_alloc(gfp, order); } #endif +static inline struct page *__page_cache_alloc(gfp_t gfp) +{ + return &filemap_alloc_folio(gfp, 0)->page; +} + static inline struct page *page_cache_alloc(struct address_space *x) { return __page_cache_alloc(mapping_gfp_mask(x)); @@ -331,9 +301,28 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 #define FGP_ENTRY 0x00000100 +#define FGP_STABLE 0x00000200 -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, - int fgp_flags, gfp_t cache_gfp_mask); +struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp); +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp); + +/** + * filemap_get_folio - Find and get a folio. + * @mapping: The address_space to search. + * @index: The page index. + * + * Looks up the page cache entry at @mapping & @index. If a folio is + * present, it is returned with an increased refcount. + * + * Otherwise, %NULL is returned. + */ +static inline struct folio *filemap_get_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, 0, 0); +} /** * find_get_page - find and get a page reference @@ -376,25 +365,6 @@ static inline struct page *find_lock_page(struct address_space *mapping, return pagecache_get_page(mapping, index, FGP_LOCK, 0); } -/** - * find_lock_head - Locate, pin and lock a pagecache page. - * @mapping: The address_space to search. - * @index: The page index. - * - * Looks up the page cache entry at @mapping & @index. If there is a - * page cache page, its head page is returned locked and with an increased - * refcount. - * - * Context: May sleep. - * Return: A struct page which is !PageTail, or %NULL if there is no page - * in the cache for this index. - */ -static inline struct page *find_lock_head(struct address_space *mapping, - pgoff_t index) -{ - return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); -} - /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space @@ -452,6 +422,73 @@ static inline bool thp_contains(struct page *head, pgoff_t index) return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); } +#define swapcache_index(folio) __page_file_index(&(folio)->page) + +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the page is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. + */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swapcache_index(folio); + return folio->index; +} + +/** + * folio_next_index - Get the index of the next folio. + * @folio: The current folio. + * + * Return: The index of the folio which follows this folio in the file. + */ +static inline pgoff_t folio_next_index(struct folio *folio) +{ + return folio->index + folio_nr_pages(folio); +} + +/** + * folio_file_page - The page for a particular index. + * @folio: The folio which contains this index. + * @index: The index we want to look up. + * + * Sometimes after looking up a folio in the page cache, we need to + * obtain the specific page for an index (eg a page fault). + * + * Return: The page containing the file data for this index. + */ +static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (folio_test_hugetlb(folio)) + return &folio->page; + return folio_page(folio, index & (folio_nr_pages(folio) - 1)); +} + +/** + * folio_contains - Does this folio contain this index? + * @folio: The folio. + * @index: The page index within the file. + * + * Context: The caller should have the page locked in order to prevent + * (eg) shmem from moving the page between the page cache and swap cache + * and changing its index in the middle of the operation. + * Return: true or false. + */ +static inline bool folio_contains(struct folio *folio, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (folio_test_hugetlb(folio)) + return folio->index == index; + return index - folio_index(folio) < folio_nr_pages(folio); +} + /* * Given the page we found in the page cache, return the page corresponding * to this index in the file @@ -560,6 +597,27 @@ static inline loff_t page_file_offset(struct page *page) return ((loff_t)page_index(page)) << PAGE_SHIFT; } +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. + */ +static inline loff_t folio_pos(struct folio *folio) +{ + return page_offset(&folio->page); +} + +/** + * folio_file_pos - Returns the byte position of this folio in its file. + * @folio: The folio. + * + * This differs from folio_pos() for folios which belong to a swap file. + * NFS is the only filesystem today which needs to use folio_file_pos(). + */ +static inline loff_t folio_file_pos(struct folio *folio) +{ + return page_file_offset(&folio->page); +} + extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); @@ -575,13 +633,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, } struct wait_page_key { - struct page *page; + struct folio *folio; int bit_nr; int page_match; }; struct wait_page_queue { - struct page *page; + struct folio *folio; int bit_nr; wait_queue_entry_t wait; }; @@ -589,7 +647,7 @@ struct wait_page_queue { static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { - if (wait_page->page != key->page) + if (wait_page->folio != key->folio) return false; key->page_match = 1; @@ -599,20 +657,31 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, return true; } -extern void __lock_page(struct page *page); -extern int __lock_page_killable(struct page *page); -extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); -extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, +void __folio_lock(struct folio *folio); +int __folio_lock_killable(struct folio *folio); +bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, unsigned int flags); -extern void unlock_page(struct page *page); +void unlock_page(struct page *page); +void folio_unlock(struct folio *folio); + +static inline bool folio_trylock(struct folio *folio) +{ + return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); +} /* * Return true if the page was successfully locked */ static inline int trylock_page(struct page *page) { - page = compound_head(page); - return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); + return folio_trylock(page_folio(page)); +} + +static inline void folio_lock(struct folio *folio) +{ + might_sleep(); + if (!folio_trylock(folio)) + __folio_lock(folio); } /* @@ -620,9 +689,20 @@ static inline int trylock_page(struct page *page) */ static inline void lock_page(struct page *page) { + struct folio *folio; might_sleep(); - if (!trylock_page(page)) - __lock_page(page); + + folio = page_folio(page); + if (!folio_trylock(folio)) + __folio_lock(folio); +} + +static inline int folio_lock_killable(struct folio *folio) +{ + might_sleep(); + if (!folio_trylock(folio)) + return __folio_lock_killable(folio); + return 0; } /* @@ -632,26 +712,7 @@ static inline void lock_page(struct page *page) */ static inline int lock_page_killable(struct page *page) { - might_sleep(); - if (!trylock_page(page)) - return __lock_page_killable(page); - return 0; -} - -/* - * lock_page_async - Lock the page, unless this would block. If the page - * is already locked, then queue a callback when the page becomes unlocked. - * This callback can then retry the operation. - * - * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page - * was already locked and the callback defined in 'wait' was queued. - */ -static inline int lock_page_async(struct page *page, - struct wait_page_queue *wait) -{ - if (!trylock_page(page)) - return __lock_page_async(page, wait); - return 0; + return folio_lock_killable(page_folio(page)); } /* @@ -659,78 +720,108 @@ static inline int lock_page_async(struct page *page, * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see - * __lock_page_or_retry(). + * __folio_lock_or_retry(). */ -static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, +static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { + struct folio *folio; might_sleep(); - return trylock_page(page) || __lock_page_or_retry(page, mm, flags); + + folio = page_folio(page); + return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., + * This is exported only for folio_wait_locked/folio_wait_writeback, etc., * and should not be used directly. */ -extern void wait_on_page_bit(struct page *page, int bit_nr); -extern int wait_on_page_bit_killable(struct page *page, int bit_nr); +void folio_wait_bit(struct folio *folio, int bit_nr); +int folio_wait_bit_killable(struct folio *folio, int bit_nr); /* - * Wait for a page to be unlocked. + * Wait for a folio to be unlocked. * - * This must be called with the caller "holding" the page, - * ie with increased "page->count" so that the page won't + * This must be called with the caller "holding" the folio, + * ie with increased "page->count" so that the folio won't * go away during the wait.. */ +static inline void folio_wait_locked(struct folio *folio) +{ + if (folio_test_locked(folio)) + folio_wait_bit(folio, PG_locked); +} + +static inline int folio_wait_locked_killable(struct folio *folio) +{ + if (!folio_test_locked(folio)) + return 0; + return folio_wait_bit_killable(folio, PG_locked); +} + static inline void wait_on_page_locked(struct page *page) { - if (PageLocked(page)) - wait_on_page_bit(compound_head(page), PG_locked); + folio_wait_locked(page_folio(page)); } static inline int wait_on_page_locked_killable(struct page *page) { - if (!PageLocked(page)) - return 0; - return wait_on_page_bit_killable(compound_head(page), PG_locked); + return folio_wait_locked_killable(page_folio(page)); } int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); -int wait_on_page_writeback_killable(struct page *page); -extern void end_page_writeback(struct page *page); +void folio_wait_writeback(struct folio *folio); +int folio_wait_writeback_killable(struct folio *folio); +void end_page_writeback(struct page *page); +void folio_end_writeback(struct folio *folio); void wait_for_stable_page(struct page *page); +void folio_wait_stable(struct folio *folio); +void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); +static inline void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + __folio_mark_dirty(page_folio(page), mapping, warn); +} +void folio_account_cleaned(struct folio *folio, struct address_space *mapping, + struct bdi_writeback *wb); +static inline void account_page_cleaned(struct page *page, + struct address_space *mapping, struct bdi_writeback *wb) +{ + return folio_account_cleaned(page_folio(page), mapping, wb); +} +void __folio_cancel_dirty(struct folio *folio); +static inline void folio_cancel_dirty(struct folio *folio) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (folio_test_dirty(folio)) + __folio_cancel_dirty(folio); +} +static inline void cancel_dirty_page(struct page *page) +{ + folio_cancel_dirty(page_folio(page)); +} +bool folio_clear_dirty_for_io(struct folio *folio); +bool clear_page_dirty_for_io(struct page *page); +int __must_check folio_write_one(struct folio *folio); +static inline int __must_check write_one_page(struct page *page) +{ + return folio_write_one(page_folio(page)); +} -void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); void page_endio(struct page *page, bool is_write, int err); -/** - * set_page_private_2 - Set PG_private_2 on a page and take a ref - * @page: The page. - * - * Set the PG_private_2 flag on a page and take the reference needed for the VM - * to handle its lifetime correctly. This sets the flag and takes the - * reference unconditionally, so care must be taken not to set the flag again - * if it's already set. - */ -static inline void set_page_private_2(struct page *page) -{ - page = compound_head(page); - get_page(page); - SetPagePrivate2(page); -} - -void end_page_private_2(struct page *page); -void wait_on_page_private_2(struct page *page); -int wait_on_page_private_2_killable(struct page *page); +void folio_end_private_2(struct folio *folio); +void folio_wait_private_2(struct folio *folio); +int folio_wait_private_2_killable(struct folio *folio); /* * Add an arbitrary waiter to a page's wait queue */ -extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); +void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. @@ -790,9 +881,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) } int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); + pgoff_t index, gfp_t gfp); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); + pgoff_t index, gfp_t gfp); +int filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); @@ -817,6 +910,10 @@ static inline int add_to_page_cache(struct page *page, return error; } +/* Must be non-static for BPF error injection */ +int __filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp, void **shadowp); + /** * struct readahead_control - Describes a readahead request. * @@ -906,33 +1003,57 @@ void page_cache_async_readahead(struct address_space *mapping, page_cache_async_ra(&ractl, page, req_count); } +static inline struct folio *__readahead_folio(struct readahead_control *ractl) +{ + struct folio *folio; + + BUG_ON(ractl->_batch_count > ractl->_nr_pages); + ractl->_nr_pages -= ractl->_batch_count; + ractl->_index += ractl->_batch_count; + + if (!ractl->_nr_pages) { + ractl->_batch_count = 0; + return NULL; + } + + folio = xa_load(&ractl->mapping->i_pages, ractl->_index); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + ractl->_batch_count = folio_nr_pages(folio); + + return folio; +} + /** * readahead_page - Get the next page to read. - * @rac: The current readahead request. + * @ractl: The current readahead request. * * Context: The page is locked and has an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: A pointer to the next page, or %NULL if we are done. */ -static inline struct page *readahead_page(struct readahead_control *rac) +static inline struct page *readahead_page(struct readahead_control *ractl) { - struct page *page; + struct folio *folio = __readahead_folio(ractl); - BUG_ON(rac->_batch_count > rac->_nr_pages); - rac->_nr_pages -= rac->_batch_count; - rac->_index += rac->_batch_count; + return &folio->page; +} - if (!rac->_nr_pages) { - rac->_batch_count = 0; - return NULL; - } +/** + * readahead_folio - Get the next folio to read. + * @ractl: The current readahead request. + * + * Context: The folio is locked. The caller should unlock the folio once + * all I/O to that folio has completed. + * Return: A pointer to the next folio, or %NULL if we are done. + */ +static inline struct folio *readahead_folio(struct readahead_control *ractl) +{ + struct folio *folio = __readahead_folio(ractl); - page = xa_load(&rac->mapping->i_pages, rac->_index); - VM_BUG_ON_PAGE(!PageLocked(page), page); - rac->_batch_count = thp_nr_pages(page); - - return page; + if (folio) + folio_put(folio); + return folio; } static inline unsigned int __readahead_batch(struct readahead_control *rac, @@ -1039,6 +1160,34 @@ static inline unsigned long dir_pages(struct inode *inode) PAGE_SHIFT; } +/** + * folio_mkwrite_check_truncate - check if folio was truncated + * @folio: the folio to check + * @inode: the inode to check the folio against + * + * Return: the number of bytes in the folio up to EOF, + * or -EFAULT if the folio was truncated. + */ +static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, + struct inode *inode) +{ + loff_t size = i_size_read(inode); + pgoff_t index = size >> PAGE_SHIFT; + size_t offset = offset_in_folio(folio, size); + + if (!folio->mapping) + return -EFAULT; + + /* folio is wholly inside EOF */ + if (folio_next_index(folio) - 1 < index) + return folio_size(folio); + /* folio is wholly past EOF */ + if (folio->index > index || !offset) + return -EFAULT; + /* folio is partially inside EOF */ + return offset; +} + /** * page_mkwrite_check_truncate - check if page was truncated * @page: the page to check @@ -1068,19 +1217,25 @@ static inline int page_mkwrite_check_truncate(struct page *page, } /** - * i_blocks_per_page - How many blocks fit in this page. + * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. - * @page: The page (head page if the page is a THP). + * @folio: The folio. * - * If the block size is larger than the size of this page, return zero. + * If the block size is larger than the size of this folio, return zero. * - * Context: The caller should hold a refcount on the page to prevent it + * Context: The caller should hold a refcount on the folio to prevent it * from being split. - * Return: The number of filesystem blocks covered by this page. + * Return: The number of filesystem blocks covered by this folio. */ +static inline +unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +{ + return folio_size(folio) >> inode->i_blkbits; +} + static inline unsigned int i_blocks_per_page(struct inode *inode, struct page *page) { - return thp_size(page) >> inode->i_blkbits; + return i_blocks_per_folio(inode, page_folio(page)); } #endif /* _LINUX_PAGEMAP_H */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c976cc6de257..e704b1a4c06c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,7 +235,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); * * returns the number of cleaned PTEs. */ -int page_mkclean(struct page *); +int folio_mkclean(struct folio *); /* * called in munlock()/munmap() path to check for other vmas holding @@ -295,12 +295,14 @@ static inline void try_to_unmap(struct page *page, enum ttu_flags flags) { } -static inline int page_mkclean(struct page *page) +static inline int folio_mkclean(struct folio *folio) { return 0; } - - #endif /* CONFIG_MMU */ +static inline int page_mkclean(struct page *page) +{ + return folio_mkclean(page_folio(page)); +} #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index ba52f3a3478e..cdf0957a88a4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,11 +320,17 @@ struct vma_swap_readahead { #endif }; +static inline swp_entry_t folio_swap_entry(struct folio *folio) +{ + swp_entry_t entry = { .val = page_private(&folio->page) }; + return entry; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); -void workingset_refault(struct page *page, void *shadow); -void workingset_activation(struct page *page); +void workingset_refault(struct folio *folio, void *shadow); +void workingset_activation(struct folio *folio); /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); @@ -344,9 +350,11 @@ extern unsigned long nr_free_buffer_pages(void); /* linux/mm/swap.c */ extern void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -extern void lru_note_cost_page(struct page *); +extern void lru_note_cost_folio(struct folio *); +extern void folio_add_lru(struct folio *); extern void lru_cache_add(struct page *); -extern void mark_page_accessed(struct page *); +void mark_page_accessed(struct page *); +void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; @@ -365,7 +373,6 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d6a6cf53b127..bfe38869498d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -415,6 +415,78 @@ static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ +static inline void __zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + __mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void __zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void __zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void __node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + __mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void __node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void __node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + +static inline void node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { @@ -525,12 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - static inline void __inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { @@ -543,6 +609,24 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + __mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void __lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void __lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} + static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { @@ -555,4 +639,21 @@ static inline void dec_lruvec_page_state(struct page *page, mod_lruvec_page_state(page, idx, -1); } +static inline void lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} #endif /* _LINUX_VMSTAT_H */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d1f65adf6a26..3571f383dfb6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -393,7 +393,14 @@ void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); -void account_page_redirty(struct page *page); +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); +void folio_account_redirty(struct folio *folio); +static inline void account_page_redirty(struct page *page) +{ + folio_account_redirty(page_folio(page)); +} +bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); +bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index 1d28431e85bd..171524d3526d 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -16,38 +16,38 @@ #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u -#define trace_pagemap_flags(page) ( \ - (PageAnon(page) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ - (page_mapped(page) ? PAGEMAP_MAPPED : 0) | \ - (PageSwapCache(page) ? PAGEMAP_SWAPCACHE : 0) | \ - (PageSwapBacked(page) ? PAGEMAP_SWAPBACKED : 0) | \ - (PageMappedToDisk(page) ? PAGEMAP_MAPPEDDISK : 0) | \ - (page_has_private(page) ? PAGEMAP_BUFFERS : 0) \ +#define trace_pagemap_flags(folio) ( \ + (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ + (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ + (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ + (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ + (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ + (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, - TP_PROTO(struct page *page), + TP_PROTO(struct folio *folio), - TP_ARGS(page), + TP_ARGS(folio), TP_STRUCT__entry( - __field(struct page *, page ) + __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( - __entry->page = page; - __entry->pfn = page_to_pfn(page); - __entry->lru = page_lru(page); - __entry->flags = trace_pagemap_flags(page); + __entry->folio = folio; + __entry->pfn = folio_pfn(folio); + __entry->lru = folio_lru_list(folio); + __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ - TP_printk("page=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", - __entry->page, + TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", + __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", @@ -60,23 +60,21 @@ TRACE_EVENT(mm_lru_insertion, TRACE_EVENT(mm_lru_activate, - TP_PROTO(struct page *page), + TP_PROTO(struct folio *folio), - TP_ARGS(page), + TP_ARGS(folio), TP_STRUCT__entry( - __field(struct page *, page ) + __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( - __entry->page = page; - __entry->pfn = page_to_pfn(page); + __entry->folio = folio; + __entry->pfn = folio_pfn(folio); ), - /* Flag format is based on page-types.c formatting for pagemap */ - TP_printk("page=%p pfn=0x%lx", __entry->page, __entry->pfn) - + TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 840d1ba84cf5..7dccb66474f7 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -52,11 +52,11 @@ WB_WORK_REASON struct wb_writeback_work; -DECLARE_EVENT_CLASS(writeback_page_template, +DECLARE_EVENT_CLASS(writeback_folio_template, - TP_PROTO(struct page *page, struct address_space *mapping), + TP_PROTO(struct folio *folio, struct address_space *mapping), - TP_ARGS(page, mapping), + TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) @@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = mapping ? mapping->host->i_ino : 0; - __entry->index = page->index; + __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", @@ -79,18 +79,18 @@ DECLARE_EVENT_CLASS(writeback_page_template, ) ); -DEFINE_EVENT(writeback_page_template, writeback_dirty_page, +DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, - TP_PROTO(struct page *page, struct address_space *mapping), + TP_PROTO(struct folio *folio, struct address_space *mapping), - TP_ARGS(page, mapping) + TP_ARGS(folio, mapping) ); -DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, +DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, - TP_PROTO(struct page *page, struct address_space *mapping), + TP_PROTO(struct folio *folio, struct address_space *mapping), - TP_ARGS(page, mapping) + TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, @@ -236,9 +236,9 @@ TRACE_EVENT(inode_switch_wbs, TRACE_EVENT(track_foreign_dirty, - TP_PROTO(struct page *page, struct bdi_writeback *wb), + TP_PROTO(struct folio *folio, struct bdi_writeback *wb), - TP_ARGS(page, wb), + TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) @@ -250,7 +250,7 @@ TRACE_EVENT(track_foreign_dirty, ), TP_fast_assign( - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); @@ -258,7 +258,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e76b55917905..de006552be8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13319,7 +13319,7 @@ BTF_SET_START(btf_non_sleepable_error_inject) /* Three functions below can be called from sleepable and non-sleepable context. * Assume non-sleepable from bpf safety point of view. */ -BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, __filemap_add_folio) BTF_ID(func, should_fail_alloc_page) BTF_ID(func, should_failslab) BTF_SET_END(btf_non_sleepable_error_inject) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index af24dc3febbe..6357c3580d07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); + err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, + GFP_KERNEL); if (err) return err; } diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 451543937524..53e7eb1dd76c 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -217,11 +217,12 @@ static void fprop_reflect_period_percpu(struct fprop_global *p, } /* Event of type pl happened */ -void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) +void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, + long nr) { fprop_reflect_period_percpu(p, pl); - percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&p->events, 1); + percpu_counter_add_batch(&pl->events, nr, PROP_BATCH); + percpu_counter_add(&p->events, nr); } void fprop_fraction_percpu(struct fprop_global *p, @@ -253,20 +254,29 @@ void fprop_fraction_percpu(struct fprop_global *p, } /* - * Like __fprop_inc_percpu() except that event is counted only if the given + * Like __fprop_add_percpu() except that event is counted only if the given * type has fraction smaller than @max_frac/FPROP_FRAC_BASE */ -void __fprop_inc_percpu_max(struct fprop_global *p, - struct fprop_local_percpu *pl, int max_frac) +void __fprop_add_percpu_max(struct fprop_global *p, + struct fprop_local_percpu *pl, int max_frac, long nr) { if (unlikely(max_frac < FPROP_FRAC_BASE)) { unsigned long numerator, denominator; + s64 tmp; fprop_fraction_percpu(p, pl, &numerator, &denominator); - if (numerator > - (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT) + /* Adding 'nr' to fraction exceeds max_frac/FPROP_FRAC_BASE? */ + tmp = (u64)denominator * max_frac - + ((u64)numerator << FPROP_FRAC_SHIFT); + if (tmp < 0) { + /* Maximum fraction already exceeded? */ return; + } else if (tmp < nr * (FPROP_FRAC_BASE - max_frac)) { + /* Add just enough for the fraction to saturate */ + nr = div_u64(tmp + FPROP_FRAC_BASE - max_frac - 1, + FPROP_FRAC_BASE - max_frac); + } } - __fprop_inc_percpu(p, pl); + __fprop_add_percpu(p, pl, nr); } diff --git a/mm/Makefile b/mm/Makefile index fc60a40ce954..d6c0042e3aa0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -46,7 +46,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page-writeback.o \ + maccess.o page-writeback.o folio-compat.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ diff --git a/mm/compaction.c b/mm/compaction.c index bfc93da1c2c7..fbc60f964c38 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1022,7 +1022,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!TestClearPageLRU(page)) goto isolate_fail_put; - lruvec = mem_cgroup_page_lruvec(page); + lruvec = folio_lruvec(page_folio(page)); /* If we already hold the lock, we can skip some rechecking */ if (lruvec != locked) { @@ -1032,7 +1032,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, page_folio(page)); /* Try get exclusive access under lock */ if (!skip_updated) { diff --git a/mm/filemap.c b/mm/filemap.c index dae481293b5d..3e9feb5cc570 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -835,6 +835,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); */ void replace_page_cache_page(struct page *old, struct page *new) { + struct folio *fold = page_folio(old); + struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; @@ -848,7 +850,7 @@ void replace_page_cache_page(struct page *old, struct page *new) new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(old, new); + mem_cgroup_migrate(fold, fnew); xas_lock_irq(&xas); xas_store(&xas, new); @@ -870,26 +872,25 @@ void replace_page_cache_page(struct page *old, struct page *new) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -noinline int __add_to_page_cache_locked(struct page *page, - struct address_space *mapping, - pgoff_t offset, gfp_t gfp, - void **shadowp) +noinline int __filemap_add_folio(struct address_space *mapping, + struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { - XA_STATE(xas, &mapping->i_pages, offset); - int huge = PageHuge(page); + XA_STATE(xas, &mapping->i_pages, index); + int huge = folio_test_hugetlb(folio); int error; bool charged = false; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapBacked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); mapping_set_update(&xas, mapping); - get_page(page); - page->mapping = mapping; - page->index = offset; + folio_get(folio); + folio->mapping = mapping; + folio->index = index; if (!huge) { - error = mem_cgroup_charge(page, NULL, gfp); + error = mem_cgroup_charge(folio, NULL, gfp); + VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) goto error; charged = true; @@ -901,7 +902,7 @@ noinline int __add_to_page_cache_locked(struct page *page, unsigned int order = xa_get_order(xas.xa, xas.xa_index); void *entry, *old = NULL; - if (order > thp_order(page)) + if (order > folio_order(folio)) xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), order, gfp); xas_lock_irq(&xas); @@ -918,13 +919,13 @@ noinline int __add_to_page_cache_locked(struct page *page, *shadowp = old; /* entry may have been split before we acquired lock */ order = xa_get_order(xas.xa, xas.xa_index); - if (order > thp_order(page)) { + if (order > folio_order(folio)) { xas_split(&xas, old, order); xas_reset(&xas); } } - xas_store(&xas, page); + xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; @@ -932,7 +933,7 @@ noinline int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_lruvec_page_state(page, NR_FILE_PAGES); + __lruvec_stat_add_folio(folio, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -940,19 +941,19 @@ unlock: if (xas_error(&xas)) { error = xas_error(&xas); if (charged) - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(folio); goto error; } - trace_mm_filemap_add_to_page_cache(page); + trace_mm_filemap_add_to_page_cache(&folio->page); return 0; error: - page->mapping = NULL; + folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - put_page(page); + folio_put(folio); return error; } -ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); +ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); /** * add_to_page_cache_locked - add a locked page to the pagecache @@ -969,59 +970,58 @@ ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - return __add_to_page_cache_locked(page, mapping, offset, + return __filemap_add_folio(mapping, page_folio(page), offset, gfp_mask, NULL); } EXPORT_SYMBOL(add_to_page_cache_locked); -int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +int filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; - __SetPageLocked(page); - ret = __add_to_page_cache_locked(page, mapping, offset, - gfp_mask, &shadow); + __folio_set_locked(folio); + ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) - __ClearPageLocked(page); + __folio_clear_locked(folio); else { /* - * The page might have been evicted from cache only + * The folio might have been evicted from cache only * recently, in which case it should be activated like - * any other repeatedly accessed page. - * The exception is pages getting rewritten; evicting other + * any other repeatedly accessed folio. + * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ - WARN_ON_ONCE(PageActive(page)); - if (!(gfp_mask & __GFP_WRITE) && shadow) - workingset_refault(page, shadow); - lru_cache_add(page); + WARN_ON_ONCE(folio_test_active(folio)); + if (!(gfp & __GFP_WRITE) && shadow) + workingset_refault(folio, shadow); + folio_add_lru(folio); } return ret; } -EXPORT_SYMBOL_GPL(add_to_page_cache_lru); +EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct page *__page_cache_alloc(gfp_t gfp) +struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) { int n; - struct page *page; + struct folio *folio; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - page = __alloc_pages_node(n, gfp, 0); - } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + folio = __folio_alloc_node(gfp, order, n); + } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); - return page; + return folio; } - return alloc_pages(gfp, 0); + return folio_alloc(gfp, order); } -EXPORT_SYMBOL(__page_cache_alloc); +EXPORT_SYMBOL(filemap_alloc_folio); #endif /* @@ -1074,11 +1074,11 @@ EXPORT_SYMBOL(filemap_invalidate_unlock_two); */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) -static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; +static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; -static wait_queue_head_t *page_waitqueue(struct page *page) +static wait_queue_head_t *folio_waitqueue(struct folio *folio) { - return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)]; + return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } void __init pagecache_init(void) @@ -1086,7 +1086,7 @@ void __init pagecache_init(void) int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) - init_waitqueue_head(&page_wait_table[i]); + init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); } @@ -1141,10 +1141,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->page->flags)) + if (test_bit(key->bit_nr, &key->folio->flags)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->page->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags)) return -1; flags |= WQ_FLAG_DONE; } @@ -1157,7 +1157,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs - * with the load-acquire in wait_on_page_bit_common(). + * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); @@ -1176,14 +1176,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return (flags & WQ_FLAG_EXCLUSIVE) != 0; } -static void wake_up_page_bit(struct page *page, int bit_nr) +static void folio_wake_bit(struct folio *folio, int bit_nr) { - wait_queue_head_t *q = page_waitqueue(page); + wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; wait_queue_entry_t bookmark; - key.page = page; + key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; @@ -1218,7 +1218,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr) * page waiters. */ if (!waitqueue_active(q) || !key.page_match) { - ClearPageWaiters(page); + folio_clear_waiters(folio); /* * It's possible to miss clearing Waiters here, when we woke * our page waiters, but the hashed waitqueue has waiters for @@ -1230,19 +1230,19 @@ static void wake_up_page_bit(struct page *page, int bit_nr) spin_unlock_irqrestore(&q->lock, flags); } -static void wake_up_page(struct page *page, int bit) +static void folio_wake(struct folio *folio, int bit) { - if (!PageWaiters(page)) + if (!folio_test_waiters(folio)) return; - wake_up_page_bit(page, bit); + folio_wake_bit(folio, bit); } /* - * A choice of three behaviors for wait_on_page_bit_common(): + * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like - * __lock_page() waiting on then setting PG_locked. + * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * wait_on_page_writeback() waiting on PG_writeback. @@ -1253,16 +1253,16 @@ enum behavior { }; /* - * Attempt to check (or get) the page bit, and mark us done + * Attempt to check (or get) the folio flag, and mark us done * if successful. */ -static inline bool trylock_page_bit_common(struct page *page, int bit_nr, +static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &page->flags)) + if (test_and_set_bit(bit_nr, &folio->flags)) return false; - } else if (test_bit(bit_nr, &page->flags)) + } else if (test_bit(bit_nr, &folio->flags)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; @@ -1272,9 +1272,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, /* How many times do we accept lock stealing from under a waiter? */ int sysctl_page_lock_unfairness = 5; -static inline int wait_on_page_bit_common(wait_queue_head_t *q, - struct page *page, int bit_nr, int state, enum behavior behavior) +static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, + int state, enum behavior behavior) { + wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1283,8 +1284,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, unsigned long pflags; if (bit_nr == PG_locked && - !PageUptodate(page) && PageWorkingset(page)) { - if (!PageSwapBacked(page)) { + !folio_test_uptodate(folio) && folio_test_workingset(folio)) { + if (!folio_test_swapbacked(folio)) { delayacct_thrashing_start(); delayacct = true; } @@ -1294,7 +1295,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, init_wait(wait); wait->func = wake_page_function; - wait_page.page = page; + wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: @@ -1309,7 +1310,7 @@ repeat: * Do one last check whether we can get the * page bit synchronously. * - * Do the SetPageWaiters() marking before that + * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the @@ -1320,8 +1321,8 @@ repeat: * lock to avoid races. */ spin_lock_irq(&q->lock); - SetPageWaiters(page); - if (!trylock_page_bit_common(page, bit_nr, wait)) + folio_set_waiters(folio); + if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); @@ -1331,10 +1332,10 @@ repeat: * see whether the page bit testing has already * been done by the wake function. * - * We can drop our reference to the page. + * We can drop our reference to the folio. */ if (behavior == DROP) - put_page(page); + folio_put(folio); /* * Note that until the "finish_wait()", or until @@ -1371,7 +1372,7 @@ repeat: * * And if that fails, we'll have to retry this all. */ - if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; @@ -1380,7 +1381,7 @@ repeat: /* * If a signal happened, this 'finish_wait()' may remove the last - * waiter from the wait-queues, but the PageWaiters bit will remain + * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ @@ -1411,19 +1412,17 @@ repeat: return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } -void wait_on_page_bit(struct page *page, int bit_nr) +void folio_wait_bit(struct folio *folio, int bit_nr) { - wait_queue_head_t *q = page_waitqueue(page); - wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); + folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } -EXPORT_SYMBOL(wait_on_page_bit); +EXPORT_SYMBOL(folio_wait_bit); -int wait_on_page_bit_killable(struct page *page, int bit_nr) +int folio_wait_bit_killable(struct folio *folio, int bit_nr) { - wait_queue_head_t *q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); + return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } -EXPORT_SYMBOL(wait_on_page_bit_killable); +EXPORT_SYMBOL(folio_wait_bit_killable); /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked @@ -1440,31 +1439,28 @@ EXPORT_SYMBOL(wait_on_page_bit_killable); */ int put_and_wait_on_page_locked(struct page *page, int state) { - wait_queue_head_t *q; - - page = compound_head(page); - q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, PG_locked, state, DROP); + return folio_wait_bit_common(page_folio(page), PG_locked, state, + DROP); } /** - * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue - * @page: Page defining the wait queue of interest + * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue + * @folio: Folio defining the wait queue of interest * @waiter: Waiter to add to the queue * - * Add an arbitrary @waiter to the wait queue for the nominated @page. + * Add an arbitrary @waiter to the wait queue for the nominated @folio. */ -void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) +void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) { - wait_queue_head_t *q = page_waitqueue(page); + wait_queue_head_t *q = folio_waitqueue(folio); unsigned long flags; spin_lock_irqsave(&q->lock, flags); __add_wait_queue_entry_tail(q, waiter); - SetPageWaiters(page); + folio_set_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL_GPL(add_page_wait_queue); +EXPORT_SYMBOL_GPL(folio_add_wait_queue); #ifndef clear_bit_unlock_is_negative_byte @@ -1490,124 +1486,116 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem #endif /** - * unlock_page - unlock a locked page - * @page: the page + * folio_unlock - Unlock a locked folio. + * @folio: The folio. * - * Unlocks the page and wakes up sleepers in wait_on_page_locked(). - * Also wakes sleepers in wait_on_page_writeback() because the wakeup - * mechanism between PageLocked pages and PageWriteback pages is shared. - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. + * Unlocks the folio and wakes up any thread sleeping on the page lock. * - * Note that this depends on PG_waiters being the sign bit in the byte - * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to - * clear the PG_locked bit and test PG_waiters at the same time fairly - * portably (architectures that do LL/SC can test any bit, while x86 can - * test the sign bit). + * Context: May be called from interrupt or process context. May not be + * called from NMI context. */ -void unlock_page(struct page *page) +void folio_unlock(struct folio *folio) { + /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); - page = compound_head(page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags)) - wake_up_page_bit(page, PG_locked); + BUILD_BUG_ON(PG_locked > 7); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0))) + folio_wake_bit(folio, PG_locked); } -EXPORT_SYMBOL(unlock_page); +EXPORT_SYMBOL(folio_unlock); /** - * end_page_private_2 - Clear PG_private_2 and release any waiters - * @page: The page + * folio_end_private_2 - Clear PG_private_2 and wake any waiters. + * @folio: The folio. * - * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for - * this. The page ref held for PG_private_2 being set is released. + * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for + * it. The folio reference held for PG_private_2 being set is released. * - * This is, for example, used when a netfs page is being written to a local - * disk cache, thereby allowing writes to the cache for the same page to be + * This is, for example, used when a netfs folio is being written to a local + * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. */ -void end_page_private_2(struct page *page) +void folio_end_private_2(struct folio *folio) { - page = compound_head(page); - VM_BUG_ON_PAGE(!PagePrivate2(page), page); - clear_bit_unlock(PG_private_2, &page->flags); - wake_up_page_bit(page, PG_private_2); - put_page(page); + VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); + clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); + folio_wake_bit(folio, PG_private_2); + folio_put(folio); } -EXPORT_SYMBOL(end_page_private_2); +EXPORT_SYMBOL(folio_end_private_2); /** - * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page - * @page: The page to wait on + * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. + * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page. + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. */ -void wait_on_page_private_2(struct page *page) +void folio_wait_private_2(struct folio *folio) { - page = compound_head(page); - while (PagePrivate2(page)) - wait_on_page_bit(page, PG_private_2); + while (folio_test_private_2(folio)) + folio_wait_bit(folio, PG_private_2); } -EXPORT_SYMBOL(wait_on_page_private_2); +EXPORT_SYMBOL(folio_wait_private_2); /** - * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page - * @page: The page to wait on + * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. + * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a * fatal signal is received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ -int wait_on_page_private_2_killable(struct page *page) +int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; - page = compound_head(page); - while (PagePrivate2(page)) { - ret = wait_on_page_bit_killable(page, PG_private_2); + while (folio_test_private_2(folio)) { + ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } -EXPORT_SYMBOL(wait_on_page_private_2_killable); +EXPORT_SYMBOL(folio_wait_private_2_killable); /** - * end_page_writeback - end writeback against a page - * @page: the page + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. */ -void end_page_writeback(struct page *page) +void folio_end_writeback(struct folio *folio) { /* - * TestClearPageReclaim could be used here but it is an atomic - * operation and overkill in this particular case. Failing to - * shuffle a page marked for immediate reclaim is too mild to - * justify taking an atomic operation penalty at the end of - * ever page writeback. + * folio_test_clear_reclaim() could be used here but it is an + * atomic operation and overkill in this particular case. Failing + * to shuffle a folio marked for immediate reclaim is too mild + * a gain to justify taking an atomic operation penalty at the + * end of every folio writeback. */ - if (PageReclaim(page)) { - ClearPageReclaim(page); - rotate_reclaimable_page(page); + if (folio_test_reclaim(folio)) { + folio_clear_reclaim(folio); + folio_rotate_reclaimable(folio); } /* - * Writeback does not hold a page reference of its own, relying + * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. - * But here we must make sure that the page is not freed and - * reused before the wake_up_page(). + * But here we must make sure that the folio is not freed and + * reused before the folio_wake(). */ - get_page(page); - if (!test_clear_page_writeback(page)) + folio_get(folio); + if (!__folio_end_writeback(folio)) BUG(); smp_mb__after_atomic(); - wake_up_page(page, PG_writeback); - put_page(page); + folio_wake(folio, PG_writeback); + folio_put(folio); } -EXPORT_SYMBOL(end_page_writeback); +EXPORT_SYMBOL(folio_end_writeback); /* * After completing I/O on a page, call this routine to update the page @@ -1638,39 +1626,35 @@ void page_endio(struct page *page, bool is_write, int err) EXPORT_SYMBOL_GPL(page_endio); /** - * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @__page: the page to lock + * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. + * @folio: The folio to lock */ -void __lock_page(struct page *__page) +void __folio_lock(struct folio *folio) { - struct page *page = compound_head(__page); - wait_queue_head_t *q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, + folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } -EXPORT_SYMBOL(__lock_page); +EXPORT_SYMBOL(__folio_lock); -int __lock_page_killable(struct page *__page) +int __folio_lock_killable(struct folio *folio) { - struct page *page = compound_head(__page); - wait_queue_head_t *q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, + return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } -EXPORT_SYMBOL_GPL(__lock_page_killable); +EXPORT_SYMBOL_GPL(__folio_lock_killable); -int __lock_page_async(struct page *page, struct wait_page_queue *wait) +static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { - struct wait_queue_head *q = page_waitqueue(page); + struct wait_queue_head *q = folio_waitqueue(folio); int ret = 0; - wait->page = page; + wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); - SetPageWaiters(page); - ret = !trylock_page(page); + folio_set_waiters(folio); + ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's @@ -1687,16 +1671,16 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait) /* * Return values: - * 1 - page is locked; mmap_lock is still held. - * 0 - page is not locked. + * true - folio is locked; mmap_lock is still held. + * false - folio is not locked. * mmap_lock has been released (mmap_read_unlock(), unless flags had both * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in * which case mmap_lock is still held. * - * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 - * with the page locked and the mmap_lock unperturbed. + * If neither ALLOW_RETRY nor KILLABLE are set, will always return true + * with the folio locked and the mmap_lock unperturbed. */ -int __lock_page_or_retry(struct page *page, struct mm_struct *mm, +bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, unsigned int flags) { if (fault_flag_allow_retry_first(flags)) { @@ -1705,28 +1689,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, * even though return 0. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) - return 0; + return false; mmap_read_unlock(mm); if (flags & FAULT_FLAG_KILLABLE) - wait_on_page_locked_killable(page); + folio_wait_locked_killable(folio); else - wait_on_page_locked(page); - return 0; + folio_wait_locked(folio); + return false; } if (flags & FAULT_FLAG_KILLABLE) { - int ret; + bool ret; - ret = __lock_page_killable(page); + ret = __folio_lock_killable(folio); if (ret) { mmap_read_unlock(mm); - return 0; + return false; } } else { - __lock_page(page); + __folio_lock(folio); } - return 1; + return true; } /** @@ -1801,144 +1785,156 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, } EXPORT_SYMBOL(page_cache_prev_miss); +/* + * Lockless page cache protocol: + * On the lookup side: + * 1. Load the folio from i_pages + * 2. Increment the refcount if it's not zero + * 3. If the folio is not found by xas_reload(), put the refcount and retry + * + * On the removal side: + * A. Freeze the page (by zeroing the refcount if nobody else has a reference) + * B. Remove the page from i_pages + * C. Return the page to the page allocator + * + * This means that any page may have its reference count temporarily + * increased by a speculative page cache (or fast GUP) lookup as it can + * be allocated by another user before the RCU grace period expires. + * Because the refcount temporarily acquired here may end up being the + * last refcount on the page, any page allocation must be freeable by + * folio_put(). + */ + /* * mapping_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @index. If there is a - * page cache page, the head page is returned with an increased refcount. + * Looks up the page cache entry at @mapping & @index. If it is a folio, + * it is returned with an increased refcount. If it is a shadow entry + * of a previously evicted folio, or a swap entry from shmem/tmpfs, + * it is returned without further action. * - * If the slot holds a shadow entry of a previously evicted page, or a - * swap entry from shmem/tmpfs, it is returned. - * - * Return: The head page or shadow entry, %NULL if nothing is found. + * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ -static struct page *mapping_get_entry(struct address_space *mapping, - pgoff_t index) +static void *mapping_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); - struct page *page; + struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); - page = xas_load(&xas); - if (xas_retry(&xas, page)) + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ - if (!page || xa_is_value(page)) + if (!folio || xa_is_value(folio)) goto out; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto repeat; - /* - * Has the page moved or been split? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != xas_reload(&xas))) { - put_page(page); + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); goto repeat; } out: rcu_read_unlock(); - return page; + return folio; } /** - * pagecache_get_page - Find and get a reference to a page. + * __filemap_get_folio - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. - * @fgp_flags: %FGP flags modify how the page is returned. - * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified. + * @fgp_flags: %FGP flags modify how the folio is returned. + * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * * Looks up the page cache entry at @mapping & @index. * * @fgp_flags can be zero or more of these flags: * - * * %FGP_ACCESSED - The page will be marked accessed. - * * %FGP_LOCK - The page is returned locked. - * * %FGP_HEAD - If the page is present and a THP, return the head page - * rather than the exact page specified by the index. + * * %FGP_ACCESSED - The folio will be marked accessed. + * * %FGP_LOCK - The folio is returned locked. * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it - * instead of allocating a new page to replace it. + * instead of allocating a new folio to replace it. * * %FGP_CREAT - If no page is present then a new page is allocated using - * @gfp_mask and added to the page cache and the VM's LRU list. + * @gfp and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the * page is already in cache. If the page was allocated, unlock it before * returning so the caller can do the same dance. - * * %FGP_WRITE - The page will be written - * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask - * * %FGP_NOWAIT - Don't get blocked by page lock + * * %FGP_WRITE - The page will be written to by the caller. + * * %FGP_NOFS - __GFP_FS will get cleared in gfp. + * * %FGP_NOWAIT - Don't get blocked by page lock. + * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. * - * Return: The found page or %NULL otherwise. + * Return: The found folio or %NULL otherwise. */ -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp_mask) +struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp) { - struct page *page; + struct folio *folio; repeat: - page = mapping_get_entry(mapping, index); - if (xa_is_value(page)) { + folio = mapping_get_entry(mapping, index); + if (xa_is_value(folio)) { if (fgp_flags & FGP_ENTRY) - return page; - page = NULL; + return folio; + folio = NULL; } - if (!page) + if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return NULL; } } else { - lock_page(page); + folio_lock(folio); } /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto repeat; } - VM_BUG_ON_PAGE(!thp_contains(page, index), page); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) - mark_page_accessed(page); + folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ - if (page_is_idle(page)) - clear_page_idle(page); + if (folio_test_idle(folio)) + folio_clear_idle(folio); } - if (!(fgp_flags & FGP_HEAD)) - page = find_subpage(page, index); + if (fgp_flags & FGP_STABLE) + folio_wait_stable(folio); no_page: - if (!page && (fgp_flags & FGP_CREAT)) { + if (!folio && (fgp_flags & FGP_CREAT)) { int err; if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) - gfp_mask |= __GFP_WRITE; + gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) - gfp_mask &= ~__GFP_FS; + gfp &= ~__GFP_FS; - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp, 0); + if (!folio) return NULL; if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) @@ -1946,27 +1942,27 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - __SetPageReferenced(page); + __folio_set_referenced(folio); - err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { - put_page(page); - page = NULL; + folio_put(folio); + folio = NULL; if (err == -EEXIST) goto repeat; } /* - * add_to_page_cache_lru locks the page, and for mmap we expect - * an unlocked page. + * filemap_add_folio locks the page, and for mmap + * we expect an unlocked page. */ - if (page && (fgp_flags & FGP_FOR_MMAP)) - unlock_page(page); + if (folio && (fgp_flags & FGP_FOR_MMAP)) + folio_unlock(folio); } - return page; + return folio; } -EXPORT_SYMBOL(pagecache_get_page); +EXPORT_SYMBOL(__filemap_get_folio); static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) @@ -2421,6 +2417,7 @@ static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, struct page *page) { + struct folio *folio = page_folio(page); int error; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2430,40 +2427,40 @@ static int filemap_update_page(struct kiocb *iocb, filemap_invalidate_lock_shared(mapping); } - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); - put_and_wait_on_page_locked(page, TASK_KILLABLE); + put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } - error = __lock_page_async(page, iocb->ki_waitq); + error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; - if (!page->mapping) + if (!folio->mapping) goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_page(iocb->ki_filp, mapping, page); + error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); goto unlock_mapping; unlock: - unlock_page(page); + folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) - put_page(page); + folio_put(folio); return error; } @@ -2900,7 +2897,9 @@ unlock: static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, struct file **fpin) { - if (trylock_page(page)) + struct folio *folio = page_folio(page); + + if (folio_trylock(folio)) return 1; /* @@ -2913,7 +2912,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { - if (__lock_page_killable(page)) { + if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the mmap_lock, * but all fault_handlers only check for fatal signals @@ -2925,11 +2924,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, return 0; } } else - __lock_page(page); + __folio_lock(folio); + return 1; } - /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have @@ -3708,28 +3707,6 @@ out: } EXPORT_SYMBOL(generic_file_direct_write); -/* - * Find or create a page at the given pagecache position. Return the locked - * page. This function is specifically for buffered writes. - */ -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index, unsigned flags) -{ - struct page *page; - int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; - - if (flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - - page = pagecache_get_page(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (page) - wait_for_stable_page(page); - - return page; -} -EXPORT_SYMBOL(grab_cache_page_write_begin); - ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { diff --git a/mm/folio-compat.c b/mm/folio-compat.c new file mode 100644 index 000000000000..5b6ae1da314e --- /dev/null +++ b/mm/folio-compat.c @@ -0,0 +1,142 @@ +/* + * Compatibility functions which bloat the callers too much to make inline. + * All of the callers of these functions should be converted to use folios + * eventually. + */ + +#include +#include +#include + +struct address_space *page_mapping(struct page *page) +{ + return folio_mapping(page_folio(page)); +} +EXPORT_SYMBOL(page_mapping); + +void unlock_page(struct page *page) +{ + return folio_unlock(page_folio(page)); +} +EXPORT_SYMBOL(unlock_page); + +void end_page_writeback(struct page *page) +{ + return folio_end_writeback(page_folio(page)); +} +EXPORT_SYMBOL(end_page_writeback); + +void wait_on_page_writeback(struct page *page) +{ + return folio_wait_writeback(page_folio(page)); +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback); + +void wait_for_stable_page(struct page *page) +{ + return folio_wait_stable(page_folio(page)); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); + +bool page_mapped(struct page *page) +{ + return folio_mapped(page_folio(page)); +} +EXPORT_SYMBOL(page_mapped); + +void mark_page_accessed(struct page *page) +{ + folio_mark_accessed(page_folio(page)); +} +EXPORT_SYMBOL(mark_page_accessed); + +#ifdef CONFIG_MIGRATION +int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page, int extra_count) +{ + return folio_migrate_mapping(mapping, page_folio(newpage), + page_folio(page), extra_count); +} +EXPORT_SYMBOL(migrate_page_move_mapping); + +void migrate_page_states(struct page *newpage, struct page *page) +{ + folio_migrate_flags(page_folio(newpage), page_folio(page)); +} +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + folio_migrate_copy(page_folio(newpage), page_folio(page)); +} +EXPORT_SYMBOL(migrate_page_copy); +#endif + +bool set_page_writeback(struct page *page) +{ + return folio_start_writeback(page_folio(page)); +} +EXPORT_SYMBOL(set_page_writeback); + +bool set_page_dirty(struct page *page) +{ + return folio_mark_dirty(page_folio(page)); +} +EXPORT_SYMBOL(set_page_dirty); + +int __set_page_dirty_nobuffers(struct page *page) +{ + return filemap_dirty_folio(page_mapping(page), page_folio(page)); +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +bool clear_page_dirty_for_io(struct page *page) +{ + return folio_clear_dirty_for_io(page_folio(page)); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +bool redirty_page_for_writepage(struct writeback_control *wbc, + struct page *page) +{ + return folio_redirty_for_writepage(wbc, page_folio(page)); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +void lru_cache_add(struct page *page) +{ + folio_add_lru(page_folio(page)); +} +EXPORT_SYMBOL(lru_cache_add); + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + return filemap_add_folio(mapping, page_folio(page), index, gfp); +} +EXPORT_SYMBOL(add_to_page_cache_lru); + +noinline +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp) +{ + struct folio *folio; + + folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); + if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + return &folio->page; + return folio_file_page(folio, index); +} +EXPORT_SYMBOL(pagecache_get_page); + +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) +{ + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + + if (flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + return pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(grab_cache_page_write_begin); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c5142d237e48..e5483347291c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); @@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail, static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); + struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; @@ -2424,7 +2425,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = lock_page_lruvec(head); + lruvec = folio_lruvec_lock(folio); ClearPageHasHWPoisoned(head); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 95dc7b83381f..6378c1066459 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5302,7 +5302,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; goto out; } - copy_huge_page(page, *pagep); + folio_copy(page_folio(page), page_folio(*pagep)); put_page(*pagep); *pagep = NULL; } diff --git a/mm/internal.h b/mm/internal.h index cf3cb933eba3..b1001ebeb286 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -34,7 +34,16 @@ void page_writeback_init(void); +static inline void *folio_raw_mapping(struct folio *folio) +{ + unsigned long mapping = (unsigned long)folio->mapping; + + return (void *)(mapping & ~PAGE_MAPPING_FLAGS); +} + vm_fault_t do_swap_page(struct vm_fault *vmf); +void folio_rotate_reclaimable(struct folio *folio); +bool __folio_end_writeback(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -63,17 +72,28 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, pgoff_t end, struct pagevec *pvec, pgoff_t *indices); /** - * page_evictable - test whether a page is evictable - * @page: the page to test + * folio_evictable - Test whether a folio is evictable. + * @folio: The folio to test. * - * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. - * - * Reasons page might not be evictable: - * (1) page's mapping marked unevictable - * (2) page is part of an mlocked VMA + * Test whether @folio is evictable -- i.e., should be placed on + * active/inactive lists vs unevictable list. * + * Reasons folio might not be evictable: + * 1. folio's mapping marked unevictable + * 2. One of the pages in the folio is part of an mlocked VMA */ +static inline bool folio_evictable(struct folio *folio) +{ + bool ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(folio_mapping(folio)) && + !folio_test_mlocked(folio); + rcu_read_unlock(); + return ret; +} + static inline bool page_evictable(struct page *page) { bool ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8a8b3aa92937..5f02fda6f265 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1090,7 +1090,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { + if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1214,7 +1214,7 @@ out_up_write: mmap_write_unlock(mm); out_nolock: if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(*hpage); + mem_cgroup_uncharge(page_folio(*hpage)); trace_mm_collapse_huge_page(mm, isolated, result); return; } @@ -1661,7 +1661,7 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { + if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1983,7 +1983,7 @@ xa_unlocked: out: VM_BUG_ON(!list_empty(&pagelist)); if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(*hpage); + mem_cgroup_uncharge(page_folio(*hpage)); /* TODO: tracepoints */ } diff --git a/mm/ksm.c b/mm/ksm.c index a5716fdec1aa..0662093237e4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -751,7 +751,7 @@ stale: /* * We come here from above when page->mapping or !PageSwapCache * suggests that the node is stale; but it might be under migration. - * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), + * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); @@ -852,9 +852,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *folio_stable_node(struct folio *folio) +{ + return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; +} + static inline struct stable_node *page_stable_node(struct page *page) { - return PageKsm(page) ? page_rmapping(page) : NULL; + return folio_stable_node(page_folio(page)); } static inline void set_page_stable_node(struct page *page, @@ -2578,7 +2583,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { + if (new_page && + mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { put_page(new_page); new_page = NULL; } @@ -2658,26 +2664,26 @@ again: } #ifdef CONFIG_MIGRATION -void ksm_migrate_page(struct page *newpage, struct page *oldpage) +void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct stable_node *stable_node; - VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); + VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); - stable_node = page_stable_node(newpage); + stable_node = folio_stable_node(folio); if (stable_node) { - VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage); - stable_node->kpfn = page_to_pfn(newpage); + VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); + stable_node->kpfn = folio_pfn(newfolio); /* - * newpage->mapping was set in advance; now we need smp_wmb() + * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible - * to get_ksm_page() before it can see that oldpage->mapping - * has gone stale (or that PageSwapCache has been cleared). + * to get_ksm_page() before it can see that folio->mapping + * has gone stale (or that folio_test_swapcache has been cleared). */ smp_wmb(); - set_page_stable_node(oldpage, NULL); + set_page_stable_node(&folio->page, NULL); } } #endif /* CONFIG_MIGRATION */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6da5020a8656..8dab23a71fc4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -456,28 +456,6 @@ ino_t page_cgroup_ino(struct page *page) return ino; } -static struct mem_cgroup_per_node * -mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) -{ - int nid = page_to_nid(page); - - return memcg->nodeinfo[nid]; -} - -static struct mem_cgroup_tree_per_node * -soft_limit_tree_node(int nid) -{ - return soft_limit_tree.rb_tree_per_node[nid]; -} - -static struct mem_cgroup_tree_per_node * -soft_limit_tree_from_page(struct page *page) -{ - int nid = page_to_nid(page); - - return soft_limit_tree.rb_tree_per_node[nid]; -} - static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) @@ -548,13 +526,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) return excess; } -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) { unsigned long excess; struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; /* @@ -562,7 +540,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_page_nodeinfo(memcg, page); + mz = memcg->nodeinfo[nid]; excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or @@ -593,7 +571,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = memcg->nodeinfo[nid]; - mctz = soft_limit_tree_node(nid); + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (mctz) mem_cgroup_remove_exceeded(mz, mctz); } @@ -799,7 +777,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, - struct page *page, int nr_pages) { /* pagein of a big page is an event. So, ignore page size */ @@ -842,7 +819,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, * Check events in order. * */ -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, int nid) { /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, @@ -853,7 +830,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) MEM_CGROUP_TARGET_SOFTLIMIT); mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, page); + mem_cgroup_update_tree(memcg, nid); } } @@ -1149,64 +1126,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } #ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) - VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); else - VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); } #endif /** - * lock_page_lruvec - lock and return lruvec for a given page. - * @page: the page + * folio_lruvec_lock - Lock the lruvec for a folio. + * @folio: Pointer to the folio. * * These functions are safe to use under any of the following conditions: - * - page locked - * - PageLRU cleared - * - lock_page_memcg() - * - page->_refcount is zero + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held. */ -struct lruvec *lock_page_lruvec(struct page *page) +struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock(&lruvec->lru_lock); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } -struct lruvec *lock_page_lruvec_irq(struct page *page) +/** + * folio_lruvec_lock_irq - Lock the lruvec for a folio. + * @folio: Pointer to the folio. + * + * These functions are safe to use under any of the following conditions: + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held and interrupts + * disabled. + */ +struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock_irq(&lruvec->lru_lock); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } -struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +/** + * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. + * @folio: Pointer to the folio. + * @flags: Pointer to irqsave flags. + * + * These functions are safe to use under any of the following conditions: + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held and interrupts + * disabled. + */ +struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, + unsigned long *flags) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock_irqsave(&lruvec->lru_lock, *flags); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } @@ -1956,18 +1957,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page and memcg binding - * @page: the page + * folio_memcg_lock - Bind a folio to its memcg. + * @folio: The folio. * - * This function protects unlocked LRU pages from being moved to + * This function prevents unlocked LRU folios from being moved to * another cgroup. * - * It ensures lifetime of the locked memcg. Caller is responsible - * for the lifetime of the page. + * It ensures lifetime of the bound memcg. The caller is responsible + * for the lifetime of the folio. */ -void lock_page_memcg(struct page *page) +void folio_memcg_lock(struct folio *folio) { - struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -1981,7 +1981,7 @@ void lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return; again: - memcg = page_memcg(head); + memcg = folio_memcg(folio); if (unlikely(!memcg)) return; @@ -1995,7 +1995,7 @@ again: return; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page_memcg(head)) { + if (memcg != folio_memcg(folio)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2009,9 +2009,15 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; } +EXPORT_SYMBOL(folio_memcg_lock); + +void lock_page_memcg(struct page *page) +{ + folio_memcg_lock(page_folio(page)); +} EXPORT_SYMBOL(lock_page_memcg); -static void __unlock_page_memcg(struct mem_cgroup *memcg) +static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -2026,14 +2032,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page and memcg binding - * @page: the page + * folio_memcg_unlock - Release the binding between a folio and its memcg. + * @folio: The folio. + * + * This releases the binding created by folio_memcg_lock(). This does + * not change the accounting of this folio to its memcg, but it does + * permit others to change it. */ +void folio_memcg_unlock(struct folio *folio) +{ + __folio_memcg_unlock(folio_memcg(folio)); +} +EXPORT_SYMBOL(folio_memcg_unlock); + void unlock_page_memcg(struct page *page) { - struct page *head = compound_head(page); - - __unlock_page_memcg(page_memcg(head)); + folio_memcg_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2734,9 +2748,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) } #endif -static void commit_charge(struct page *page, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page_memcg(page), page); + VM_BUG_ON_FOLIO(folio_memcg(folio), folio); /* * Any of the following ensures page's memcg stability: * @@ -2745,7 +2759,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)memcg; } static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) @@ -3015,15 +3029,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { + struct folio *folio = page_folio(page); struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!PageMemcgKmem(page)) + if (!folio_memcg_kmem(folio)) return; - objcg = __page_objcg(page); + objcg = __folio_objcg(folio); obj_cgroup_uncharge_pages(objcg, nr_pages); - page->memcg_data = 0; + folio->memcg_data = 0; obj_cgroup_put(objcg); } @@ -3257,17 +3272,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void split_page_memcg(struct page *head, unsigned int nr) { - struct mem_cgroup *memcg = page_memcg(head); + struct folio *folio = page_folio(head); + struct mem_cgroup *memcg = folio_memcg(folio); int i; if (mem_cgroup_disabled() || !memcg) return; for (i = 1; i < nr; i++) - head[i].memcg_data = head->memcg_data; + folio_page(folio, i)->memcg_data = folio->memcg_data; - if (PageMemcgKmem(head)) - obj_cgroup_get_many(__page_objcg(head), nr - 1); + if (folio_memcg_kmem(folio)) + obj_cgroup_get_many(__folio_objcg(folio), nr - 1); else css_get_many(&memcg->css, nr - 1); } @@ -3381,7 +3397,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, if (order > 0) return 0; - mctz = soft_limit_tree_node(pgdat->node_id); + mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; /* * Do not even bother to check the largest node if the root @@ -4537,17 +4553,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * As being wrong occasionally doesn't matter, updates and accesses to the * records are lockless and racy. */ -void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; int oldest = -1; int i; - trace_track_foreign_dirty(page, wb); + trace_track_foreign_dirty(folio, wb); /* * Pick the slot to use. If there is already a slot for @wb, keep @@ -5575,38 +5591,39 @@ static int mem_cgroup_move_account(struct page *page, struct mem_cgroup *from, struct mem_cgroup *to) { + struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; - int ret; + unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; + int nid, ret; VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON(compound && !PageTransHuge(page)); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON(compound && !folio_test_multi(folio)); /* * Prevent mem_cgroup_migrate() from looking at * page's memory cgroup of its source page while we change it. */ ret = -EBUSY; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto out; ret = -EINVAL; - if (page_memcg(page) != from) + if (folio_memcg(folio) != from) goto out_unlock; - pgdat = page_pgdat(page); + pgdat = folio_pgdat(folio); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - lock_page_memcg(page); + folio_memcg_lock(folio); - if (PageAnon(page)) { - if (page_mapped(page)) { + if (folio_test_anon(folio)) { + if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); - if (PageTransHuge(page)) { + if (folio_test_transhuge(folio)) { __mod_lruvec_state(from_vec, NR_ANON_THPS, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_THPS, @@ -5617,18 +5634,18 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); - if (PageSwapBacked(page)) { + if (folio_test_swapbacked(folio)) { __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); } - if (page_mapped(page)) { + if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); } - if (PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + if (folio_test_dirty(folio)) { + struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { __mod_lruvec_state(from_vec, NR_FILE_DIRTY, @@ -5639,7 +5656,7 @@ static int mem_cgroup_move_account(struct page *page, } } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); } @@ -5662,20 +5679,21 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->memcg_data = (unsigned long)to; + folio->memcg_data = (unsigned long)to; - __unlock_page_memcg(from); + __folio_memcg_unlock(from); ret = 0; + nid = folio_nid(folio); local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); + mem_cgroup_charge_statistics(to, nr_pages); + memcg_check_events(to, nid); + mem_cgroup_charge_statistics(from, -nr_pages); + memcg_check_events(from, nid); local_irq_enable(); out_unlock: - unlock_page(page); + folio_unlock(folio); out: return ret; } @@ -6680,9 +6698,10 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) +static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, + gfp_t gfp) { - unsigned int nr_pages = thp_nr_pages(page); + long nr_pages = folio_nr_pages(folio); int ret; ret = try_charge(memcg, gfp, nr_pages); @@ -6690,38 +6709,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) goto out; css_get(&memcg->css); - commit_charge(page, memcg); + commit_charge(folio, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(folio)); local_irq_enable(); out: return ret; } -/** - * __mem_cgroup_charge - charge a newly allocated page to a cgroup - * @page: page to charge - * @mm: mm context of the victim - * @gfp_mask: reclaim mode - * - * Try to charge @page to the memcg that @mm belongs to, reclaiming - * pages according to @gfp_mask if necessary. if @mm is NULL, try to - * charge to the active memcg. - * - * Do not use this for pages allocated for swapin. - * - * Returns 0 on success. Otherwise, an error code is returned. - */ -int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = charge_memcg(page, memcg, gfp_mask); + ret = charge_memcg(folio, memcg, gfp); css_put(&memcg->css); return ret; @@ -6742,6 +6746,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { + struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; @@ -6756,7 +6761,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = charge_memcg(page, memcg, gfp); + ret = charge_memcg(folio, memcg, gfp); css_put(&memcg->css); return ret; @@ -6800,7 +6805,7 @@ struct uncharge_gather { unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; - struct page *dummy_page; + int nid; }; static inline void uncharge_gather_clear(struct uncharge_gather *ug) @@ -6824,36 +6829,36 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg_check_events(ug->memcg, ug->dummy_page); + memcg_check_events(ug->memcg, ug->nid); local_irq_restore(flags); - /* drop reference from uncharge_page */ + /* drop reference from uncharge_folio */ css_put(&ug->memcg->css); } -static void uncharge_page(struct page *page, struct uncharge_gather *ug) +static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { - unsigned long nr_pages; + long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = PageMemcgKmem(page); + bool use_objcg = folio_memcg_kmem(folio); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * page memcg or objcg at this point, we have fully - * exclusive access to the page. + * folio memcg or objcg at this point, we have fully + * exclusive access to the folio. */ if (use_objcg) { - objcg = __page_objcg(page); + objcg = __folio_objcg(folio); /* * This get matches the put at the end of the function and * kmem pages do not hold memcg references anymore. */ memcg = get_mem_cgroup_from_objcg(objcg); } else { - memcg = __page_memcg(page); + memcg = __folio_memcg(folio); } if (!memcg) @@ -6865,19 +6870,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) uncharge_gather_clear(ug); } ug->memcg = memcg; - ug->dummy_page = page; + ug->nid = folio_nid(folio); /* pairs with css_put in uncharge_batch */ css_get(&memcg->css); } - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (use_objcg) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - page->memcg_data = 0; + folio->memcg_data = 0; obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ @@ -6885,28 +6890,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; - page->memcg_data = 0; + folio->memcg_data = 0; } css_put(&memcg->css); } -/** - * __mem_cgroup_uncharge - uncharge a page - * @page: page to uncharge - * - * Uncharge a page previously charged with __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct folio *folio) { struct uncharge_gather ug; - /* Don't touch page->lru of any random page, pre-check: */ - if (!page_memcg(page)) + /* Don't touch folio->lru of any random page, pre-check: */ + if (!folio_memcg(folio)) return; uncharge_gather_clear(&ug); - uncharge_page(page, &ug); + uncharge_folio(folio, &ug); uncharge_batch(&ug); } @@ -6920,52 +6919,49 @@ void __mem_cgroup_uncharge(struct page *page) void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; - struct page *page; + struct folio *folio; uncharge_gather_clear(&ug); - list_for_each_entry(page, page_list, lru) - uncharge_page(page, &ug); + list_for_each_entry(folio, page_list, lru) + uncharge_folio(folio, &ug); if (ug.memcg) uncharge_batch(&ug); } /** - * mem_cgroup_migrate - charge a page's replacement - * @oldpage: currently circulating page - * @newpage: replacement page + * mem_cgroup_migrate - Charge a folio's replacement. + * @old: Currently circulating folio. + * @new: Replacement folio. * - * Charge @newpage as a replacement page for @oldpage. @oldpage will + * Charge @new as a replacement folio for @old. @old will * be uncharged upon free. * - * Both pages must be locked, @newpage->mapping must be set up. + * Both folios must be locked, @new->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; - unsigned int nr_pages; + long nr_pages = folio_nr_pages(new); unsigned long flags; - VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); - VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), - newpage); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); + VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); if (mem_cgroup_disabled()) return; - /* Page cache replacement: new page already charged? */ - if (page_memcg(newpage)) + /* Page cache replacement: new folio already charged? */ + if (folio_memcg(new)) return; - memcg = page_memcg(oldpage); - VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); + memcg = folio_memcg(old); + VM_WARN_ON_ONCE_FOLIO(!memcg, old); if (!memcg) return; /* Force-charge the new page. The old one will be freed soon */ - nr_pages = thp_nr_pages(newpage); - if (!mem_cgroup_is_root(memcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) @@ -6973,11 +6969,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) } css_get(&memcg->css); - commit_charge(newpage, memcg); + commit_charge(new, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, nr_pages); - memcg_check_events(memcg, newpage); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(new)); local_irq_restore(flags); } @@ -7204,8 +7200,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -nr_entries); - memcg_check_events(memcg, page); + mem_cgroup_charge_statistics(memcg, -nr_entries); + memcg_check_events(memcg, page_to_nid(page)); css_put(&memcg->css); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bdbbb32211a5..93078a2859a7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -762,7 +762,7 @@ static int delete_from_lru_cache(struct page *p) * Poisoned page might never drop its ref count to 0 so we have * to uncharge it manually from its memcg. */ - mem_cgroup_uncharge(p); + mem_cgroup_uncharge(page_folio(p)); /* * drop the page count elevated by isolate_lru_page() diff --git a/mm/memory.c b/mm/memory.c index c52be6d6b605..4b1de80c2a9c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -990,7 +990,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, if (!new_page) return NULL; - if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { + if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) { put_page(new_page); return NULL; } @@ -3019,7 +3019,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) goto oom_free_new; cgroup_throttle_swaprate(new_page, GFP_KERNEL); @@ -3539,7 +3539,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) shadow = get_shadow_from_swap_cache(entry); if (shadow) - workingset_refault(page, shadow); + workingset_refault(page_folio(page), + shadow); lru_cache_add(page); @@ -3769,7 +3770,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) goto oom_free_page; cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -4202,7 +4203,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { + if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm, + GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } @@ -4267,7 +4269,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ @@ -4508,7 +4510,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. - * See filemap_fault() and __lock_page_or_retry(). + * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { @@ -4612,7 +4614,7 @@ unlock: * By the time we get here, we already hold the mm semaphore * * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * return value. See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -4768,7 +4770,7 @@ static inline void mm_account_fault(struct pt_regs *regs, * By the time we get here, we already hold the mm semaphore * * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d12e0608fced..f4b4be7af4d3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2196,6 +2196,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages); +struct folio *folio_alloc(gfp_t gfp, unsigned order) +{ + struct page *page = alloc_pages(gfp | __GFP_COMP, order); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(folio_alloc); + int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(vma_policy(src)); diff --git a/mm/memremap.c b/mm/memremap.c index ed593bf87109..5a66a71ab591 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -505,7 +505,7 @@ void free_devmap_managed_page(struct page *page) __ClearPageWaiters(page); - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); /* * When a device_private page is freed, the page->mapping field diff --git a/mm/migrate.c b/mm/migrate.c index 1852d787e6ab..efa9941ebe03 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -364,7 +364,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) */ expected_count += is_device_private_page(page); if (mapping) - expected_count += thp_nr_pages(page) + page_has_private(page); + expected_count += compound_nr(page) + page_has_private(page); return expected_count; } @@ -377,74 +377,75 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ -int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, int extra_count) +int folio_migrate_mapping(struct address_space *mapping, + struct folio *newfolio, struct folio *folio, int extra_count) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct zone *oldzone, *newzone; int dirty; - int expected_count = expected_page_refs(mapping, page) + extra_count; - int nr = thp_nr_pages(page); + int expected_count = expected_page_refs(mapping, &folio->page) + extra_count; + long nr = folio_nr_pages(folio); if (!mapping) { /* Anonymous page without mapping */ - if (page_count(page) != expected_count) + if (folio_ref_count(folio) != expected_count) return -EAGAIN; /* No turning back from here */ - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - __SetPageSwapBacked(newpage); + newfolio->index = folio->index; + newfolio->mapping = folio->mapping; + if (folio_test_swapbacked(folio)) + __folio_set_swapbacked(newfolio); return MIGRATEPAGE_SUCCESS; } - oldzone = page_zone(page); - newzone = page_zone(newpage); + oldzone = folio_zone(folio); + newzone = folio_zone(newfolio); xas_lock_irq(&xas); - if (page_count(page) != expected_count || xas_load(&xas) != page) { + if (folio_ref_count(folio) != expected_count || + xas_load(&xas) != folio) { xas_unlock_irq(&xas); return -EAGAIN; } - if (!page_ref_freeze(page, expected_count)) { + if (!folio_ref_freeze(folio, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } /* - * Now we know that no one else is looking at the page: + * Now we know that no one else is looking at the folio: * no turning back from here. */ - newpage->index = page->index; - newpage->mapping = page->mapping; - page_ref_add(newpage, nr); /* add cache reference */ - if (PageSwapBacked(page)) { - __SetPageSwapBacked(newpage); - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); + newfolio->index = folio->index; + newfolio->mapping = folio->mapping; + folio_ref_add(newfolio, nr); /* add cache reference */ + if (folio_test_swapbacked(folio)) { + __folio_set_swapbacked(newfolio); + if (folio_test_swapcache(folio)) { + folio_set_swapcache(newfolio); + newfolio->private = folio_get_private(folio); } } else { - VM_BUG_ON_PAGE(PageSwapCache(page), page); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); } /* Move dirty while page refs frozen and newpage not yet exposed */ - dirty = PageDirty(page); + dirty = folio_test_dirty(folio); if (dirty) { - ClearPageDirty(page); - SetPageDirty(newpage); + folio_clear_dirty(folio); + folio_set_dirty(newfolio); } - xas_store(&xas, newpage); - if (PageTransHuge(page)) { + xas_store(&xas, newfolio); + if (nr > 1) { int i; for (i = 1; i < nr; i++) { xas_next(&xas); - xas_store(&xas, newpage); + xas_store(&xas, newfolio); } } @@ -453,7 +454,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. */ - page_ref_unfreeze(page, expected_count - nr); + folio_ref_unfreeze(folio, expected_count - nr); xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ @@ -472,18 +473,18 @@ int migrate_page_move_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; - memcg = page_memcg(page); + memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); - if (PageSwapBacked(page) && !PageSwapCache(page)) { + if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } #ifdef CONFIG_SWAP - if (PageSwapCache(page)) { + if (folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } @@ -499,11 +500,11 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } -EXPORT_SYMBOL(migrate_page_move_mapping); +EXPORT_SYMBOL(folio_migrate_mapping); /* * The expected number of remaining references is the same as that - * of migrate_page_move_mapping(). + * of folio_migrate_mapping(). */ int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) @@ -538,91 +539,87 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, } /* - * Copy the page to its new location + * Copy the flags and some other ancillary information */ -void migrate_page_states(struct page *newpage, struct page *page) +void folio_migrate_flags(struct folio *newfolio, struct folio *folio) { int cpupid; - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (TestClearPageActive(page)) { - VM_BUG_ON_PAGE(PageUnevictable(page), page); - SetPageActive(newpage); - } else if (TestClearPageUnevictable(page)) - SetPageUnevictable(newpage); - if (PageWorkingset(page)) - SetPageWorkingset(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); + if (folio_test_error(folio)) + folio_set_error(newfolio); + if (folio_test_referenced(folio)) + folio_set_referenced(newfolio); + if (folio_test_uptodate(folio)) + folio_mark_uptodate(newfolio); + if (folio_test_clear_active(folio)) { + VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); + folio_set_active(newfolio); + } else if (folio_test_clear_unevictable(folio)) + folio_set_unevictable(newfolio); + if (folio_test_workingset(folio)) + folio_set_workingset(newfolio); + if (folio_test_checked(folio)) + folio_set_checked(newfolio); + if (folio_test_mappedtodisk(folio)) + folio_set_mappedtodisk(newfolio); - /* Move dirty on pages not done by migrate_page_move_mapping() */ - if (PageDirty(page)) - SetPageDirty(newpage); + /* Move dirty on pages not done by folio_migrate_mapping() */ + if (folio_test_dirty(folio)) + folio_set_dirty(newfolio); - if (page_is_young(page)) - set_page_young(newpage); - if (page_is_idle(page)) - set_page_idle(newpage); + if (folio_test_young(folio)) + folio_set_young(newfolio); + if (folio_test_idle(folio)) + folio_set_idle(newfolio); /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. */ - cpupid = page_cpupid_xchg_last(page, -1); - page_cpupid_xchg_last(newpage, cpupid); + cpupid = page_cpupid_xchg_last(&folio->page, -1); + page_cpupid_xchg_last(&newfolio->page, cpupid); - ksm_migrate_page(newpage, page); + folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). */ - if (PageSwapCache(page)) - ClearPageSwapCache(page); - ClearPagePrivate(page); + if (folio_test_swapcache(folio)) + folio_clear_swapcache(folio); + folio_clear_private(folio); /* page->private contains hugetlb specific flags */ - if (!PageHuge(page)) - set_page_private(page, 0); + if (!folio_test_hugetlb(folio)) + folio->private = NULL; /* * If any waiters have accumulated on the new page then * wake them up. */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); + if (folio_test_writeback(newfolio)) + folio_end_writeback(newfolio); /* * PG_readahead shares the same bit with PG_reclaim. The above * end_page_writeback() may clear PG_readahead mistakenly, so set the * bit after that. */ - if (PageReadahead(page)) - SetPageReadahead(newpage); + if (folio_test_readahead(folio)) + folio_set_readahead(newfolio); - copy_page_owner(page, newpage); + folio_copy_owner(newfolio, folio); - if (!PageHuge(page)) - mem_cgroup_migrate(page, newpage); + if (!folio_test_hugetlb(folio)) + mem_cgroup_migrate(folio, newfolio); } -EXPORT_SYMBOL(migrate_page_states); +EXPORT_SYMBOL(folio_migrate_flags); -void migrate_page_copy(struct page *newpage, struct page *page) +void folio_migrate_copy(struct folio *newfolio, struct folio *folio) { - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - - migrate_page_states(newpage, page); + folio_copy(newfolio, folio); + folio_migrate_flags(newfolio, folio); } -EXPORT_SYMBOL(migrate_page_copy); +EXPORT_SYMBOL(folio_migrate_copy); /************************************************************ * Migration functions @@ -638,19 +635,21 @@ int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { + struct folio *newfolio = page_folio(newpage); + struct folio *folio = page_folio(page); int rc; - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, 0); + rc = folio_migrate_mapping(mapping, newfolio, folio, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(newfolio, folio); else - migrate_page_states(newpage, page); + folio_migrate_flags(newfolio, folio); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -2468,7 +2467,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * @page: struct page to check * * Pinned pages cannot be migrated. This is the same test as in - * migrate_page_move_mapping(), except that here we allow migration of a + * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. */ static bool migrate_vma_check_page(struct page *page) @@ -2846,7 +2845,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) goto abort; /* diff --git a/mm/mlock.c b/mm/mlock.c index 16d2ee160d43..e263d62ae2d0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) /* Phase 1: page isolation */ for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(page); if (TestClearPageMlocked(page)) { /* @@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() here. */ if (TestClearPageLRU(page)) { - lruvec = relock_page_lruvec_irq(page, lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); del_page_from_lru_list(page, lruvec); continue; } else diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4812a17b288c..9c64490171e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -static void wb_domain_writeout_inc(struct wb_domain *dom, +static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, - unsigned int max_prop_frac) + unsigned int max_prop_frac, long nr) { - __fprop_inc_percpu_max(&dom->completions, completions, - max_prop_frac); + __fprop_add_percpu_max(&dom->completions, completions, + max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* @@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, /* * Increment @wb's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). + * completion count. Called from __folio_end_writeback(). */ -static inline void __wb_writeout_inc(struct bdi_writeback *wb) +static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; - inc_wb_stat(wb, WB_WRITTEN); - wb_domain_writeout_inc(&global_wb_domain, &wb->completions, - wb->bdi->max_prop_frac); + wb_stat_mod(wb, WB_WRITTEN, nr); + wb_domain_writeout_add(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) - wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), - wb->bdi->max_prop_frac); + wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) @@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb) unsigned long flags; local_irq_save(flags); - __wb_writeout_inc(wb); + __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); @@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, * write_bandwidth = --------------------------------------------------- * period * - * @written may have decreased due to account_page_redirty(). + * @written may have decreased due to folio_account_redirty(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); @@ -2381,44 +2381,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) } /** - * write_one_page - write out a single page and wait on I/O - * @page: the page to write + * folio_write_one - write out a single folio and wait on I/O. + * @folio: The folio to write. * - * The page must be locked by the caller and will be unlocked upon return. + * The folio must be locked by the caller and will be unlocked upon return. * * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this * function returns. * * Return: %0 on success, negative error code otherwise */ -int write_one_page(struct page *page) +int folio_write_one(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, + .nr_to_write = folio_nr_pages(folio), }; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (clear_page_dirty_for_io(page)) { - get_page(page); - ret = mapping->a_ops->writepage(page, &wbc); + if (folio_clear_dirty_for_io(folio)) { + folio_get(folio); + ret = mapping->a_ops->writepage(&folio->page, &wbc); if (ret == 0) - wait_on_page_writeback(page); - put_page(page); + folio_wait_writeback(folio); + folio_put(folio); } else { - unlock_page(page); + folio_unlock(folio); } if (!ret) ret = filemap_check_errors(mapping); return ret; } -EXPORT_SYMBOL(write_one_page); +EXPORT_SYMBOL(folio_write_one); /* * For address_spaces which do not use buffers nor write back. @@ -2438,29 +2438,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback); * * NOTE: This relies on being atomic wrt interrupts. */ -static void account_page_dirtied(struct page *page, +static void folio_account_dirtied(struct folio *folio, struct address_space *mapping) { struct inode *inode = mapping->host; - trace_writeback_dirty_page(page, mapping); + trace_writeback_dirty_folio(folio, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; + long nr = folio_nr_pages(folio); - inode_attach_wb(inode, page); + inode_attach_wb(inode, &folio->page); wb = inode_to_wb(inode); - __inc_lruvec_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); - __inc_node_page_state(page, NR_DIRTIED); - inc_wb_stat(wb, WB_RECLAIMABLE); - inc_wb_stat(wb, WB_DIRTIED); - task_io_account_write(PAGE_SIZE); - current->nr_dirtied++; - __this_cpu_inc(bdp_ratelimits); + __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + __node_stat_mod_folio(folio, NR_DIRTIED, nr); + wb_stat_mod(wb, WB_RECLAIMABLE, nr); + wb_stat_mod(wb, WB_DIRTIED, nr); + task_io_account_write(nr * PAGE_SIZE); + current->nr_dirtied += nr; + __this_cpu_add(bdp_ratelimits, nr); - mem_cgroup_track_foreign_dirty(page, wb); + mem_cgroup_track_foreign_dirty(folio, wb); } } @@ -2469,130 +2470,152 @@ static void account_page_dirtied(struct page *page, * * Caller must hold lock_page_memcg(). */ -void account_page_cleaned(struct page *page, struct address_space *mapping, +void folio_account_cleaned(struct folio *folio, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_can_writeback(mapping)) { - dec_lruvec_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - dec_wb_stat(wb, WB_RECLAIMABLE); - task_io_account_cancelled_write(PAGE_SIZE); + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + task_io_account_cancelled_write(nr * PAGE_SIZE); } } /* - * Mark the page dirty, and set it dirty in the page cache, and mark the inode - * dirty. + * Mark the folio dirty, and set it dirty in the page cache, and mark + * the inode dirty. * - * If warn is true, then emit a warning if the page is not uptodate and has + * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * * The caller must hold lock_page_memcg(). */ -void __set_page_dirty(struct page *page, struct address_space *mapping, +void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) { unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), + if (folio->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); + folio_account_dirtied(folio, mapping); + __xa_set_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } -/* - * For address_spaces which do not use buffers. Just tag the page as dirty in - * the xarray. +/** + * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. + * @mapping: Address space this folio belongs to. + * @folio: Folio to be marked as dirty. * - * This is also used when a single buffer is being dirtied: we want to set the - * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * Filesystems which do not use buffer heads should call this function + * from their set_page_dirty address space operation. It ignores the + * contents of folio_get_private(), so if the filesystem marks individual + * blocks as dirty, the filesystem should handle that itself. * - * The caller must ensure this doesn't race with truncation. Most will simply - * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and - * the pte lock held, which also locks out truncation. + * This is also sometimes used by filesystems which use buffer_heads when + * a single buffer is being dirtied: we want to set the folio dirty in + * that case, but not all the buffers. This is a "bottom-up" dirtying, + * whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * + * The caller must ensure this doesn't race with truncation. Most will + * simply hold the folio lock, but e.g. zap_pte_range() calls with the + * folio mapped and the pte lock held, which also locks out truncation. */ -int __set_page_dirty_nobuffers(struct page *page) +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); - - if (!mapping) { - unlock_page_memcg(page); - return 1; - } - __set_page_dirty(page, mapping, !PagePrivate(page)); - unlock_page_memcg(page); - - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - } - return 1; + folio_memcg_lock(folio); + if (folio_test_set_dirty(folio)) { + folio_memcg_unlock(folio); + return false; } - unlock_page_memcg(page); - return 0; -} -EXPORT_SYMBOL(__set_page_dirty_nobuffers); -/* - * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to - * systematic errors in balanced_dirty_ratelimit and the dirty pages position - * control. + __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); + folio_memcg_unlock(folio); + + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return true; +} +EXPORT_SYMBOL(filemap_dirty_folio); + +/** + * folio_account_redirty - Manually account for redirtying a page. + * @folio: The folio which is being redirtied. + * + * Most filesystems should call folio_redirty_for_writepage() instead + * of this fuction. If your filesystem is doing writeback outside the + * context of a writeback_control(), it can call this when redirtying + * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED, + * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN, + * WB_WRITTEN) in long term. The mismatches will lead to systematic errors + * in balanced_dirty_ratelimit and the dirty pages position control. */ -void account_page_redirty(struct page *page) +void folio_account_redirty(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; + long nr = folio_nr_pages(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); - current->nr_dirtied--; - dec_node_page_state(page, NR_DIRTIED); - dec_wb_stat(wb, WB_DIRTIED); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } } -EXPORT_SYMBOL(account_page_redirty); +EXPORT_SYMBOL(folio_account_redirty); -/* - * When a writepage implementation decides that it doesn't want to write this - * page for some reason, it should redirty the locked page via - * redirty_page_for_writepage() and it should then unlock the page and return 0 +/** + * folio_redirty_for_writepage - Decline to write a dirty folio. + * @wbc: The writeback control. + * @folio: The folio. + * + * When a writepage implementation decides that it doesn't want to write + * @folio for some reason, it should call this function, unlock @folio and + * return 0. + * + * Return: True if we redirtied the folio. False if someone else dirtied + * it first. */ -int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +bool folio_redirty_for_writepage(struct writeback_control *wbc, + struct folio *folio) { - int ret; + bool ret; + long nr = folio_nr_pages(folio); + + wbc->pages_skipped += nr; + ret = filemap_dirty_folio(folio->mapping, folio); + folio_account_redirty(folio); - wbc->pages_skipped++; - ret = __set_page_dirty_nobuffers(page); - account_page_redirty(page); return ret; } -EXPORT_SYMBOL(redirty_page_for_writepage); +EXPORT_SYMBOL(folio_redirty_for_writepage); -/* - * Dirty a page. +/** + * folio_mark_dirty - Mark a folio as being modified. + * @folio: The folio. * - * For pages with a mapping this should be done under the page lock for the - * benefit of asynchronous memory errors who prefer a consistent dirty state. - * This rule can be broken in some special cases, but should be better not to. + * For folios with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * + * Return: True if the folio was newly dirtied, false if it was already dirty. */ -int set_page_dirty(struct page *page) +bool folio_mark_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); - page = compound_head(page); if (likely(mapping)) { /* * readahead/lru_deactivate_page could remain @@ -2604,17 +2627,17 @@ int set_page_dirty(struct page *page) * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. */ - if (PageReclaim(page)) - ClearPageReclaim(page); - return mapping->a_ops->set_page_dirty(page); + if (folio_test_reclaim(folio)) + folio_clear_reclaim(folio); + return mapping->a_ops->set_page_dirty(&folio->page); } - if (!PageDirty(page)) { - if (!TestSetPageDirty(page)) - return 1; + if (!folio_test_dirty(folio)) { + if (!folio_test_set_dirty(folio)) + return true; } - return 0; + return false; } -EXPORT_SYMBOL(set_page_dirty); +EXPORT_SYMBOL(folio_mark_dirty); /* * set_page_dirty() is racy if the caller has no reference against @@ -2650,49 +2673,49 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void __cancel_dirty_page(struct page *page) +void __folio_cancel_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; - lock_page_memcg(page); + folio_memcg_lock(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); - if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, wb); + if (folio_test_clear_dirty(folio)) + folio_account_cleaned(folio, mapping, wb); unlocked_inode_to_wb_end(inode, &cookie); - unlock_page_memcg(page); + folio_memcg_unlock(folio); } else { - ClearPageDirty(page); + folio_clear_dirty(folio); } } -EXPORT_SYMBOL(__cancel_dirty_page); +EXPORT_SYMBOL(__folio_cancel_dirty); /* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. + * Clear a folio's dirty flag, while caring for dirty memory accounting. + * Returns true if the folio was previously dirty. * - * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the xarray so that a concurrent write-for-sync - * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage - * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and xarray dirty tag - * back into sync. + * This is for preparing to put the folio under writeout. We leave + * the folio tagged as dirty in the xarray so that a concurrent + * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. + * The ->writepage implementation will run either folio_start_writeback() + * or folio_mark_dirty(), at which stage we bring the folio's dirty flag + * and xarray dirty tag back into sync. * - * This incoherency between the page's dirty flag and xarray tag is - * unfortunate, but it only exists while the page is locked. + * This incoherency between the folio's dirty flag and xarray tag is + * unfortunate, but it only exists while the folio is locked. */ -int clear_page_dirty_for_io(struct page *page) +bool folio_clear_dirty_for_io(struct folio *folio) { - struct address_space *mapping = page_mapping(page); - int ret = 0; + struct address_space *mapping = folio_mapping(folio); + bool ret = false; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; @@ -2705,48 +2728,49 @@ int clear_page_dirty_for_io(struct page *page) * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to - * mark the whole page dirty if it was + * mark the whole folio dirty if it was * dirty in a pagetable. Only to then - * (c) clean the page again and return 1 to + * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * - * Note! Normally the "set_page_dirty(page)" + * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * - * We basically use the page "master dirty bit" + * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ - if (page_mkclean(page)) - set_page_dirty(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against - * installing a dirty pte and marking the page dirty + * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the - * page lock while dirtying the page, and pages are + * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. */ wb = unlocked_inode_to_wb_begin(inode, &cookie); - if (TestClearPageDirty(page)) { - dec_lruvec_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - dec_wb_stat(wb, WB_RECLAIMABLE); - ret = 1; + if (folio_test_clear_dirty(folio)) { + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } - return TestClearPageDirty(page); + return folio_test_clear_dirty(folio); } -EXPORT_SYMBOL(clear_page_dirty_for_io); +EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) { @@ -2766,27 +2790,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); } -int test_clear_page_writeback(struct page *page) +bool __folio_end_writeback(struct folio *folio) { - struct address_space *mapping = page_mapping(page); - int ret; + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + bool ret; - lock_page_memcg(page); + folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - ret = TestClearPageWriteback(page); + ret = folio_test_clear_writeback(folio); if (ret) { - __xa_clear_mark(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); - dec_wb_stat(wb, WB_WRITEBACK); - __wb_writeout_inc(wb); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) wb_inode_writeback_end(wb); @@ -2799,32 +2824,34 @@ int test_clear_page_writeback(struct page *page) xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - ret = TestClearPageWriteback(page); + ret = folio_test_clear_writeback(folio); } if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - inc_node_page_state(page, NR_WRITTEN); + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + node_stat_mod_folio(folio, NR_WRITTEN, nr); } - unlock_page_memcg(page); + folio_memcg_unlock(folio); return ret; } -int __test_set_page_writeback(struct page *page, bool keep_write) +bool __folio_start_writeback(struct folio *folio, bool keep_write) { - struct address_space *mapping = page_mapping(page); - int ret, access_ret; + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + bool ret; + int access_ret; - lock_page_memcg(page); + folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xas_lock_irqsave(&xas, flags); xas_load(&xas); - ret = TestSetPageWriteback(page); + ret = folio_test_set_writeback(folio); if (!ret) { bool on_wblist; @@ -2835,84 +2862,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write) if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); - inc_wb_stat(wb, WB_WRITEBACK); + wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) wb_inode_writeback_start(wb); } /* - * We can come through here when swapping anonymous - * pages, so we don't necessarily have an inode to track - * for sync. + * We can come through here when swapping + * anonymous folios, so we don't necessarily + * have an inode to track for sync. */ if (mapping->host && !on_wblist) sb_mark_inode_writeback(mapping->host); } - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { - ret = TestSetPageWriteback(page); + ret = folio_test_set_writeback(folio); } if (!ret) { - inc_lruvec_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); } - unlock_page_memcg(page); - access_ret = arch_make_page_accessible(page); + folio_memcg_unlock(folio); + access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ - VM_BUG_ON_PAGE(access_ret != 0, page); + VM_BUG_ON_FOLIO(access_ret != 0, folio); return ret; - } -EXPORT_SYMBOL(__test_set_page_writeback); +EXPORT_SYMBOL(__folio_start_writeback); -/* - * Wait for a page to complete writeback +/** + * folio_wait_writeback - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. */ -void wait_on_page_writeback(struct page *page) +void folio_wait_writeback(struct folio *folio) { - while (PageWriteback(page)) { - trace_wait_on_page_writeback(page, page_mapping(page)); - wait_on_page_bit(page, PG_writeback); + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + folio_wait_bit(folio, PG_writeback); } } -EXPORT_SYMBOL_GPL(wait_on_page_writeback); +EXPORT_SYMBOL_GPL(folio_wait_writeback); -/* - * Wait for a page to complete writeback. Returns -EINTR if we get a - * fatal signal while waiting. +/** + * folio_wait_writeback_killable - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete or a fatal signal to arrive. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + * Return: 0 on success, -EINTR if we get a fatal signal while waiting. */ -int wait_on_page_writeback_killable(struct page *page) +int folio_wait_writeback_killable(struct folio *folio) { - while (PageWriteback(page)) { - trace_wait_on_page_writeback(page, page_mapping(page)); - if (wait_on_page_bit_killable(page, PG_writeback)) + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + if (folio_wait_bit_killable(folio, PG_writeback)) return -EINTR; } return 0; } -EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable); +EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** - * wait_for_stable_page() - wait for writeback to finish, if necessary. - * @page: The page to wait on. + * folio_wait_stable() - wait for writeback to finish, if necessary. + * @folio: The folio to wait on. * - * This function determines if the given page is related to a backing device - * that requires page contents to be held stable during writeback. If so, then - * it will wait for any pending writeback to complete. + * This function determines if the given folio is related to a backing + * device that requires folio contents to be held stable during writeback. + * If so, then it will wait for any pending writeback to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. */ -void wait_for_stable_page(struct page *page) +void folio_wait_stable(struct folio *folio) { - page = thp_head(page); - if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) - wait_on_page_writeback(page); + if (folio->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) + folio_wait_writeback(folio); } -EXPORT_SYMBOL_GPL(wait_for_stable_page); +EXPORT_SYMBOL_GPL(folio_wait_stable); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23d3339ac4e8..fee18ada46a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -724,7 +724,7 @@ static inline void free_the_page(struct page *page, unsigned int order) void free_compound_page(struct page *page) { - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); free_the_page(page, compound_order(page)); } @@ -5406,6 +5406,18 @@ out: } EXPORT_SYMBOL(__alloc_pages); +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ + struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + preferred_nid, nodemask); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(__folio_alloc); + /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. Use alloc_pages and then kmap if diff --git a/mm/page_io.c b/mm/page_io.c index c493ce9ebcf5..d597bc6e6e45 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -38,7 +38,7 @@ void end_swap_bio_write(struct bio *bio) * Also print a dire warning that things will go BAD (tm) * very quickly. * - * Also clear PG_reclaim to avoid rotate_reclaimable_page() + * Also clear PG_reclaim to avoid folio_rotate_reclaimable() */ set_page_dirty(page); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", @@ -317,7 +317,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, * temporary failure if the system has limited * memory for allocating transmit buffers. * Mark the page dirty and avoid - * rotate_reclaimable_page but rate-limit the + * folio_rotate_reclaimable but rate-limit the * messages but do not flag PageError like * the normal direct-to-bio case as it could * be temporary. diff --git a/mm/page_owner.c b/mm/page_owner.c index 62402d22539b..d24ed221357c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -210,10 +210,10 @@ void __split_page_owner(struct page *page, unsigned int nr) } } -void __copy_page_owner(struct page *oldpage, struct page *newpage) +void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - struct page_ext *old_ext = lookup_page_ext(oldpage); - struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_ext *old_ext = lookup_page_ext(&old->page); + struct page_ext *new_ext = lookup_page_ext(&newfolio->page); struct page_owner *old_page_owner, *new_page_owner; if (unlikely(!old_ext || !new_ext)) @@ -231,11 +231,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* - * We don't clear the bit on the oldpage as it's going to be freed + * We don't clear the bit on the old folio as it's going to be freed * after migration. Until then, the info can be useful in case of * a bug, and the overall stats will be off a bit only temporarily. * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the oldpage to retain the info. But + * migration and then we want the old folio to retain the info. But * in that case we also don't need to explicitly clear the info from * the new page, which will be freed. */ diff --git a/mm/rmap.c b/mm/rmap.c index 6aebd1747251..3a1059c284c3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -34,7 +34,7 @@ * mapping->private_lock (in __set_page_dirty_buffers) * lock_page_memcg move_lock (in __set_page_dirty_buffers) * i_pages lock (widely used) - * lruvec->lru_lock (in lock_page_lruvec_irq) + * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) @@ -981,7 +981,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) return true; } -int page_mkclean(struct page *page) +int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; @@ -991,20 +991,20 @@ int page_mkclean(struct page *page) .invalid_vma = invalid_mkclean_vma, }; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_mapped(page)) + if (!folio_mapped(folio)) return 0; - mapping = page_mapping(page); + mapping = folio_mapping(folio); if (!mapping) return 0; - rmap_walk(page, &rwc); + rmap_walk(&folio->page, &rwc); return cleaned; } -EXPORT_SYMBOL_GPL(page_mkclean); +EXPORT_SYMBOL_GPL(folio_mkclean); /** * page_move_anon_rmap - move a page to our anon_vma diff --git a/mm/shmem.c b/mm/shmem.c index b5860f4a2738..1588f33d009a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -710,7 +710,7 @@ static int shmem_add_to_page_cache(struct page *page, page->index = index; if (!PageSwapCache(page)) { - error = mem_cgroup_charge(page, charge_mm, gfp); + error = mem_cgroup_charge(page_folio(page), charge_mm, gfp); if (error) { if (PageTransHuge(page)) { count_vm_event(THP_FILE_FALLBACK); @@ -1637,6 +1637,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct page *oldpage, *newpage; + struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; @@ -1673,7 +1674,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - mem_cgroup_migrate(oldpage, newpage); + old = page_folio(oldpage); + new = page_folio(newpage); + mem_cgroup_migrate(old, new); __inc_lruvec_page_state(newpage, NR_FILE_PAGES); __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } diff --git a/mm/swap.c b/mm/swap.c index af3cad4e5378..8ff9ba7cf2de 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { + struct folio *folio = page_folio(page); struct lruvec *lruvec; unsigned long flags; - lruvec = lock_page_lruvec_irqsave(page, &flags); + lruvec = folio_lruvec_lock_irqsave(folio, &flags); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); @@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); free_unref_page(page, 0); } @@ -188,12 +189,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(page); /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); (*move_fn)(page, lruvec); SetPageLRU(page); @@ -206,11 +208,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { - if (!PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - add_page_to_lru_list_tail(page, lruvec); - __count_vm_events(PGROTATED, thp_nr_pages(page)); + struct folio *folio = page_folio(page); + + if (!folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, folio_nr_pages(folio)); } } @@ -227,23 +231,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) } /* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. + * Writeback is about to end against a folio which has been marked for + * immediate reclaim. If it still appears to be reclaimable, move it + * to the tail of the inactive list. * - * rotate_reclaimable_page() must disable IRQs, to prevent nasty races. + * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ -void rotate_reclaimable_page(struct page *page) +void folio_rotate_reclaimable(struct folio *folio) { - if (!PageLocked(page) && !PageDirty(page) && - !PageUnevictable(page) && PageLRU(page)) { + if (!folio_test_locked(folio) && !folio_test_dirty(folio) && + !folio_test_unevictable(folio) && folio_test_lru(folio)) { struct pagevec *pvec; unsigned long flags; - get_page(page); + folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); - if (pagevec_add_and_need_flush(pvec, page)) + if (pagevec_add_and_need_flush(pvec, &folio->page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -289,21 +293,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) } while ((lruvec = parent_lruvec(lruvec))); } -void lru_note_cost_page(struct page *page) +void lru_note_cost_folio(struct folio *folio) { - lru_note_cost(mem_cgroup_page_lruvec(page), - page_is_file_lru(page), thp_nr_pages(page)); + lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), + folio_nr_pages(folio)); } -static void __activate_page(struct page *page, struct lruvec *lruvec) +static void __folio_activate(struct folio *folio, struct lruvec *lruvec) { - if (!PageActive(page) && !PageUnevictable(page)) { - int nr_pages = thp_nr_pages(page); + if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); - del_page_from_lru_list(page, lruvec); - SetPageActive(page); - add_page_to_lru_list(page, lruvec); - trace_mm_lru_activate(page); + lruvec_del_folio(lruvec, folio); + folio_set_active(folio); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, @@ -312,6 +316,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec) } #ifdef CONFIG_SMP +static void __activate_page(struct page *page, struct lruvec *lruvec) +{ + return __folio_activate(page_folio(page), lruvec); +} + static void activate_page_drain(int cpu) { struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); @@ -325,16 +334,16 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -static void activate_page(struct page *page) +static void folio_activate(struct folio *folio) { - page = compound_head(page); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + if (folio_test_lru(folio) && !folio_test_active(folio) && + !folio_test_unevictable(folio)) { struct pagevec *pvec; + folio_get(folio); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); - get_page(page); - if (pagevec_add_and_need_flush(pvec, page)) + if (pagevec_add_and_need_flush(pvec, &folio->page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } @@ -345,21 +354,20 @@ static inline void activate_page_drain(int cpu) { } -static void activate_page(struct page *page) +static void folio_activate(struct folio *folio) { struct lruvec *lruvec; - page = compound_head(page); - if (TestClearPageLRU(page)) { - lruvec = lock_page_lruvec_irq(page); - __activate_page(page, lruvec); + if (folio_test_clear_lru(folio)) { + lruvec = folio_lruvec_lock_irq(folio); + __folio_activate(folio, lruvec); unlock_page_lruvec_irq(lruvec); - SetPageLRU(page); + folio_set_lru(folio); } } #endif -static void __lru_cache_activate_page(struct page *page) +static void __lru_cache_activate_folio(struct folio *folio) { struct pagevec *pvec; int i; @@ -380,8 +388,8 @@ static void __lru_cache_activate_page(struct page *page) for (i = pagevec_count(pvec) - 1; i >= 0; i--) { struct page *pagevec_page = pvec->pages[i]; - if (pagevec_page == page) { - SetPageActive(page); + if (pagevec_page == &folio->page) { + folio_set_active(folio); break; } } @@ -399,61 +407,59 @@ static void __lru_cache_activate_page(struct page *page) * When a newly allocated page is not yet visible, so safe for non-atomic ops, * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ -void mark_page_accessed(struct page *page) +void folio_mark_accessed(struct folio *folio) { - page = compound_head(page); - - if (!PageReferenced(page)) { - SetPageReferenced(page); - } else if (PageUnevictable(page)) { + if (!folio_test_referenced(folio)) { + folio_set_referenced(folio); + } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * evictable page accessed has no effect. */ - } else if (!PageActive(page)) { + } else if (!folio_test_active(folio)) { /* * If the page is on the LRU, queue it for activation via * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. */ - if (PageLRU(page)) - activate_page(page); + if (folio_test_lru(folio)) + folio_activate(folio); else - __lru_cache_activate_page(page); - ClearPageReferenced(page); - workingset_activation(page); + __lru_cache_activate_folio(folio); + folio_clear_referenced(folio); + workingset_activation(folio); } - if (page_is_idle(page)) - clear_page_idle(page); + if (folio_test_idle(folio)) + folio_clear_idle(folio); } -EXPORT_SYMBOL(mark_page_accessed); +EXPORT_SYMBOL(folio_mark_accessed); /** - * lru_cache_add - add a page to a page list - * @page: the page to be added to the LRU. + * folio_add_lru - Add a folio to an LRU list. + * @folio: The folio to be added to the LRU. * - * Queue the page for addition to the LRU via pagevec. The decision on whether + * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of lru_cache_add() - * have the page added to the active list using mark_page_accessed(). + * pagevec is drained. This gives a chance for the caller of folio_add_lru() + * have the folio added to the active list using folio_mark_accessed(). */ -void lru_cache_add(struct page *page) +void folio_add_lru(struct folio *folio) { struct pagevec *pvec; - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - get_page(page); + folio_get(folio); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (pagevec_add_and_need_flush(pvec, page)) + if (pagevec_add_and_need_flush(pvec, &folio->page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } -EXPORT_SYMBOL(lru_cache_add); +EXPORT_SYMBOL(folio_add_lru); /** * lru_cache_add_inactive_or_unevictable @@ -888,11 +894,12 @@ void release_pages(struct page **pages, int nr) int i; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; - unsigned long flags; + unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); /* * Make sure the IRQ-safe lock-holding time does not get @@ -904,7 +911,7 @@ void release_pages(struct page **pages, int nr) lruvec = NULL; } - page = compound_head(page); + page = &folio->page; if (is_huge_zero_page(page)) continue; @@ -943,7 +950,7 @@ void release_pages(struct page **pages, int nr) if (PageLRU(page)) { struct lruvec *prev_lruvec = lruvec; - lruvec = relock_page_lruvec_irqsave(page, lruvec, + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); if (prev_lruvec != lruvec) lock_batch = 0; @@ -985,17 +992,18 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) +static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) { - int was_unevictable = TestClearPageUnevictable(page); - int nr_pages = thp_nr_pages(page); + int was_unevictable = folio_test_clear_unevictable(folio); + long nr_pages = folio_nr_pages(folio); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* - * Page becomes evictable in two ways: + * A folio becomes evictable in two ways: * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. - * 2) Before acquiring LRU lock to put the page to correct LRU and then + * 2) Before acquiring LRU lock to put the folio on the correct LRU + * and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] * @@ -1004,35 +1012,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) * * #0: __pagevec_lru_add_fn #1: clear_page_mlock * - * SetPageLRU() TestClearPageMlocked() + * folio_set_lru() folio_test_clear_mlocked() * smp_mb() // explicit ordering // above provides strict * // ordering - * PageMlocked() PageLRU() + * folio_test_mlocked() folio_test_lru() * * - * if '#1' does not observe setting of PG_lru by '#0' and fails - * isolation, the explicit barrier will make sure that page_evictable - * check will put the page in correct LRU. Without smp_mb(), SetPageLRU - * can be reordered after PageMlocked check and can make '#1' to fail - * the isolation of the page whose Mlocked bit is cleared (#0 is also - * looking at the same page) and the evictable page will be stranded - * in an unevictable LRU. + * if '#1' does not observe setting of PG_lru by '#0' and + * fails isolation, the explicit barrier will make sure that + * folio_evictable check will put the folio on the correct + * LRU. Without smp_mb(), folio_set_lru() can be reordered + * after folio_test_mlocked() check and can make '#1' fail the + * isolation of the folio whose mlocked bit is cleared (#0 is + * also looking at the same folio) and the evictable folio will + * be stranded on an unevictable LRU. */ - SetPageLRU(page); + folio_set_lru(folio); smp_mb__after_atomic(); - if (page_evictable(page)) { + if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { - ClearPageActive(page); - SetPageUnevictable(page); + folio_clear_active(folio); + folio_set_unevictable(folio); if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } - add_page_to_lru_list(page, lruvec); - trace_mm_lru_insertion(page); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_insertion(folio); } /* @@ -1046,10 +1055,10 @@ void __pagevec_lru_add(struct pagevec *pvec) unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(pvec->pages[i]); - lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); - __pagevec_lru_add_fn(page, lruvec); + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + __pagevec_lru_add_fn(folio, lruvec); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); diff --git a/mm/swap_state.c b/mm/swap_state.c index bc7cee6b2ec5..8d4104242100 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page, shadow); + workingset_refault(page_folio(page), shadow); /* Caller will initiate read into locked page */ lru_cache_add(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 22d10f713848..e3dcaeecc50f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3534,13 +3534,13 @@ struct swap_info_struct *page_swap_info(struct page *page) } /* - * out-of-line __page_file_ methods to avoid include hell. + * out-of-line methods to avoid include hell. */ -struct address_space *__page_file_mapping(struct page *page) +struct address_space *swapcache_mapping(struct folio *folio) { - return page_swap_info(page)->swap_file->f_mapping; + return page_swap_info(&folio->page)->swap_file->f_mapping; } -EXPORT_SYMBOL_GPL(__page_file_mapping); +EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __page_file_index(struct page *page) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7a9008415534..36e5f6ab976f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -164,7 +164,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL)) goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, diff --git a/mm/util.c b/mm/util.c index bacabe446906..e58151a61255 100644 --- a/mm/util.c +++ b/mm/util.c @@ -654,81 +654,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) } EXPORT_SYMBOL(kvrealloc); -static inline void *__page_rmapping(struct page *page) -{ - unsigned long mapping; - - mapping = (unsigned long)page->mapping; - mapping &= ~PAGE_MAPPING_FLAGS; - - return (void *)mapping; -} - /* Neutral page->mapping pointer to address_space or anon_vma or other */ void *page_rmapping(struct page *page) { - page = compound_head(page); - return __page_rmapping(page); + return folio_raw_mapping(page_folio(page)); } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. */ -bool page_mapped(struct page *page) +bool folio_mapped(struct folio *folio) { - int i; + long i, nr; - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) + if (folio_test_single(folio)) + return atomic_read(&folio->_mapcount) >= 0; + if (atomic_read(folio_mapcount_ptr(folio)) >= 0) return true; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) return false; - for (i = 0; i < compound_nr(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) + + nr = folio_nr_pages(folio); + for (i = 0; i < nr; i++) { + if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0) return true; } return false; } -EXPORT_SYMBOL(page_mapped); +EXPORT_SYMBOL(folio_mapped); struct anon_vma *page_anon_vma(struct page *page) { - unsigned long mapping; + struct folio *folio = page_folio(page); + unsigned long mapping = (unsigned long)folio->mapping; - page = compound_head(page); - mapping = (unsigned long)page->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; - return __page_rmapping(page); + return (void *)(mapping - PAGE_MAPPING_ANON); } -struct address_space *page_mapping(struct page *page) +/** + * folio_mapping - Find the mapping where this folio is stored. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the swap mapping + * this page is stored in (which is different from the mapping for the + * swap file or swap device where the data is stored). + * + * You can call this for folios which aren't in the swap cache or page + * cache and it will return NULL. + */ +struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; - page = compound_head(page); - /* This happens if someone calls flush_dcache_page on slab page */ - if (unlikely(PageSlab(page))) + if (unlikely(folio_test_slab(folio))) return NULL; - if (unlikely(PageSwapCache(page))) { - swp_entry_t entry; + if (unlikely(folio_test_swapcache(folio))) + return swap_address_space(folio_swap_entry(folio)); - entry.val = page_private(page); - return swap_address_space(entry); - } - - mapping = page->mapping; + mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_ANON) return NULL; return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); } -EXPORT_SYMBOL(page_mapping); +EXPORT_SYMBOL(folio_mapping); /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) @@ -750,13 +747,26 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); -void copy_huge_page(struct page *dst, struct page *src) +/** + * folio_copy - Copy the contents of one folio to another. + * @dst: Folio to copy to. + * @src: Folio to copy from. + * + * The bytes in the folio represented by @src are copied to @dst. + * Assumes the caller has validated that @dst is at least as large as @src. + * Can be called in atomic context for order-0 folios, but if the folio is + * larger, it may sleep. + */ +void folio_copy(struct folio *dst, struct folio *src) { - unsigned i, nr = compound_nr(src); + long i = 0; + long nr = folio_nr_pages(src); - for (i = 0; i < nr; i++) { + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; cond_resched(); - copy_highpage(nth_page(dst, i), nth_page(src, i)); } } @@ -1079,3 +1089,14 @@ void page_offline_end(void) up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio) +{ + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + flush_dcache_page(folio_page(folio, i)); +} +EXPORT_SYMBOL(flush_dcache_folio); +#endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 74296c2d1fed..306229c4313f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2090,6 +2090,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ int isolate_lru_page(struct page *page) { + struct folio *folio = page_folio(page); int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); @@ -2099,7 +2100,7 @@ int isolate_lru_page(struct page *page) struct lruvec *lruvec; get_page(page); - lruvec = lock_page_lruvec_irq(page); + lruvec = folio_lruvec_lock_irq(folio); del_page_from_lru_list(page, lruvec); unlock_page_lruvec_irq(lruvec); ret = 0; @@ -2199,7 +2200,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). */ - VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page); + VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page); add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); nr_moved += nr_pages; @@ -4665,6 +4666,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(page); int nr_pages; if (PageTransTail(page)) @@ -4677,7 +4679,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - lruvec = relock_page_lruvec_irq(page, lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); if (page_evictable(page) && PageUnevictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); diff --git a/mm/workingset.c b/mm/workingset.c index d5b81e4f4cbe..109ab978251a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -273,17 +273,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) } /** - * workingset_refault - evaluate the refault of a previously evicted page - * @page: the freshly allocated replacement page - * @shadow: shadow entry of the evicted page + * workingset_refault - Evaluate the refault of a previously evicted folio. + * @folio: The freshly allocated replacement folio. + * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the node and the memcg whose memory + * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ -void workingset_refault(struct page *page, void *shadow) +void workingset_refault(struct folio *folio, void *shadow) { - bool file = page_is_file_lru(page); + bool file = folio_is_file_lru(folio); struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; @@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow) unsigned long refault; bool workingset; int memcgid; + long nr; unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* * Look up the memcg associated with the stored ID. It might - * have been deleted since the page's eviction. + * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled - * for a new cgroup that refaults a shared page. This is + * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging @@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; /* - * The activation decision for this page is made at the level + * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order - * during page reclaim is being determined. + * during folio reclaim is being determined. * - * However, the cgroup that will own the page is the one that + * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. */ - memcg = page_memcg(page); + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); - inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); mem_cgroup_flush_stats(); /* @@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow) if (refault_distance > workingset_size) goto out; - SetPageActive(page); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); + folio_set_active(folio); + workingset_age_nonresident(lruvec, nr); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); - /* Page was active prior to eviction */ + /* Folio was active prior to eviction */ if (workingset) { - SetPageWorkingset(page); + folio_set_workingset(folio); /* XXX: Move to lru_cache_add() when it supports new vs putback */ - lru_note_cost_page(page); - inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); + lru_note_cost_folio(folio); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: rcu_read_unlock(); @@ -393,12 +395,11 @@ out: /** * workingset_activation - note a page activation - * @page: page that is being activated + * @folio: Folio that is being activated. */ -void workingset_activation(struct page *page) +void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); /* @@ -408,11 +409,10 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ - memcg = page_memcg_rcu(page); + memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_page_lruvec(page); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); + workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); out: rcu_read_unlock(); }