diff --git a/fs/inode.c b/fs/inode.c index ed0cab8a32db..a49695f57e1e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -428,11 +428,20 @@ void ihold(struct inode *inode) } EXPORT_SYMBOL(ihold); -static void inode_lru_list_add(struct inode *inode) +static void __inode_add_lru(struct inode *inode, bool rotate) { + if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + return; + if (atomic_read(&inode->i_count)) + return; + if (!(inode->i_sb->s_flags & SB_ACTIVE)) + return; + if (!mapping_shrinkable(&inode->i_data)) + return; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - else + else if (rotate) inode->i_state |= I_REFERENCED; } @@ -443,16 +452,11 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | - I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) - inode_lru_list_add(inode); + __inode_add_lru(inode, false); } - static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -728,10 +732,6 @@ again: /* * Isolate the inode from the LRU in preparation for freeing it. * - * Any inodes which are pinned purely because of attached pagecache have their - * pagecache removed. If the inode has metadata buffers attached to - * mapping->private_list then try to remove them. - * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another @@ -747,31 +747,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item, struct inode *inode = container_of(item, struct inode, i_lru); /* - * we are inverting the lru lock/inode->i_lock here, so use a trylock. - * If we fail to get the lock, just skip it. + * We are inverting the lru lock/inode->i_lock here, so use a + * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* - * Referenced or dirty inodes are still in use. Give them another pass - * through the LRU as we canot reclaim them now. + * Inodes can get referenced, redirtied, or repopulated while + * they're already on the LRU, and this can make them + * unreclaimable for a while. Remove them lazily here; iput, + * sync, or the last page cache deletion will requeue them. */ if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { + (inode->i_state & ~I_REFERENCED) || + !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } - /* recently referenced inodes get one more pass */ + /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } + /* + * On highmem systems, mapping_shrinkable() permits dropping + * page cache in order to free up struct inodes: lowmem might + * be under pressure before the cache inside the highmem zone. + */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { __iget(inode); spin_unlock(&inode->i_lock); @@ -1638,7 +1646,7 @@ static void iput_final(struct inode *inode) if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { - inode_add_lru(inode); + __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } diff --git a/fs/internal.h b/fs/internal.h index 3cd065c8a66b..2854ff29f116 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -149,7 +149,6 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 226de651f52e..de35a2640e70 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3193,6 +3193,7 @@ static inline void remove_inode_hash(struct inode *inode) } extern void inode_sb_list_add(struct inode *inode); +extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..5c74a45ff97a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -23,6 +23,56 @@ static inline bool mapping_empty(struct address_space *mapping) return xa_empty(&mapping->i_pages); } +/* + * mapping_shrinkable - test if page cache state allows inode reclaim + * @mapping: the page cache mapping + * + * This checks the mapping's cache state for the pupose of inode + * reclaim and LRU management. + * + * The caller is expected to hold the i_lock, but is not required to + * hold the i_pages lock, which usually protects cache state. That's + * because the i_lock and the list_lru lock that protect the inode and + * its LRU state don't nest inside the irq-safe i_pages lock. + * + * Cache deletions are performed under the i_lock, which ensures that + * when an inode goes empty, it will reliably get queued on the LRU. + * + * Cache additions do not acquire the i_lock and may race with this + * check, in which case we'll report the inode as shrinkable when it + * has cache pages. This is okay: the shrinker also checks the + * refcount and the referenced bit, which will be elevated or set in + * the process of adding new cache pages to an inode. + */ +static inline bool mapping_shrinkable(struct address_space *mapping) +{ + void *head; + + /* + * On highmem systems, there could be lowmem pressure from the + * inodes before there is highmem pressure from the page + * cache. Make inodes shrinkable regardless of cache state. + */ + if (IS_ENABLED(CONFIG_HIGHMEM)) + return true; + + /* Cache completely empty? Shrink away. */ + head = rcu_access_pointer(mapping->i_pages.xa_head); + if (!head) + return true; + + /* + * The xarray stores single offset-0 entries directly in the + * head pointer, which allows non-resident page cache entries + * to escape the shadow shrinker's list of xarray nodes. The + * inode shrinker needs to pick them up under memory pressure. + */ + if (!xa_is_node(head) && xa_is_value(head)) + return true; + + return false; +} + /* * Bits in mapping->flags. */ diff --git a/mm/filemap.c b/mm/filemap.c index b6140debc2da..06de9a17499f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -262,9 +262,13 @@ void delete_from_page_cache(struct page *page) struct address_space *mapping = page_mapping(page); BUG_ON(!PageLocked(page)); + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __delete_from_page_cache(page, NULL); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); page_cache_free_page(mapping, page); } @@ -340,6 +344,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); @@ -348,6 +353,9 @@ void delete_from_page_cache_batch(struct address_space *mapping, } page_cache_delete_batch(mapping, pvec); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); diff --git a/mm/truncate.c b/mm/truncate.c index 714eaf19821d..cc83a3f7c1ad 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -45,9 +45,13 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); } /* @@ -73,8 +77,10 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; dax = dax_mapping(mapping); - if (!dax) + if (!dax) { + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); + } for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -93,8 +99,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, __clear_shadow_entry(mapping, index, page); } - if (!dax) + if (!dax) { xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + } pvec->nr = j; } @@ -567,6 +577,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (PageDirty(page)) goto failed; @@ -574,6 +585,9 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -582,6 +596,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: xa_unlock_irq(&mapping->i_pages); + spin_unlock(&mapping->host->i_lock); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 41f5f6007c30..26f07518d7a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1190,6 +1190,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); + if (!PageSwapCache(page)) + spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy page. @@ -1258,6 +1260,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); if (freepage != NULL) freepage(page); @@ -1267,6 +1272,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: xa_unlock_irq(&mapping->i_pages); + if (!PageSwapCache(page)) + spin_unlock(&mapping->host->i_lock); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index d5b81e4f4cbe..23df60ce2e10 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -543,6 +543,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } + if (!spin_trylock(&mapping->host->i_lock)) { + xa_unlock(&mapping->i_pages); + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } + list_lru_isolate(lru, item); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); @@ -562,6 +569,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, out_invalid: xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); ret = LRU_REMOVED_RETRY; out: cond_resched();