From 255794c7ed7adb914e831f5e4905d783d31378d2 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Mon, 7 Jun 2021 09:34:49 -0700
Subject: [PATCH 1/3] xfs: only reset incore inode health state flags when
 reclaiming an inode

While running some fuzz tests on inode metadata, I noticed that the
filesystem health report (as provided by xfs_spaceman) failed to report
the file corruption even when spaceman was run immediately after running
xfs_scrub to detect the corruption. That isn't the intended behavior;
one ought to be able to run scrub to detect errors in the ondisk
metadata and be able to access those reports for some time after the
scrub.

After running the same sequence through an instrumented kernel, I
discovered the reason why -- scrub igets the file, scans it, marks it
sick, and ireleases the inode. When the VFS lets go of the incore inode,
it moves to RECLAIMABLE state. If spaceman igets the incore inode before
it moves to RECLAIM state, iget reinitializes the VFS state, clears the
sick and checked masks, and hands back the inode. At this point, the
caller has the exact same incore inode, but with all the health state
erased.

In other words, we're erasing the incore inode's health state flags when
we've decided NOT to sever the link between the incore inode and the
ondisk inode. This is wrong, so we need to remove the lines that zero
the fields from xfs_iget_cache_hit.

As a precaution, we add the same lines into xfs_reclaim_inode just after
we sever the link between incore and ondisk inode. Strictly speaking
this isn't necessary because once an inode has gone through reclaim it
must go through xfs_inode_alloc (which also zeroes the state) and
xfs_iget is careful to check for mismatches between the inode it pulls
out of the radix tree and the one it wants.

Fixes: 6772c1f11206 ("xfs: track metadata health status")
Signed-off-by: Darrick J. Wong
Reviewed-by: Brian Foster
Reviewed-by: Dave Chinner
Reviewed-by: Carlos Maiolino
---
 fs/xfs/xfs_icache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0cd29a2f9da5..c0d288e4d0fd 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -523,9 +523,6 @@ xfs_iget_cache_hit(
 				   XFS_INO_TO_AGINO(pag->pag_mount, ino),
 				   XFS_ICI_RECLAIM_TAG);
 		inode->i_state = I_NEW;
-		ip->i_sick = 0;
-		ip->i_checked = 0;
-
 		spin_unlock(&ip->i_flags_lock);
 		spin_unlock(&pag->pag_ici_lock);
 	} else {
@@ -979,6 +976,8 @@ xfs_reclaim_inode(
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
 	ip->i_ino = 0;
+	ip->i_sick = 0;
+	ip->i_checked = 0;
 	spin_unlock(&ip->i_flags_lock);
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
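
To make the rule above concrete, the following stand-alone sketch models
it in plain C. Everything here (the toy_* names, the linked field) is a
hypothetical simplification for illustration, not the kernel
implementation: a cache hit keeps the incore/ondisk link and so must
preserve i_sick/i_checked, while reclaim severs the link and may zero
them.

/*
 * Minimal stand-alone model of the health-state lifecycle rule
 * (hypothetical toy_* names; not the kernel implementation).
 */
#include <stdint.h>
#include <stdio.h>

struct toy_inode {
	uint16_t	i_sick;		/* sick metadata bitmask */
	uint16_t	i_checked;	/* checked metadata bitmask */
	int		linked;		/* incore<->ondisk link intact? */
};

/* Cache hit: the incore inode is reused and the link to the ondisk
 * inode is NOT severed, so the health state must survive. */
static void toy_iget_cache_hit(struct toy_inode *ip)
{
	(void)ip;	/* nothing to do: the pre-patch bug zeroed
			 * i_sick/i_checked right here */
}

/* Reclaim: the link is severed, so zeroing health state is correct. */
static void toy_reclaim_inode(struct toy_inode *ip)
{
	ip->linked = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
}

int main(void)
{
	struct toy_inode ip = { .linked = 1 };

	ip.i_sick = 0x1;		/* scrub marks the inode sick */
	ip.i_checked = 0x1;
	toy_iget_cache_hit(&ip);	/* spaceman re-igets the inode */
	printf("after cache hit: sick=%#x\n", (unsigned)ip.i_sick);
	toy_reclaim_inode(&ip);
	printf("after reclaim:   sick=%#x\n", (unsigned)ip.i_sick);
	return 0;
}

Run as-is, the sketch prints sick=0x1 after the cache hit and sick=0
after reclaim, mirroring the behavior the patch establishes.
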
From 7975e465af6b46e9d0eaf94f764922dc92b28d9c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Mon, 7 Jun 2021 09:34:50 -0700
Subject: [PATCH 2/3] xfs: drop IDONTCACHE on inodes when we mark them sick

When we decide to mark an inode sick, clear the DONTCACHE flag so that
the incore inode will be kept around until memory pressure forces it out
of memory. This increases the chances that the sick status will be
caught by someone compiling a health report later on.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Carlos Maiolino
---
 fs/xfs/xfs_health.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 5de3195f6cb2..eb10eacabc8f 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -229,6 +229,15 @@ xfs_inode_mark_sick(
 	ip->i_sick |= mask;
 	ip->i_checked |= mask;
 	spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * Keep this inode around so we don't lose the sickness report.  Scrub
+	 * grabs inodes with DONTCACHE assuming that most inodes are ok, which
+	 * is not the case here.
+	 */
+	spin_lock(&VFS_I(ip)->i_lock);
+	VFS_I(ip)->i_state &= ~I_DONTCACHE;
+	spin_unlock(&VFS_I(ip)->i_lock);
 }
 
 /* Mark parts of an inode healed. */
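
The DONTCACHE interaction can be modeled the same way. This sketch is
again a hypothetical toy (the toy_* names are invented); it only
illustrates the semantics the patch relies on: an inode holding an
I_DONTCACHE-style flag is evicted at the final iput, while clearing the
flag lets it linger in cache where a later health report can still find
it.

/*
 * Toy model of the DONTCACHE effect (hypothetical toy_* names; the
 * real I_DONTCACHE handling lives in the VFS).
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_I_DONTCACHE	(1U << 0)

struct toy_vfs_inode {
	unsigned int	i_state;
	bool		cached;	/* still in the inode cache after iput? */
};

/* Simplified final iput: DONTCACHE inodes are evicted immediately;
 * everything else lingers until memory pressure pushes it out. */
static void toy_iput_final(struct toy_vfs_inode *inode)
{
	inode->cached = !(inode->i_state & TOY_I_DONTCACHE);
}

/* What xfs_inode_mark_sick now does to the VFS inode: drop DONTCACHE
 * so the sickness report survives scrub's final iput. */
static void toy_mark_sick(struct toy_vfs_inode *inode)
{
	inode->i_state &= ~TOY_I_DONTCACHE;
}

int main(void)
{
	struct toy_vfs_inode inode = { .i_state = TOY_I_DONTCACHE };

	toy_mark_sick(&inode);	/* scrub found corruption */
	toy_iput_final(&inode);	/* scrub drops its reference */
	printf("report still reachable: %s\n", inode.cached ? "yes" : "no");
	return 0;
}

Without the toy_mark_sick() call (the pre-patch behavior), the same
sequence prints "no" and the sickness report would be lost with the
evicted inode.
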
From 9492750a8b18f02a8dec2aab594c59aabe2e4d0d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Mon, 7 Jun 2021 09:34:50 -0700
Subject: [PATCH 3/3] xfs: selectively keep sick inodes in memory

It's important that the filesystem retain its memory of sick inodes for
a little while after problems are found so that reports can be collected
about what was wrong. Don't let inode reclamation free sick inodes
unless we're unmounting or the fs already went down.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Carlos Maiolino
---
 fs/xfs/xfs_icache.c | 45 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c0d288e4d0fd..6f1383bf706a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -71,10 +71,13 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
 /* Stop scanning after icw_scan_limit inodes. */
 #define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)
 
+#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
+
 #define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_DROP_UDQUOT | \
 					 XFS_ICWALK_FLAG_DROP_GDQUOT | \
 					 XFS_ICWALK_FLAG_DROP_PDQUOT | \
-					 XFS_ICWALK_FLAG_SCAN_LIMIT)
+					 XFS_ICWALK_FLAG_SCAN_LIMIT | \
+					 XFS_ICWALK_FLAG_RECLAIM_SICK)
 
 /*
  * Allocate and initialise an xfs_inode.
@@ -910,7 +913,8 @@ xfs_dqrele_all_inodes(
  */
 static bool
 xfs_reclaim_igrab(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	struct xfs_eofblocks	*eofb)
 {
 	ASSERT(rcu_read_lock_held());
 
@@ -921,6 +925,14 @@ xfs_reclaim_igrab(
 		spin_unlock(&ip->i_flags_lock);
 		return false;
 	}
+
+	/* Don't reclaim a sick inode unless the caller asked for it. */
+	if (ip->i_sick &&
+	    (!eofb || !(eofb->eof_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
+		spin_unlock(&ip->i_flags_lock);
+		return false;
+	}
+
 	__xfs_iflags_set(ip, XFS_IRECLAIM);
 	spin_unlock(&ip->i_flags_lock);
 	return true;
@@ -1021,13 +1033,30 @@ xfs_reclaim_inode(
 	xfs_iflags_clear(ip, XFS_IRECLAIM);
 }
 
+/* Reclaim sick inodes if we're unmounting or the fs went down. */
+static inline bool
+xfs_want_reclaim_sick(
+	struct xfs_mount	*mp)
+{
+	return (mp->m_flags & XFS_MOUNT_UNMOUNTING) ||
+	       (mp->m_flags & XFS_MOUNT_NORECOVERY) ||
+	       XFS_FORCED_SHUTDOWN(mp);
+}
+
 void
 xfs_reclaim_inodes(
 	struct xfs_mount	*mp)
 {
+	struct xfs_eofblocks	eofb = {
+		.eof_flags	= 0,
+	};
+
+	if (xfs_want_reclaim_sick(mp))
+		eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
 	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 		xfs_ail_push_all_sync(mp->m_ail);
-		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
+		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb);
 	}
 }
 
@@ -1048,6 +1077,9 @@ xfs_reclaim_inodes_nr(
 		.icw_scan_limit	= nr_to_scan,
 	};
 
+	if (xfs_want_reclaim_sick(mp))
+		eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
 	/* kick background reclaimer and push the AIL */
 	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
@@ -1597,7 +1629,8 @@ xfs_blockgc_free_quota(
 static inline bool
 xfs_icwalk_igrab(
 	enum xfs_icwalk_goal	goal,
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	struct xfs_eofblocks	*eofb)
 {
 	switch (goal) {
 	case XFS_ICWALK_DQRELE:
@@ -1605,7 +1638,7 @@ xfs_icwalk_igrab(
 	case XFS_ICWALK_BLOCKGC:
 		return xfs_blockgc_igrab(ip);
 	case XFS_ICWALK_RECLAIM:
-		return xfs_reclaim_igrab(ip);
+		return xfs_reclaim_igrab(ip, eofb);
 	default:
 		return false;
 	}
@@ -1694,7 +1727,7 @@ xfs_icwalk_ag(
 	for (i = 0; i < nr_found; i++) {
 		struct xfs_inode	*ip = batch[i];
 
-		if (done || !xfs_icwalk_igrab(goal, ip))
+		if (done || !xfs_icwalk_igrab(goal, ip, eofb))
 			batch[i] = NULL;
 
 		/*
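
To close, here is a stand-alone sketch of the gating logic this patch
adds. The toy_* names and flag values are hypothetical stand-ins for
xfs_want_reclaim_sick() and the new check in xfs_reclaim_igrab(): sick
inodes are skipped by the reclaim walk unless the caller set the
RECLAIM_SICK flag, which happens only when the filesystem is unmounting,
was mounted norecovery, or has shut down.

/*
 * Toy model of the sick-inode reclaim gating (hypothetical toy_*
 * names; not the kernel implementation).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_MOUNT_UNMOUNTING	(1U << 0)
#define TOY_MOUNT_NORECOVERY	(1U << 1)
#define TOY_FORCED_SHUTDOWN	(1U << 2)
#define TOY_WALK_RECLAIM_SICK	(1U << 27)

struct toy_mount { unsigned int m_flags; };
struct toy_walk  { unsigned int flags; };
struct toy_inode { uint16_t i_sick; };

/* Mirrors xfs_want_reclaim_sick(): only lose sickness state when the
 * fs is going away anyway. */
static bool toy_want_reclaim_sick(const struct toy_mount *mp)
{
	return mp->m_flags & (TOY_MOUNT_UNMOUNTING |
			      TOY_MOUNT_NORECOVERY |
			      TOY_FORCED_SHUTDOWN);
}

/* Mirrors the new xfs_reclaim_igrab() check: skip sick inodes unless
 * the walk was flagged to reclaim them. */
static bool toy_reclaim_igrab(const struct toy_inode *ip,
			      const struct toy_walk *walk)
{
	if (ip->i_sick &&
	    (!walk || !(walk->flags & TOY_WALK_RECLAIM_SICK)))
		return false;	/* keep it in memory for reporting */
	return true;
}

int main(void)
{
	struct toy_mount mp = { 0 };
	struct toy_inode ip = { .i_sick = 0x1 };
	struct toy_walk walk = { 0 };

	if (toy_want_reclaim_sick(&mp))
		walk.flags |= TOY_WALK_RECLAIM_SICK;
	printf("reclaimed while mounted: %d\n", toy_reclaim_igrab(&ip, &walk));

	mp.m_flags |= TOY_MOUNT_UNMOUNTING;
	walk.flags = 0;
	if (toy_want_reclaim_sick(&mp))
		walk.flags |= TOY_WALK_RECLAIM_SICK;
	printf("reclaimed at unmount:    %d\n", toy_reclaim_igrab(&ip, &walk));
	return 0;
}

Run as-is it prints 0 then 1: the sick inode stays in memory during
normal operation and is only freed once the filesystem is going away.
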