xfs: bug fixes for 6.4-rc2

o fixes for inode garbage collection shutdown racing with work queue updates o ensure inodegc workers run on the CPU they are supposed to o disable counter scrubbing until we can exclusively freeze the filesystem from the kernel o Regression fixes for new allocation related bugs o a couple of minor cleanups -----BEGIN PGP SIGNATURE----- iQJIBAABCgAyFiEEmJOoJ8GffZYWSjj/regpR/R1+h0FAmRcSIsUHGRhdmlkQGZy b21vcmJpdC5jb20ACgkQregpR/R1+h2Y8xAAxtsTdOx71XtDuNyfBOiqzZgTCq6b 6LsckJIDQa1AXjUNq9G3zWcUcWBcRWcw+CWbkqjqQ9W47K/ijLuoKnjRsQ+5B4DU TBUctVq+/Zk2lBlb6HKuKdzqDGnIFWGVKVd7u8KlowqnXuzUeQ0vFkT7ZHTepUKG P+midgGNVT4+tykq7oH0H8WxoTyNPZhKiAUcZjneBgA60IAoQWHA2iUt+SKpbrkL 1HyK+/edVMTXiDXtyHfXmDaH9Pgy6NCpw3TNkPDhuL1UDpLhg/zgT39rFZGBsAUt gaDM3wN5jBrot/mvJE3rH9bdZhkcf+NQKPx/1DDg3DL8plS/1/LUC4cImdolBJ3w RNmgJv1lK+AlE4MUJ/bUDlEpHUmwAjnnsxBXwEvnYNfj+9V6/mDB+HqKiY7/XxVK vF77s6z+CWvefdnZavJ4/72pVVJNkcDYCYmvh/donRP6vtnwZyzocFUeBeNMInV1 /s3WMrF9hwmJqAClKG7p1fnszWp658yFIuw/TXVs+NrjTtQgXwMpl2cEYYvUZEJN Trq2p0xH/JSwcnOPSPJO6WHb8UPoqrM6lgGFaJVWJx1AWt1i1CFLf5eA5X+XisDV AJKgpqlnDg02bBMQ0tMFGZUaNx/1S1mwtxcZsyEFTutpUNxqJKDaMohpxxrWb0WC ppSqDvyJN4wtlFI= =qok2 -----END PGP SIGNATURE----- Merge tag 'xfs-6.4-rc1-fixes' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux Pull xfs bug fixes from Dave Chinner: "Largely minor bug fixes and cleanups, the most important of which are probably the fixes for regressions in the extent allocation code: - fixes for inode garbage collection shutdown racing with work queue updates - ensure inodegc workers run on the CPU they are supposed to - disable counter scrubbing until we can exclusively freeze the filesystem from the kernel - regression fixes for new allocation related bugs - a couple of minor cleanups" * tag 'xfs-6.4-rc1-fixes' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: xfs: fix xfs_inodegc_stop racing with mod_delayed_work xfs: disable reaping in fscounters scrub xfs: check that per-cpu inodegc workers actually run on that cpu xfs: explicitly specify cpu 
when forcing inodegc delayed work to run immediately xfs: fix negative array access in xfs_getbmap xfs: don't allocate into the data fork for an unshare request xfs: flush dirty data and drain directios before scrubbing cow fork xfs: set bnobt/cntbt numrecs correctly when formatting new AGs xfs: don't unconditionally null args->pag in xfs_bmap_btalloc_at_eof
2024-09-27 04:47:05 +00:00 · 2023-05-11 16:51:11 -05:00 · 2023-05-11 16:51:11 -05:00 · 849a4f0973
commit 849a4f0973
parent 105131df9c 2254a7396a
14 changed files with 65 additions and 63 deletions
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@ -495,10 +495,12 @@ xfs_freesp_init_recs(
 		ASSERT(start >= mp->m_ag_prealloc_blocks);
 		if (start != mp->m_ag_prealloc_blocks) {
 			/*
-			 * Modify first record to pad stripe align of log
+			 * Modify first record to pad stripe align of log and
+			 * bump the record count.
 			 */
 			arec->ar_blockcount = cpu_to_be32(start -
 						mp->m_ag_prealloc_blocks);
+			be16_add_cpu(&block->bb_numrecs, 1);
 			nrec = arec + 1;

 			/*
@ -509,7 +511,6 @@ xfs_freesp_init_recs(
 					be32_to_cpu(arec->ar_startblock) +
 					be32_to_cpu(arec->ar_blockcount));
 			arec = nrec;
-			be16_add_cpu(&block->bb_numrecs, 1);
 		}
 		/*
 		 * Change record start to after the internal log
@ -518,15 +519,13 @@ xfs_freesp_init_recs(
 	}

 	/*
-	 * Calculate the record block count and check for the case where
-	 * the log might have consumed all available space in the AG. If
-	 * so, reset the record count to 0 to avoid exposure of an invalid
-	 * record start block.
+	 * Calculate the block count of this record; if it is nonzero,
+	 * increment the record count.
 	 */
 	arec->ar_blockcount = cpu_to_be32(id->agsize -
 					  be32_to_cpu(arec->ar_startblock));
-	if (!arec->ar_blockcount)
-		block->bb_numrecs = 0;
+	if (arec->ar_blockcount)
+		be16_add_cpu(&block->bb_numrecs, 1);
 }

 /*
@ -538,7 +537,7 @@ xfs_bnoroot_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno);
 	xfs_freesp_init_recs(mp, bp, id);
 }

@ -548,7 +547,7 @@ xfs_cntroot_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno);
 	xfs_freesp_init_recs(mp, bp, id);
 }

--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@ -3494,8 +3494,10 @@ xfs_bmap_btalloc_at_eof(
 		if (!caller_pag)
 			args->pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ap->blkno));
 		error = xfs_alloc_vextent_exact_bno(args, ap->blkno);
-		if (!caller_pag)
+		if (!caller_pag) {
 			xfs_perag_put(args->pag);
+			args->pag = NULL;
+		}
 		if (error)
 			return error;

@ -3505,7 +3507,6 @@ xfs_bmap_btalloc_at_eof(
 		 * Exact allocation failed. Reset to try an aligned allocation
 		 * according to the original allocation specification.
 		 */
-		args->pag = NULL;
 		args->alignment = stripe_align;
 		args->minlen = nextminlen;
 		args->minalignslop = 0;
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@ -42,12 +42,12 @@ xchk_setup_inode_bmap(
 	xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);

 	/*
-	 * We don't want any ephemeral data fork updates sitting around
+	 * We don't want any ephemeral data/cow fork updates sitting around
 	 * while we inspect block mappings, so wait for directio to finish
 	 * and flush dirty data if we have delalloc reservations.
 	 */
 	if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
-	    sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+	    sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
 		struct address_space	*mapping = VFS_I(sc->ip)->i_mapping;

 		sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@ -1164,32 +1164,6 @@ xchk_metadata_inode_forks(
 	return 0;
 }

-/* Pause background reaping of resources. */
-void
-xchk_stop_reaping(
-	struct xfs_scrub	*sc)
-{
-	sc->flags |= XCHK_REAPING_DISABLED;
-	xfs_blockgc_stop(sc->mp);
-	xfs_inodegc_stop(sc->mp);
-}
-
-/* Restart background reaping of resources. */
-void
-xchk_start_reaping(
-	struct xfs_scrub	*sc)
-{
-	/*
-	 * Readonly filesystems do not perform inactivation or speculative
-	 * preallocation, so there's no need to restart the workers.
-	 */
-	if (!xfs_is_readonly(sc->mp)) {
-		xfs_inodegc_start(sc->mp);
-		xfs_blockgc_start(sc->mp);
-	}
-	sc->flags &= ~XCHK_REAPING_DISABLED;
-}
-
 /*
 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
 * operation.  Callers must not hold any locks that intersect with the CPU
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@ -156,8 +156,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
 }

 int xchk_metadata_inode_forks(struct xfs_scrub *sc);
-void xchk_stop_reaping(struct xfs_scrub *sc);
-void xchk_start_reaping(struct xfs_scrub *sc);

 /*
 * Setting up a hook to wait for intents to drain is costly -- we have to take
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@ -150,13 +150,6 @@ xchk_setup_fscounters(
 	if (error)
 		return error;

-	/*
-	 * Pause background reclaim while we're scrubbing to reduce the
-	 * likelihood of background perturbations to the counters throwing off
-	 * our calculations.
-	 */
-	xchk_stop_reaping(sc);
-
 	return xchk_trans_alloc(sc, 0);
 }

@ -453,6 +446,12 @@ xchk_fscounters(
 	if (frextents > mp->m_sb.sb_rextents)
 		xchk_set_corrupt(sc);

+	/*
+	 * XXX: We can't quiesce percpu counter updates, so exit early.
+	 * This can be re-enabled when we gain exclusive freeze functionality.
+	 */
+	return 0;
+
 	/*
 	 * If ifree exceeds icount by more than the minimum variance then
 	 * something's probably wrong with the counters.
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@ -186,8 +186,6 @@ xchk_teardown(
 	}
 	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
 		mnt_drop_write_file(sc->file);
-	if (sc->flags & XCHK_REAPING_DISABLED)
-		xchk_start_reaping(sc);
 	if (sc->buf) {
 		if (sc->buf_cleanup)
 			sc->buf_cleanup(sc->buf);
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@ -106,7 +106,6 @@ struct xfs_scrub {

 /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
 #define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
-#define XCHK_REAPING_DISABLED	(1 << 1)  /* background block reaping paused */
 #define XCHK_FSGATES_DRAIN	(1 << 2)  /* defer ops draining enabled */
 #define XCHK_NEED_DRAIN		(1 << 3)  /* scrub needs to drain defer ops */
 #define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@ -98,7 +98,6 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);

 #define XFS_SCRUB_STATE_STRINGS \
 	{ XCHK_TRY_HARDER,			"try_harder" }, \
-	{ XCHK_REAPING_DISABLED,		"reaping_disabled" }, \
 	{ XCHK_FSGATES_DRAIN,			"fsgates_drain" }, \
 	{ XCHK_NEED_DRAIN,			"need_drain" }, \
 	{ XREP_ALREADY_FIXED,			"already_fixed" }
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@ -558,7 +558,9 @@ xfs_getbmap(
 		if (!xfs_iext_next_extent(ifp, &icur, &got)) {
 			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));

-			out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
+			if (bmv->bmv_entries > 0)
+				out[bmv->bmv_entries - 1].bmv_oflags |=
+								BMV_OF_LAST;

 			if (whichfork != XFS_ATTR_FORK && bno < end &&
 			    !xfs_getbmap_full(bmv)) {
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@ -435,18 +435,23 @@ xfs_iget_check_free_state(
 }

 /* Make all pending inactivation work start immediately. */
-static void
+static bool
 xfs_inodegc_queue_all(
 	struct xfs_mount	*mp)
 {
 	struct xfs_inodegc	*gc;
 	int			cpu;
+	bool			ret = false;

 	for_each_online_cpu(cpu) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
-		if (!llist_empty(&gc->list))
+		if (!llist_empty(&gc->list)) {
 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+			ret = true;
+		}
 	}
+
+	return ret;
 }

 /*
@ -1856,6 +1861,8 @@ xfs_inodegc_worker(
 	struct xfs_inode	*ip, *n;
 	unsigned int		nofs_flag;

+	ASSERT(gc->cpu == smp_processor_id());
+
 	WRITE_ONCE(gc->items, 0);

 	if (!node)
@ -1909,24 +1916,41 @@ xfs_inodegc_flush(

 /*
 * Flush all the pending work and then disable the inode inactivation background
- * workers and wait for them to stop.
+ * workers and wait for them to stop.  Caller must hold sb->s_umount to
+ * coordinate changes in the inodegc_enabled state.
 */
 void
 xfs_inodegc_stop(
 	struct xfs_mount	*mp)
 {
+	bool			rerun;
+
 	if (!xfs_clear_inodegc_enabled(mp))
 		return;

+	/*
+	 * Drain all pending inodegc work, including inodes that could be
+	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
+	 * threads that sample the inodegc state just prior to us clearing it.
+	 * The inodegc flag state prevents new threads from queuing more
+	 * inodes, so we queue pending work items and flush the workqueue until
+	 * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
+	 * here because it does not allow other unserialized mechanisms to
+	 * reschedule inodegc work while this draining is in progress.
+	 */
 	xfs_inodegc_queue_all(mp);
-	drain_workqueue(mp->m_inodegc_wq);
+	do {
+		flush_workqueue(mp->m_inodegc_wq);
+		rerun = xfs_inodegc_queue_all(mp);
+	} while (rerun);

 	trace_xfs_inodegc_stop(mp, __return_address);
 }

 /*
 * Enable the inode inactivation background workers and schedule deferred inode
- * inactivation work if there is any.
+ * inactivation work if there is any.  Caller must hold sb->s_umount to
+ * coordinate changes in the inodegc_enabled state.
 */
 void
 xfs_inodegc_start(
@ -2069,7 +2093,8 @@ xfs_inodegc_queue(
 		queue_delay = 0;

 	trace_xfs_inodegc_queue(mp, __return_address);
-	mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+			queue_delay);
 	put_cpu_ptr(gc);

 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
@ -2113,7 +2138,8 @@ xfs_inodegc_cpu_dead(

 	if (xfs_is_inodegc_enabled(mp)) {
 		trace_xfs_inodegc_queue(mp, __return_address);
-		mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
+		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+				0);
 	}
 	put_cpu_ptr(gc);
 }
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@ -1006,8 +1006,9 @@ xfs_buffered_write_iomap_begin(
 	if (eof)
 		imap.br_startoff = end_fsb; /* fake hole until the end */

-	/* We never need to allocate blocks for zeroing a hole. */
-	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+	/* We never need to allocate blocks for zeroing or unsharing a hole. */
+	if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) &&
+	    imap.br_startoff > offset_fsb) {
 		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
 		goto out_unlock;
 	}
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@ -66,6 +66,9 @@ struct xfs_inodegc {
 	/* approximate count of inodes in the list */
 	unsigned int		items;
 	unsigned int		shrinker_hits;
+#if defined(DEBUG) || defined(XFS_WARN)
+	unsigned int		cpu;
+#endif
 };

 /*
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@ -1095,6 +1095,9 @@ xfs_inodegc_init_percpu(

 	for_each_possible_cpu(cpu) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
+#if defined(DEBUG) || defined(XFS_WARN)
+		gc->cpu = cpu;
+#endif
 		init_llist_head(&gc->list);
 		gc->items = 0;
 		INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);