linux-stable/fs/xfs/libxfs/xfs_trans_inode.c
Dave Chinner 82842fee6e xfs: fix AGF vs inode cluster buffer deadlock
Lock order in XFS is AGI -> AGF, hence for operations involving
inode unlinked list operations we always lock the AGI first. Inode
unlinked list operations operate on the inode cluster buffer,
so the lock order there is AGI -> inode cluster buffer.

For O_TMPFILE operations, this now means the lock order set down in
xfs_rename and xfs_link is AGI -> inode cluster buffer -> AGF as the
unlinked ops are done before the directory modifications that may
allocate space and lock the AGF.

Unfortunately, we also now lock the inode cluster buffer when
logging an inode so that we can attach the inode to the cluster
buffer and pin it in memory. This creates a lock order of AGF ->
inode cluster buffer in directory operations as we have to log the
inode after we've allocated new space for it.

This creates a lock inversion between the AGF and the inode cluster
buffer. Because the inode cluster buffer is shared across multiple
inodes, the inversion is not specific to individual inodes but can
occur when inodes in the same cluster buffer are accessed in
different orders.

To fix this we need move all the inode log item cluster buffer
interactions to the end of the current transaction. Unfortunately,
xfs_trans_log_inode() calls are littered throughout the transactions
with no thought to ordering against other items or locking. This
makes it difficult to do anything that involves changing the call
sites of xfs_trans_log_inode() to change locking orders.

However, we do now have a mechanism that allows is to postpone dirty
item processing to just before we commit the transaction: the
->iop_precommit method. This will be called after all the
modifications are done and high level objects like AGI and AGF
buffers have been locked and modified, thereby providing a mechanism
that guarantees we don't lock the inode cluster buffer before those
high level objects are locked.

This change is largely moving the guts of xfs_trans_log_inode() to
xfs_inode_item_precommit() and providing an extra flag context in
the inode log item to track the dirty state of the inode in the
current transaction. This also means we do a lot less repeated work
in xfs_trans_log_inode() by only doing it once per transaction when
all the work is done.

Fixes: 298f7bec50 ("xfs: pin inode backing buffer to the inode log item")
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-06-05 04:08:27 +10:00

128 lines
3.3 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include <linux/iversion.h>
/*
* Add a locked inode to the transaction.
*
* The inode must be locked, and it cannot be associated with any transaction.
* If lock_flags is non-zero the inode will be unlocked on transaction commit.
*/
void
xfs_trans_ijoin(
struct xfs_trans *tp,
struct xfs_inode *ip,
uint lock_flags)
{
struct xfs_inode_log_item *iip;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
ASSERT(iip->ili_lock_flags == 0);
iip->ili_lock_flags = lock_flags;
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
/* Reset the per-tx dirty context and add the item to the tx. */
iip->ili_dirty_flags = 0;
xfs_trans_add_item(tp, &iip->ili_item);
}
/*
* Transactional inode timestamp update. Requires the inode to be locked and
* joined to the transaction supplied. Relies on the transaction subsystem to
* track dirty state and update/writeback the inode accordingly.
*/
void
xfs_trans_ichgtime(
struct xfs_trans *tp,
struct xfs_inode *ip,
int flags)
{
struct inode *inode = VFS_I(ip);
struct timespec64 tv;
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
tv = current_time(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
/*
* This is called to mark the fields indicated in fieldmask as needing to be
* logged when the transaction is committed. The inode must already be
* associated with the given transaction. All we do here is record where the
* inode was dirtied and mark the transaction and inode log item dirty;
* everything else is done in the ->precommit log item operation after the
* changes in the transaction have been completed.
*/
void
xfs_trans_log_inode(
struct xfs_trans *tp,
struct xfs_inode *ip,
uint flags)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct inode *inode = VFS_I(ip);
ASSERT(iip);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
tp->t_flags |= XFS_TRANS_DIRTY;
/*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. While we have the
* inode locked exclusively for metadata modification, we can usually
* avoid setting XFS_ILOG_CORE if no one has queried the value since
* the last time it was incremented. If we have XFS_ILOG_CORE already
* set however, then go ahead and bump the i_version counter
* unconditionally.
*/
if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
if (IS_I_VERSION(inode) &&
inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
flags |= XFS_ILOG_IVERSION;
}
iip->ili_dirty_flags |= flags;
}
int
xfs_trans_roll_inode(
struct xfs_trans **tpp,
struct xfs_inode *ip)
{
int error;
xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
error = xfs_trans_roll(tpp);
if (!error)
xfs_trans_ijoin(*tpp, ip, 0);
return error;
}