gfs2: Rework freeze / thaw logic

So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time.  To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node.  There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run.  gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again.  The initiating node would keep the freeze glock held in
exclusive mode.  To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.

It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem.  This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock.  We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time.  But that can fail, and freeze_go_sync()
isn't actually allowed to fail.
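
For illustration, the active-reference pattern hinted at here is the one the
reworked code ends up using in freeze_go_callback() (see the diff below),
where failing is harmless: if the super block cannot be pinned, the freeze
request is simply not acted on from that node.  A minimal sketch, modeled on
trylock_super():

	if (down_read_trylock(&sb->s_umount)) {
		/* No unmount in progress; take an active reference. */
		atomic_inc(&sb->s_active);
		up_read(&sb->s_umount);
		if (!queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work))
			deactivate_super(sb);	/* work already queued; drop our reference */
	}
	/* If the trylock fails (unmount or remount holds s_umount), do nothing. */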

To get around this, the patch changes the freeze glock locking scheme
as follows:

At mount time, each node takes the freeze glock in shared mode.  To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode.  All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run.  There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem.  All of this happens outside of the glock state engine, where
we are allowed to fail.
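
Schematically, the per-node sequence maps onto the reworked gfs2_freeze_func()
in the diff below.  The following is a minimal sketch only; error handling,
the sd_freeze_state updates, and the super block reference are left out, and
the function name is used here for illustration only:

	static void freeze_func_outline(struct gfs2_sbd *sdp)
	{
		mutex_lock(&sdp->sd_freeze_mutex);

		/* Freeze locally: freeze_super() plus a final log flush. */
		gfs2_freeze_locally(sdp);

		/* Drop the shared freeze glock held since mount ... */
		gfs2_freeze_unlock(&sdp->sd_freeze_gh);

		/*
		 * ... and take it again in shared mode.  This blocks until the
		 * initiating node drops its exclusive hold, after which
		 * gfs2_do_thaw() calls thaw_super() to thaw locally.
		 */
		gfs2_do_thaw(sdp);

		mutex_unlock(&sdp->sd_freeze_mutex);
	}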

From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.

Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Author: Andreas Gruenbacher <agruenba@redhat.com>
Date:   2022-11-14 23:34:50 +01:00
Parent: cad1e15804
Commit: b77b4a4815
7 changed files with 178 additions and 110 deletions


@@ -561,47 +561,33 @@ static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
}
/**
* freeze_go_sync - promote/demote the freeze glock
* freeze_go_callback - A cluster node is requesting a freeze
* @gl: the glock
* @remote: true if this came from a different cluster node
*/
static int freeze_go_sync(struct gfs2_glock *gl)
static void freeze_go_callback(struct gfs2_glock *gl, bool remote)
{
int error = 0;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct super_block *sb = sdp->sd_vfs;
if (!remote ||
gl->gl_state != LM_ST_SHARED ||
gl->gl_demote_state != LM_ST_UNLOCKED)
return;
/*
* We need to check gl_state == LM_ST_SHARED here and not gl_req ==
* LM_ST_EXCLUSIVE. That's because when any node does a freeze,
* all the nodes should have the freeze glock in SH mode and they all
* call do_xmote: One for EX and the others for UN. They ALL must
* freeze locally, and they ALL must queue freeze work. The freeze_work
* calls freeze_func, which tries to reacquire the freeze glock in SH,
* effectively waiting for the thaw on the node who holds it in EX.
* Once thawed, the work func acquires the freeze glock in
* SH and everybody goes back to thawed.
* Try to get an active super block reference to prevent racing with
* unmount (see trylock_super()). But note that unmount isn't the only
* place where a write lock on s_umount is taken, and we can fail here
* because of things like remount as well.
*/
if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
!test_bit(SDF_NORECOVERY, &sdp->sd_flags)) {
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sdp->sd_vfs);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
if (gfs2_withdrawn(sdp)) {
atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
return 0;
}
gfs2_assert_withdraw(sdp, 0);
}
queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
GFS2_LFC_FREEZE_GO_SYNC);
else /* read-only mounts */
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
if (down_read_trylock(&sb->s_umount)) {
atomic_inc(&sb->s_active);
up_read(&sb->s_umount);
if (!queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work))
deactivate_super(sb);
}
return 0;
}
/**
@@ -761,9 +747,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
};
const struct gfs2_glock_operations gfs2_freeze_glops = {
.go_sync = freeze_go_sync,
.go_xmote_bh = freeze_go_xmote_bh,
.go_demote_ok = freeze_go_demote_ok,
.go_callback = freeze_go_callback,
.go_type = LM_TYPE_NONDISK,
.go_flags = GLOF_NONDISK,
};


@@ -1136,8 +1136,6 @@ repeat:
if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
GFS2_LOG_HEAD_FLUSH_FREEZE))
gfs2_log_shutdown(sdp);
if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
out_end:


@@ -1140,7 +1140,6 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
int silent = fc->sb_flags & SB_SILENT;
struct gfs2_sbd *sdp;
struct gfs2_holder mount_gh;
struct gfs2_holder freeze_gh;
int error;
sdp = init_sbd(sb);
@@ -1269,15 +1268,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
}
}
error = gfs2_freeze_lock_shared(sdp, &freeze_gh, 0);
error = gfs2_freeze_lock_shared(sdp, &sdp->sd_freeze_gh, 0);
if (error)
goto fail_per_node;
if (!sb_rdonly(sb))
error = gfs2_make_fs_rw(sdp);
gfs2_freeze_unlock(&freeze_gh);
if (error) {
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
if (sdp->sd_quotad_process)
kthread_stop(sdp->sd_quotad_process);
sdp->sd_quotad_process = NULL;


@@ -404,7 +404,7 @@ void gfs2_recover_func(struct work_struct *work)
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_log_header_host head;
struct gfs2_holder j_gh, ji_gh, thaw_gh;
struct gfs2_holder j_gh, ji_gh;
ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep;
int ro = 0;
unsigned int pass;
@@ -465,14 +465,14 @@ void gfs2_recover_func(struct work_struct *work)
ktime_ms_delta(t_jhd, t_jlck));
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
fs_info(sdp, "jid=%u: Acquiring the freeze glock...\n",
jd->jd_jid);
mutex_lock(&sdp->sd_freeze_mutex);
/* Acquire a shared hold on the freeze glock */
error = gfs2_freeze_lock_shared(sdp, &thaw_gh, LM_FLAG_PRIORITY);
if (error)
if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
mutex_unlock(&sdp->sd_freeze_mutex);
fs_warn(sdp, "jid=%u: Can't replay: filesystem "
"is frozen\n", jd->jd_jid);
goto fail_gunlock_ji;
}
if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
ro = 1;
@@ -496,7 +496,7 @@ void gfs2_recover_func(struct work_struct *work)
fs_warn(sdp, "jid=%u: Can't replay: read-only block "
"device\n", jd->jd_jid);
error = -EROFS;
goto fail_gunlock_thaw;
goto fail_gunlock_nofreeze;
}
t_tlck = ktime_get();
@@ -514,7 +514,7 @@ void gfs2_recover_func(struct work_struct *work)
lops_after_scan(jd, error, pass);
if (error) {
up_read(&sdp->sd_log_flush_lock);
goto fail_gunlock_thaw;
goto fail_gunlock_nofreeze;
}
}
@@ -522,7 +522,7 @@ void gfs2_recover_func(struct work_struct *work)
clean_journal(jd, &head);
up_read(&sdp->sd_log_flush_lock);
gfs2_freeze_unlock(&thaw_gh);
mutex_unlock(&sdp->sd_freeze_mutex);
t_rep = ktime_get();
fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, "
"jhead:%lldms, tlck:%lldms, replay:%lldms]\n",
@@ -543,8 +543,8 @@ void gfs2_recover_func(struct work_struct *work)
fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
goto done;
fail_gunlock_thaw:
gfs2_freeze_unlock(&thaw_gh);
fail_gunlock_nofreeze:
mutex_unlock(&sdp->sd_freeze_mutex);
fail_gunlock_ji:
if (jlocked) {
gfs2_glock_dq_uninit(&ji_gh);


@@ -332,7 +332,12 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
struct lfcc *lfcc;
LIST_HEAD(list);
struct gfs2_log_header_host lh;
int error;
int error, error2;
/*
* Grab all the journal glocks in SH mode. We are *probably* doing
* that to prevent recovery.
*/
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
@@ -349,11 +354,13 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
list_add(&lfcc->list, &list);
}
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
LM_FLAG_NOEXP | GL_NOPID,
&sdp->sd_freeze_gh);
if (error)
goto out;
goto relock_shared;
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
error = gfs2_jdesc_check(jd);
@@ -368,8 +375,14 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
}
}
if (error)
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
if (!error)
goto out; /* success */
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
relock_shared:
error2 = gfs2_freeze_lock_shared(sdp, &sdp->sd_freeze_gh, 0);
gfs2_assert_withdraw(sdp, !error2);
out:
while (!list_empty(&list)) {
@@ -615,6 +628,8 @@ restart:
/* Release stuff */
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
iput(sdp->sd_jindex);
iput(sdp->sd_statfs_inode);
iput(sdp->sd_rindex);
@@ -669,31 +684,82 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
return sdp->sd_log_error;
}
static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
{
struct super_block *sb = sdp->sd_vfs;
int error;
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sb);
if (error)
goto fail;
if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
GFS2_LFC_FREEZE_GO_SYNC);
if (gfs2_withdrawn(sdp)) {
thaw_super(sb);
error = -EIO;
goto fail;
}
}
return 0;
fail:
atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
return error;
}
static int gfs2_do_thaw(struct gfs2_sbd *sdp)
{
struct super_block *sb = sdp->sd_vfs;
int error;
error = gfs2_freeze_lock_shared(sdp, &sdp->sd_freeze_gh, 0);
if (error)
goto fail;
error = thaw_super(sb);
if (!error)
return 0;
fail:
fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", error);
gfs2_assert_withdraw(sdp, 0);
return error;
}
void gfs2_freeze_func(struct work_struct *work)
{
int error;
struct gfs2_holder freeze_gh;
struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
struct super_block *sb = sdp->sd_vfs;
int error;
atomic_inc(&sb->s_active);
error = gfs2_freeze_lock_shared(sdp, &freeze_gh, 0);
if (error) {
gfs2_assert_withdraw(sdp, 0);
} else {
atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
error = thaw_super(sb);
if (error) {
fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n",
error);
gfs2_assert_withdraw(sdp, 0);
}
gfs2_freeze_unlock(&freeze_gh);
}
mutex_lock(&sdp->sd_freeze_mutex);
error = -EBUSY;
if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
goto freeze_failed;
error = gfs2_freeze_locally(sdp);
if (error)
goto freeze_failed;
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
error = gfs2_do_thaw(sdp);
if (error)
goto out;
atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
goto out;
freeze_failed:
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error);
out:
mutex_unlock(&sdp->sd_freeze_mutex);
deactivate_super(sb);
clear_bit_unlock(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
wake_up_bit(&sdp->sd_flags, SDF_FREEZE_INITIATOR);
return;
}
/**
@@ -707,21 +773,27 @@ static int gfs2_freeze_super(struct super_block *sb)
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
mutex_lock(&sdp->sd_freeze_mutex);
if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
error = -EBUSY;
if (!mutex_trylock(&sdp->sd_freeze_mutex))
return -EBUSY;
error = -EBUSY;
if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
goto out;
}
for (;;) {
if (gfs2_withdrawn(sdp)) {
error = -EINVAL;
error = gfs2_freeze_locally(sdp);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
goto out;
}
error = gfs2_lock_fs_check_clean(sdp);
if (!error)
break;
break; /* success */
error = gfs2_do_thaw(sdp);
if (error)
goto out;
if (error == -EBUSY)
fs_err(sdp, "waiting for recovery before freeze\n");
@@ -735,8 +807,12 @@ static int gfs2_freeze_super(struct super_block *sb)
fs_err(sdp, "retrying...\n");
msleep(1000);
}
set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
out:
if (!error) {
set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
mutex_unlock(&sdp->sd_freeze_mutex);
return error;
}
@@ -750,17 +826,39 @@ out:
static int gfs2_thaw_super(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
mutex_lock(&sdp->sd_freeze_mutex);
if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
!gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
mutex_unlock(&sdp->sd_freeze_mutex);
return -EINVAL;
}
if (!mutex_trylock(&sdp->sd_freeze_mutex))
return -EBUSY;
error = -EINVAL;
if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
goto out;
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
error = gfs2_do_thaw(sdp);
if (!error) {
clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
}
out:
mutex_unlock(&sdp->sd_freeze_mutex);
return error;
}
void gfs2_thaw_freeze_initiator(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
mutex_lock(&sdp->sd_freeze_mutex);
if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
goto out;
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
out:
mutex_unlock(&sdp->sd_freeze_mutex);
return wait_on_bit(&sdp->sd_flags, SDF_FREEZE_INITIATOR, TASK_INTERRUPTIBLE);
}
/**


@@ -46,6 +46,7 @@ extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
extern int gfs2_statfs_sync(struct super_block *sb, int type);
extern void gfs2_freeze_func(struct work_struct *work);
extern void gfs2_thaw_freeze_initiator(struct super_block *sb);
extern void free_local_statfs_inodes(struct gfs2_sbd *sdp);
extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,


@@ -124,7 +124,6 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
struct gfs2_inode *ip;
struct gfs2_glock *i_gl;
u64 no_formal_ino;
int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
int ret = 0;
int tries;
@@ -152,24 +151,18 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
*/
clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
if (!sb_rdonly(sdp->sd_vfs)) {
struct gfs2_holder freeze_gh;
bool locked = mutex_trylock(&sdp->sd_freeze_mutex);
gfs2_make_fs_ro(sdp);
if (locked)
mutex_unlock(&sdp->sd_freeze_mutex);
gfs2_holder_mark_uninitialized(&freeze_gh);
if (sdp->sd_freeze_gl &&
!gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) {
ret = gfs2_freeze_lock_shared(sdp, &freeze_gh,
log_write_allowed ? 0 : LM_FLAG_TRY);
if (ret == GLR_TRYFAILED)
ret = 0;
}
if (!ret)
gfs2_make_fs_ro(sdp);
/*
* Dequeue any pending non-system glock holders that can no
* longer be granted because the file system is withdrawn.
*/
gfs2_gl_dq_holders(sdp);
gfs2_freeze_unlock(&freeze_gh);
}
if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */
@@ -187,15 +180,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
}
sdp->sd_jinode_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq(&sdp->sd_jinode_gh);
if (test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) {
/* Make sure gfs2_thaw_super works if partially-frozen */
flush_work(&sdp->sd_freeze_work);
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
thaw_super(sdp->sd_vfs);
} else {
wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE,
TASK_UNINTERRUPTIBLE);
}
gfs2_thaw_freeze_initiator(sdp->sd_vfs);
wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
/*
* holder_uninit to force glock_put, to force dlm to let go