From 35c23fba4eb4b3043b42acbdd3fbabdd8824f56f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Nov 2022 23:57:50 +0100 Subject: [PATCH 01/17] gfs2: Add extra error check in alloc_dinode We have reserved the number of blocks we want to allocate, so the actual allocation isn't expected to fail. Nevertheless, make the code behave correctly even when things go wrong. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 04a201584fa7..41fa69c1be1b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -403,12 +403,15 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) goto out_ipreserv; error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + if (error) + goto out_trans_end; + ip->i_no_formal_ino = ip->i_generation; ip->i_inode.i_ino = ip->i_no_addr; ip->i_goal = ip->i_no_addr; +out_trans_end: gfs2_trans_end(sdp); - out_ipreserv: gfs2_inplace_release(ip); out_quota: From 761fdbbce96fb3d0569f50a77b1214dbc4b17c44 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 4 Nov 2022 13:26:46 +0100 Subject: [PATCH 02/17] gfs2: Get rid of ghs[] in gfs2_create_inode In gfs2_create_inode, get rid of the ghs array in favor of two separate variables. This makes the code much less irritating. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 41fa69c1be1b..465f1673101f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -599,7 +599,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, { const struct qstr *name = &dentry->d_name; struct posix_acl *default_acl, *acl; - struct gfs2_holder ghs[2]; + struct gfs2_holder d_gh, gh; struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); @@ -620,10 +620,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail; - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); if (error) goto fail; - gfs2_holder_mark_uninitialized(ghs + 1); + gfs2_holder_mark_uninitialized(&gh); error = create_ok(dip, name, mode); if (error) @@ -645,7 +645,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, else error = finish_no_open(file, NULL); } - gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(&d_gh); goto fail; } else if (error != -ENOENT) { goto fail_gunlock; @@ -734,7 +734,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (error) goto fail_gunlock3; @@ -788,9 +788,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, file->f_mode |= FMODE_CREATED; error = finish_open(file, dentry, gfs2_open_common); } - gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(&d_gh); gfs2_qa_put(ip); - gfs2_glock_dq_uninit(ghs + 1); + gfs2_glock_dq_uninit(&gh); gfs2_glock_put(io_gl); gfs2_qa_put(dip); unlock_new_inode(inode); @@ -815,7 +815,7 @@ fail_free_acls: posix_acl_release(acl); fail_gunlock: gfs2_dir_no_add(&da); - gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(&d_gh); if (!IS_ERR_OR_NULL(inode)) { clear_nlink(inode); if (!free_vfs_inode) @@ 
-827,8 +827,8 @@ fail_gunlock: else iput(inode); } - if (gfs2_holder_initialized(ghs + 1)) - gfs2_glock_dq_uninit(ghs + 1); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); fail: gfs2_qa_put(dip); return error; From 3d0258bc11185ccb21f922332eca731e1928c5a4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Nov 2022 18:34:42 +0100 Subject: [PATCH 03/17] gfs2: Clean up initialization of "ip" in gfs2_create_inode Initialize variable "ip" earlier so that it can be used interchangeably with "inode" everywhere. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 465f1673101f..b91f15abe24e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -659,12 +659,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = -ENOMEM; if (!inode) goto fail_gunlock; + ip = GFS2_I(inode); error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) goto fail_gunlock; - ip = GFS2_I(inode); error = gfs2_qa_get(ip); if (error) goto fail_free_acls; @@ -821,7 +821,7 @@ fail_gunlock: if (!free_vfs_inode) mark_inode_dirty(inode); set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, - &GFS2_I(inode)->i_flags); + &ip->i_flags); if (inode->i_state & I_NEW) iget_failed(inode); else From 38552ff676f072e7d15c5e0a877fda613e57ed2d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Nov 2022 17:06:58 +0100 Subject: [PATCH 04/17] gfs2: Fix and clean up create / evict interaction When gfs2_create_inode() fails after creating a new inode, it uses the GIF_FREE_VFS_INODE and GIF_ALLOC_FAILED inode flags to communicate to gfs2_evict_inode() which parts of the inode need to be deallocated and destroyed. In some error cases, the inode ends up being allocated on disk and then accidentally left behind. In others, the inode is partially constructed and then not properly destroyed. Clean this up by completely handling the inode deallocation and destruction in gfs2_evict_inode(). This means that gfs2_evict_inode() may now be faced with partially constructed inodes, so add the necessary checks to cope with that. In particular, make sure that for incompletely constructed inodes, we're not accessing the buffers backing the on-disk blocks; the contents may be undefined. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 26 ++++++++++++-------------- fs/gfs2/meta_io.c | 6 ++++++ fs/gfs2/super.c | 35 +++++++++++++++++++++-------------- fs/gfs2/xattr.c | 26 +++++++++++++++----------- 4 files changed, 54 insertions(+), 39 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b91f15abe24e..c057f3bd475f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -409,6 +409,8 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) ip->i_no_formal_ino = ip->i_generation; ip->i_inode.i_ino = ip->i_no_addr; ip->i_goal = ip->i_no_addr; + if (*dblocks > 1) + ip->i_eattr = ip->i_no_addr + 1; out_trans_end: gfs2_trans_end(sdp); @@ -589,6 +591,12 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, * @size: The initial size of the inode (ignored for directories) * @excl: Force fail if inode exists * + * FIXME: Change to allocate the disk blocks and write them out in the same + * transaction. That way, we can no longer end up in a situation in which an + * inode is allocated, the node crashes, and the block looks like a valid + * inode. 
(With atomic creates in place, we will also no longer need to zero + * the link count and dirty the inode here on failure.) + * * Returns: 0 on success, or error code */ @@ -604,7 +612,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; - int error, free_vfs_inode = 1; + int error; u32 aflags = 0; unsigned blocks = 1; struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; @@ -742,10 +750,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - if (blocks > 1) { - ip->i_eattr = ip->i_no_addr + 1; + if (blocks > 1) gfs2_init_xattr(ip); - } init_dinode(dip, ip, symname); gfs2_trans_end(sdp); @@ -753,9 +759,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, glock_set_object(io_gl, ip); gfs2_set_iop(inode); - free_vfs_inode = 0; /* After this point, the inode is no longer - considered free. Any failures need to undo - the gfs2 structures. */ if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) @@ -804,10 +807,6 @@ fail_gunlock3: fail_gunlock2: gfs2_glock_put(io_gl); fail_free_inode: - if (ip->i_gl) { - if (free_vfs_inode) /* else evict will do the put for us */ - gfs2_glock_put(ip->i_gl); - } gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: @@ -817,11 +816,10 @@ fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(&d_gh); if (!IS_ERR_OR_NULL(inode)) { + set_bit(GIF_ALLOC_FAILED, &ip->i_flags); clear_nlink(inode); - if (!free_vfs_inode) + if (ip->i_no_addr) mark_inode_dirty(inode); - set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, - &ip->i_flags); if (inode->i_state & I_NEW) iget_failed(inode); else diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 6ed728aae9a5..3c41b864ee5b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -442,6 +442,12 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) struct buffer_head *bh; int ty; + if (!ip->i_gl) { + /* This can only happen during incomplete inode creation. */ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { ty = REMOVE_META; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b018957a1bb2..eac9b0c34aac 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -475,6 +475,12 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; + if (unlikely(!ip->i_gl)) { + /* This can only happen during incomplete inode creation. */ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { @@ -927,8 +933,7 @@ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && - inode->i_nlink && + if (inode->i_nlink && gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (test_bit(GLF_DEMOTE, &gl->gl_flags)) @@ -1076,7 +1081,13 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) struct inode *inode = &ip->i_inode; struct gfs2_glock *gl = ip->i_gl; - truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + if (unlikely(!gl)) { + /* This can only happen during incomplete inode creation. 
*/ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + + truncate_inode_pages(gfs2_glock2aspace(gl), 0); truncate_inode_pages(&inode->i_data, 0); if (atomic_read(&gl->gl_revokes) == 0) { @@ -1218,10 +1229,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode, struct gfs2_sbd *sdp = sb->s_fs_info; int ret; - if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { - BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); + if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) goto should_delete; - } if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) return SHOULD_DEFER_EVICTION; @@ -1298,9 +1307,11 @@ static int evict_unlinked_inode(struct inode *inode) do, gfs2_create_inode can create another inode at the same block location and try to set gl_object again. We clear gl_object here so that subsequent inode creates don't see an old gl_object. */ - glock_clear_object(ip->i_gl, ip); + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); + } ret = gfs2_dinode_dealloc(ip); - gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); out: return ret; } @@ -1367,12 +1378,7 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_holder gh; int ret; - if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { - clear_inode(inode); - return; - } - - if (inode->i_nlink || sb_rdonly(sb)) + if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr) goto out; gfs2_holder_mark_uninitialized(&gh); @@ -1429,6 +1435,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL); if (!ip) return NULL; + ip->i_no_addr = 0; ip->i_flags = 0; ip->i_gl = NULL; gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f6a66050380e..518c0677e12a 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1412,11 +1412,13 @@ static int ea_dealloc_block(struct gfs2_inode *ip) ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); + if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } } gfs2_trans_end(sdp); @@ -1445,14 +1447,16 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) return error; - error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); - if (error) - goto out_quota; - - if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { - error = ea_dealloc_indirect(ip); + if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) goto out_quota; + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_quota; + } } error = ea_dealloc_block(ip); From 4ec3c19d058f7391ec631b8a1b0a690422b246a9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 5 Nov 2022 17:12:34 +0100 Subject: [PATCH 05/17] gfs2: Handle -EBUSY result of insert_inode_locked4 When creating a new inode, there is a small chance that an inode lookup for a previous version of the same inode is still in progress. In that case, that previous lookup will eventually fail, but we may still need to retry here. 
Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c057f3bd475f..9fbbc365a404 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -734,8 +734,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_free_inode; gfs2_cancel_delete_work(io_gl); +retry: error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); - BUG_ON(error); + if (error == -EBUSY) + goto retry; + if (error) + goto fail_gunlock2; error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID, &ip->i_iopen_gh); From 7db354444ad8429e660b0f8145d425285d4f90ff Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 4 Dec 2022 16:50:41 +0100 Subject: [PATCH 06/17] gfs2: Cosmetic gfs2_dinode_{in,out} cleanup In each of the two functions, add an inode variable that points to &ip->i_inode and use that throughout the rest of the function. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 41 +++++++++++++++++++++-------------------- fs/gfs2/super.c | 27 ++++++++++++++------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 49210a2e7ce7..af69a1bacd55 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -397,38 +397,39 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) struct timespec64 atime; u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); - bool is_new = ip->i_inode.i_state & I_NEW; + struct inode *inode = &ip->i_inode; + bool is_new = inode->i_state & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) goto corrupt; - if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode))) + if (unlikely(!is_new && inode_wrong_type(inode, mode))) goto corrupt; ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); - ip->i_inode.i_mode = mode; + inode->i_mode = mode; if (is_new) { - ip->i_inode.i_rdev = 0; + inode->i_rdev = 0; switch (mode & S_IFMT) { case S_IFBLK: case S_IFCHR: - ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), - be32_to_cpu(str->di_minor)); + inode->i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); break; } } - i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); - i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); - i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); - gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + i_uid_write(inode, be32_to_cpu(str->di_uid)); + i_gid_write(inode, be32_to_cpu(str->di_gid)); + set_nlink(inode, be32_to_cpu(str->di_nlink)); + i_size_write(inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0) - ip->i_inode.i_atime = atime; - ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + if (timespec64_compare(&inode->i_atime, &atime) < 0) + inode->i_atime = atime; + inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); ip->i_goal = 
be64_to_cpu(str->di_goal_meta); ip->i_generation = be64_to_cpu(str->di_generation); @@ -436,7 +437,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_diskflags = be32_to_cpu(str->di_flags); ip->i_eattr = be64_to_cpu(str->di_eattr); /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ - gfs2_set_inode_flags(&ip->i_inode); + gfs2_set_inode_flags(inode); height = be16_to_cpu(str->di_height); if (unlikely(height > GFS2_MAX_META_HEIGHT)) goto corrupt; @@ -448,8 +449,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_depth = (u8)depth; ip->i_entries = be32_to_cpu(str->di_entries); - if (S_ISREG(ip->i_inode.i_mode)) - gfs2_set_aops(&ip->i_inode); + if (S_ISREG(inode->i_mode)) + gfs2_set_aops(inode); return 0; corrupt: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index eac9b0c34aac..075fad8fb1d1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -379,6 +379,7 @@ out: void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { + const struct inode *inode = &ip->i_inode; struct gfs2_dinode *str = buf; str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -386,15 +387,15 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); - str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); - str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); - str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); - str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + str->di_mode = cpu_to_be32(inode->i_mode); + str->di_uid = cpu_to_be32(i_uid_read(inode)); + str->di_gid = cpu_to_be32(i_gid_read(inode)); + str->di_nlink = cpu_to_be32(inode->i_nlink); + str->di_size = cpu_to_be64(i_size_read(inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); + str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); + str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); @@ -402,16 +403,16 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_flags = cpu_to_be32(ip->i_diskflags); str->di_height = cpu_to_be16(ip->i_height); - str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) && !(ip->i_diskflags & GFS2_DIF_EXHASH) ? 
GFS2_FORMAT_DE : 0); str->di_depth = cpu_to_be16(ip->i_depth); str->di_entries = cpu_to_be32(ip->i_entries); str->di_eattr = cpu_to_be64(ip->i_eattr); - str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); + str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); } /** From 70376c7ff31221f1d21db5611d8209e677781d3a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 4 Dec 2022 17:00:04 +0100 Subject: [PATCH 07/17] gfs2: Always check inode size of inline inodes Check if the inode size of stuffed (inline) inodes is within the allowed range when reading inodes from disk (gfs2_dinode_in()). This prevents us from on-disk corruption. The two checks in stuffed_readpage() and gfs2_unstuffer_page() that just truncate inline data to the maximum allowed size don't actually make sense, and they can be removed now as well. Reported-by: syzbot+7bb81dfa9cda07d9cd9d@syzkaller.appspotmail.com Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 2 -- fs/gfs2/bmap.c | 3 --- fs/gfs2/glops.c | 3 +++ 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 05bee80ac7de..e782b4f1d104 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -427,8 +427,6 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) return error; kaddr = kmap_atomic(page); - if (dsize > gfs2_max_stuffed_size(ip)) - dsize = gfs2_max_stuffed_size(ip); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap_atomic(kaddr); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3bdb2c668a71..e7537fd305dd 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -61,9 +61,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, void *kaddr = kmap(page); u64 dsize = i_size_read(inode); - if (dsize > gfs2_max_stuffed_size(ip)) - dsize = gfs2_max_stuffed_size(ip); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap(page); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index af69a1bacd55..d78b61ecc1cd 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -449,6 +449,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_depth = (u8)depth; ip->i_entries = be32_to_cpu(str->di_entries); + if (gfs2_is_stuffed(ip) && inode->i_size > gfs2_max_stuffed_size(ip)) + goto corrupt; + if (S_ISREG(inode->i_mode)) gfs2_set_aops(inode); From 4ad02083a092b497f35804de03eaa62cf81fada6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 2 Dec 2022 18:00:15 +0100 Subject: [PATCH 08/17] gfs2: Make gfs2_glock_hold return its glock argument This allows code like 'gl = gfs2_glock_hold(...)'. 
Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 3 +-- fs/gfs2/glock.c | 6 +++--- fs/gfs2/glock.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 60c6fb91fb58..eea5be4fbf0e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1445,14 +1445,13 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) { - struct gfs2_glock *gl = fl_gh->gh_gl; + struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl); /* * Make sure gfs2_glock_put() won't sleep under the file->f_lock * spinlock. */ - gfs2_glock_hold(gl); spin_lock(&file->f_lock); gfs2_holder_uninit(fl_gh); spin_unlock(&file->f_lock); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index df335c258eb0..1a6c1eb7bd6b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -186,10 +186,11 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); + return gl; } /** @@ -1256,13 +1257,12 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh, unsigned long ip) { INIT_LIST_HEAD(&gh->gh_list); - gh->gh_gl = gl; + gh->gh_gl = gfs2_glock_hold(gl); gh->gh_ip = ip; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; - gfs2_glock_hold(gl); } /** diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 0d068f4fd7d6..76cd2fabc668 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -196,7 +196,7 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); -extern void gfs2_glock_hold(struct gfs2_glock *gl); +extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_glock_queue_put(struct gfs2_glock *gl); From 97236ad5a68c6b7603cea2ad01c588887e5cb961 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 4 Dec 2022 13:02:39 +0100 Subject: [PATCH 09/17] gfs2: Avoid dequeuing GL_ASYNC glock holders twice When a locking request fails, the associated glock holder is automatically dequeued from the list of active and waiting holders. For GL_ASYNC locking requests, this will obviously happen asynchronously and it can race with attempts to cancel that locking request via gfs2_glock_dq(). Therefore, don't forget to check if a locking request has already been dequeued in gfs2_glock_dq(). Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1a6c1eb7bd6b..0f5c5c12d8c6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1707,6 +1707,13 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; spin_lock(&gl->gl_lockref.lock); + if (!gfs2_holder_queued(gh)) { + /* + * May have already been dequeued because the locking request + * was GL_ASYNC and it has failed in the meantime. 
+ */ + goto out; + } if (list_is_first(&gh->gh_list, &gl->gl_holders) && !test_bit(HIF_HOLDER, &gh->gh_iflags)) { spin_unlock(&gl->gl_lockref.lock); @@ -1716,6 +1723,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) } __gfs2_glock_dq(gh); +out: spin_unlock(&gl->gl_lockref.lock); } From 764665c6775251d4569ba9f09981459bbb166359 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 4 Dec 2022 03:48:52 +0100 Subject: [PATCH 10/17] gfs2: Clean up after gfs2_create_inode rework Since commit 3d36e57ff768 ("gfs2: gfs2_create_inode rework"), gfs2_evict_inode() and gfs2_create_inode() / gfs2_inode_lookup() will synchronize via the inode hash table and we can be certain that once a new inode is inserted into the inode hash table(), gfs2_evict_inode() has completely destroyed any previous versions. We no longer need to worry about overlapping inode object lifespans. Update the code and comments accordingly. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.h | 14 -------------- fs/gfs2/super.c | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 76cd2fabc668..d561126cfb47 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -322,20 +322,6 @@ static inline void glock_set_object(struct gfs2_glock *gl, void *object) /** * glock_clear_object - clear the gl_object field of a glock * @gl: the glock - * @object: the object - * - * I'd love to similarly add this: - * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) - * gfs2_dump_glock(NULL, gl, true); - * Unfortunately, that's not possible because as soon as gfs2_delete_inode - * frees the block in the rgrp, another process can reassign it for an I_NEW - * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. - * That means gfs2_delete_inode may subsequently try to call this function - * for a glock that's already pointing to a brand new inode. If we clear the - * new inode's gl_object, we'll introduce metadata corruption. Function - * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also - * tries to clear gl_object, so it's more than just gfs2_delete_inode. - * */ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 075fad8fb1d1..02f1b5f2d7f2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1304,14 +1304,21 @@ static int evict_unlinked_inode(struct inode *inode) goto out; } - /* We're about to clear the bitmap for the dinode, but as soon as we - do, gfs2_create_inode can create another inode at the same block - location and try to set gl_object again. We clear gl_object here so - that subsequent inode creates don't see an old gl_object. */ - if (ip->i_gl) { - glock_clear_object(ip->i_gl, ip); + if (ip->i_gl) gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); - } + + /* + * As soon as we clear the bitmap for the dinode, gfs2_create_inode() + * can get called to recreate it, or even gfs2_inode_lookup() if the + * inode was recreated on another node in the meantime. + * + * However, inserting the new inode into the inode hash table will not + * succeed until the old inode is removed, and that only happens after + * ->evict_inode() returns. The new inode is attached to its inode and + * iopen glocks after inserting it into the inode hash table, so at + * that point we can be sure that both glocks are unused. 
+ */ + ret = gfs2_dinode_dealloc(ip); out: return ret; From fe1bff6517de789d491844f53e61e4ff02e8f8b1 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 4 Dec 2022 13:27:11 +0100 Subject: [PATCH 11/17] gfs2: Simply dequeue iopen glock in gfs2_evict_inode With the previous change, to simplify things, we can always just dequeue and uninitialize the iopen glock in gfs2_evict_inode() even if it isn't queued anymore. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 02f1b5f2d7f2..999cc146d708 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1419,12 +1419,9 @@ out: struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; glock_clear_object(gl, ip); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq(&ip->i_iopen_gh); - } gfs2_glock_hold(gl); - gfs2_holder_uninit(&ip->i_iopen_gh); + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put_eventually(gl); } if (ip->i_gl) { From 3781ec9e09123d955b93fc8522ffb683a51f865d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 5 Dec 2022 14:44:37 +0100 Subject: [PATCH 12/17] gfs2: Uninline and improve glock_{set,clear}_object Those functions have reached a size at which having them inline isn't useful anymore, so uninline them. In addition, report the glock name on assertion failures. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/glock.h | 29 +++-------------------------- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0f5c5c12d8c6..76432efe6e02 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -928,6 +928,48 @@ out_unlock: return; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ +void glock_set_object(struct gfs2_glock *gl, void *object) +{ + void *prev_object; + + spin_lock(&gl->gl_lockref.lock); + prev_object = gl->gl_object; + gl->gl_object = object; + spin_unlock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { + pr_warn("glock=%u/%llx\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + gfs2_dump_glock(NULL, gl, true); + } +} + +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + */ +void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + void *prev_object; + + spin_lock(&gl->gl_lockref.lock); + prev_object = gl->gl_object; + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, + prev_object == object || prev_object == NULL)) { + pr_warn("glock=%u/%llx\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + gfs2_dump_glock(NULL, gl, true); + } +} + void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index d561126cfb47..e4be9e4bc979 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -288,6 +288,9 @@ extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); extern void gfs2_register_debugfs(void); extern void gfs2_unregister_debugfs(void); +extern void glock_set_object(struct gfs2_glock *gl, void *object); +extern void glock_clear_object(struct gfs2_glock *gl, void *object); + extern const struct lm_lockops gfs2_dlm_ops; static inline 
void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh) @@ -305,32 +308,6 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh) return !list_empty(&gh->gh_list); } -/** - * glock_set_object - set the gl_object field of a glock - * @gl: the glock - * @object: the object - */ -static inline void glock_set_object(struct gfs2_glock *gl, void *object) -{ - spin_lock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) - gfs2_dump_glock(NULL, gl, true); - gl->gl_object = object; - spin_unlock(&gl->gl_lockref.lock); -} - -/** - * glock_clear_object - clear the gl_object field of a glock - * @gl: the glock - */ -static inline void glock_clear_object(struct gfs2_glock *gl, void *object) -{ - spin_lock(&gl->gl_lockref.lock); - if (gl->gl_object == object) - gl->gl_object = NULL; - spin_unlock(&gl->gl_lockref.lock); -} - static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; From 2ec750a01d189cf1872cd79490d0911a7bd519f8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 4 Dec 2022 12:51:55 +0100 Subject: [PATCH 13/17] gfs2: Add gfs2_inode_lookup comment Add comment on when and why gfs2_cancel_delete_work() needs to be skipped in gfs2_inode_lookup(). Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9fbbc365a404..8d4c4b5c4c0d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -142,6 +142,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; + /* + * The only caller that sets @blktype to GFS2_BLKST_UNLINKED is + * delete_work_func(). Make sure not to cancel the delete work + * from within itself here. + */ if (blktype == GFS2_BLKST_UNLINKED) extra_flags |= LM_FLAG_TRY; else From 88f4a9f813c549f6b8a6fbf12030949b48a4d5a4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 5 Dec 2022 22:27:28 +0100 Subject: [PATCH 14/17] gfs2: Partially revert gfs2_inode_lookup change Commit c412a97cf6c5 changed delete_work_func() to always perform an inode lookup when gfs2_try_evict() fails. This doesn't make sense as a gfs2_try_evict() failure indicates that the inode is likely still in use. Revert that change. Fixes: c412a97cf6c5 ("gfs2: Use TRY lock in gfs2_inode_lookup for UNLINKED inodes") Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 76432efe6e02..6f2de8c0b2d0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1082,6 +1082,7 @@ static void delete_work_func(struct work_struct *work) if (gfs2_queue_delete_work(gl, 5 * HZ)) return; } + goto out; } inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, @@ -1094,6 +1095,7 @@ static void delete_work_func(struct work_struct *work) d_prune_aliases(inode); iput(inode); } +out: gfs2_glock_put(gl); } From f0c0ade8d874fb127f9b451d415bee8cbb6bf7a6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 5 Dec 2022 22:27:38 +0100 Subject: [PATCH 15/17] gfs2: Minor gfs2_try_evict cleanup In gfs2_try_evict(), when an inode can't be evicted, we are grabbing a temporary reference on the inode glock to poke that glock. That should be safe, but it's easier to just grab an inode reference as we already do earlier in this function. 
Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6f2de8c0b2d0..c32c25b4c37c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1023,8 +1023,6 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { - struct gfs2_glock *inode_gl = NULL; - gl->gl_no_formal_ino = ip->i_no_formal_ino; set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); @@ -1034,14 +1032,14 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip) { - inode_gl = ip->i_gl; - lockref_get(&inode_gl->gl_lockref); clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + if (!igrab(&ip->i_inode)) + ip = NULL; } spin_unlock(&gl->gl_lockref.lock); - if (inode_gl) { - gfs2_glock_poke(inode_gl); - gfs2_glock_put(inode_gl); + if (ip) { + gfs2_glock_poke(ip->i_gl); + iput(&ip->i_inode); } evicted = !ip; } From ba3e77a4a22af018d2fe4d745902b2531ca82aba Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 9 Dec 2022 22:14:01 +0100 Subject: [PATCH 16/17] gfs2: Remove support for glock holder auto-demotion Remove the support for glock holder auto-demotion (commit dc732906c245 and folow-ups) as we are not planning to use this feature, and the additional code therefore only adds unnecessary complexity. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 194 +++++++++-------------------------------------- fs/gfs2/glock.h | 20 ----- fs/gfs2/incore.h | 1 - 3 files changed, 36 insertions(+), 179 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c32c25b4c37c..a4c15e86b1f5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -206,12 +206,6 @@ static int demote_ok(const struct gfs2_glock *gl) if (gl->gl_state == LM_ST_UNLOCKED) return 0; - /* - * Note that demote_ok is used for the lru process of disposing of - * glocks. For this purpose, we don't care if the glock's holders - * have the HIF_MAY_DEMOTE flag set or not. If someone is using - * them, don't demote. - */ if (!list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) @@ -394,7 +388,7 @@ static void do_error(struct gfs2_glock *gl, const int ret) struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (!test_bit(HIF_WAIT, &gh->gh_iflags)) + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; @@ -408,45 +402,6 @@ static void do_error(struct gfs2_glock *gl, const int ret) } } -/** - * demote_incompat_holders - demote incompatible demoteable holders - * @gl: the glock we want to promote - * @current_gh: the newly promoted holder - * - * We're passing the newly promoted holder in @current_gh, but actually, any of - * the strong holders would do. - */ -static void demote_incompat_holders(struct gfs2_glock *gl, - struct gfs2_holder *current_gh) -{ - struct gfs2_holder *gh, *tmp; - - /* - * Demote incompatible holders before we make ourselves eligible. - * (This holder may or may not allow auto-demoting, but we don't want - * to demote the new holder before it's even granted.) - */ - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - /* - * Since holders are at the front of the list, we stop when we - * find the first non-holder. 
- */ - if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) - return; - if (gh == current_gh) - continue; - if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && - !may_grant(gl, current_gh, gh)) { - /* - * We should not recurse into do_promote because - * __gfs2_glock_dq only calls handle_callback, - * gfs2_glock_add_to_lru and __gfs2_glock_queue_work. - */ - __gfs2_glock_dq(gh); - } - } -} - /** * find_first_holder - find the first "holder" gh * @gl: the glock @@ -465,26 +420,6 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) return NULL; } -/** - * find_first_strong_holder - find the first non-demoteable holder - * @gl: the glock - * - * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set. - */ -static inline struct gfs2_holder * -find_first_strong_holder(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) - return NULL; - if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) - return gh; - } - return NULL; -} - /* * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder @@ -541,9 +476,8 @@ done: static int do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; - bool incompat_holders_demoted = false; - current_gh = find_first_strong_holder(gl); + current_gh = find_first_holder(gl); list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; @@ -562,11 +496,8 @@ static int do_promote(struct gfs2_glock *gl) set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); - if (!incompat_holders_demoted) { + if (!current_gh) current_gh = gh; - demote_incompat_holders(gl, current_gh); - incompat_holders_demoted = true; - } } return 0; } @@ -1538,7 +1469,7 @@ __acquires(&gl->gl_lockref.lock) if (test_bit(GLF_LOCK, &gl->gl_flags)) { struct gfs2_holder *current_gh; - current_gh = find_first_strong_holder(gl); + current_gh = find_first_holder(gl); try_futile = !may_grant(gl, current_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) @@ -1550,8 +1481,6 @@ __acquires(&gl->gl_lockref.lock) continue; if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) continue; - if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)) - continue; if (!pid_is_meaningful(gh2)) continue; goto trap_recursive; @@ -1666,64 +1595,42 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh) int fast_path = 0; /* - * This while loop is similar to function demote_incompat_holders: - * If the glock is due to be demoted (which may be from another node - * or even if this holder is GL_NOCACHE), the weak holders are - * demoted as well, allowing the glock to be demoted. + * If we're in the process of file system withdraw, we cannot just + * dequeue any glocks until our journal is recovered, lest we introduce + * file system corruption. We need two exceptions to this rule: We need + * to allow unlocking of nondisk glocks and the glock for our own + * journal that needs recovery. */ - while (gh) { - /* - * If we're in the process of file system withdraw, we cannot - * just dequeue any glocks until our journal is recovered, lest - * we introduce file system corruption. We need two exceptions - * to this rule: We need to allow unlocking of nondisk glocks - * and the glock for our own journal that needs recovery. 
- */ - if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && - glock_blocked_by_withdraw(gl) && - gh->gh_gl != sdp->sd_jinode_gl) { - sdp->sd_glock_dqs_held++; - spin_unlock(&gl->gl_lockref.lock); - might_sleep(); - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, - TASK_UNINTERRUPTIBLE); - spin_lock(&gl->gl_lockref.lock); - } + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } - /* - * This holder should not be cached, so mark it for demote. - * Note: this should be done before the check for needs_demote - * below. - */ - if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, false); + /* + * This holder should not be cached, so mark it for demote. + * Note: this should be done before the check for needs_demote + * below. + */ + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); - list_del_init(&gh->gh_list); - clear_bit(HIF_HOLDER, &gh->gh_iflags); - trace_gfs2_glock_queue(gh, 0); + list_del_init(&gh->gh_list); + clear_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_glock_queue(gh, 0); - /* - * If there hasn't been a demote request we are done. - * (Let the remaining holders, if any, keep holding it.) - */ - if (!needs_demote(gl)) { - if (list_empty(&gl->gl_holders)) - fast_path = 1; - break; - } - /* - * If we have another strong holder (we cannot auto-demote) - * we are done. It keeps holding it until it is done. - */ - if (find_first_strong_holder(gl)) - break; - - /* - * If we have a weak holder at the head of the list, it - * (and all others like it) must be auto-demoted. If there - * are no more weak holders, we exit the while loop. - */ - gh = find_first_holder(gl); + /* + * If there hasn't been a demote request we are done. + * (Let the remaining holders, if any, keep holding it.) + */ + if (!needs_demote(gl)) { + if (list_empty(&gl->gl_holders)) + fast_path = 1; } if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) @@ -1938,33 +1845,6 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } - /* - * Note 1: We cannot call demote_incompat_holders from handle_callback - * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq -> - * handle_callback -> demote_incompat_holders -> gfs2_glock_dq - * Plus, we only want to demote the holders if the request comes from - * a remote cluster node because local holder conflicts are resolved - * elsewhere. - * - * Note 2: if a remote node wants this glock in EX mode, lock_dlm will - * request that we set our state to UNLOCKED. Here we mock up a holder - * to make it look like someone wants the lock EX locally. Any SH - * and DF requests should be able to share the lock without demoting. - * - * Note 3: We only want to demote the demoteable holders when there - * are no more strong holders. The demoteable holders might as well - * keep the glock until the last strong holder is done with it. - */ - if (!find_first_strong_holder(gl)) { - struct gfs2_holder mock_gh = { - .gh_gl = gl, - .gh_state = (state == LM_ST_UNLOCKED) ? 
- LM_ST_EXCLUSIVE : state, - .gh_iflags = BIT(HIF_HOLDER) - }; - - demote_incompat_holders(gl, &mock_gh); - } handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -2356,8 +2236,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; - if (test_bit(HIF_MAY_DEMOTE, &iflags)) - *p++ = 'D'; if (flags & GL_SKIP) *p++ = 's'; *p = 0; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index e4be9e4bc979..f37ac087e2c1 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -156,8 +156,6 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) break; - if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) - continue; if (gh->gh_owner_pid == pid) goto out; } @@ -308,24 +306,6 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh) return !list_empty(&gh->gh_list); } -static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - spin_lock(&gl->gl_lockref.lock); - set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); - spin_unlock(&gl->gl_lockref.lock); -} - -static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - spin_lock(&gl->gl_lockref.lock); - clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); - spin_unlock(&gl->gl_lockref.lock); -} - extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d09d9892cd05..c26765080f28 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -252,7 +252,6 @@ struct gfs2_lkstats { enum { /* States */ - HIF_MAY_DEMOTE = 1, HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_WAIT = 10, }; From 6b46a06100dd0e0ebe400573e94ccd09163bfd5b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 9 Dec 2022 23:03:59 +0100 Subject: [PATCH 17/17] gfs2: Remove support for glock holder auto-demotion (2) As a follow-up to the previous commit, move the recovery related code in __gfs2_glock_dq() to gfs2_glock_dq() where it better fits. No functional change. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a4c15e86b1f5..524f3c96b9a4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1590,28 +1590,9 @@ static inline bool needs_demote(struct gfs2_glock *gl) static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned delay = 0; int fast_path = 0; - /* - * If we're in the process of file system withdraw, we cannot just - * dequeue any glocks until our journal is recovered, lest we introduce - * file system corruption. We need two exceptions to this rule: We need - * to allow unlocking of nondisk glocks and the glock for our own - * journal that needs recovery. 
- */ - if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && - glock_blocked_by_withdraw(gl) && - gh->gh_gl != sdp->sd_jinode_gl) { - sdp->sd_glock_dqs_held++; - spin_unlock(&gl->gl_lockref.lock); - might_sleep(); - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, - TASK_UNINTERRUPTIBLE); - spin_lock(&gl->gl_lockref.lock); - } - /* * This holder should not be cached, so mark it for demote. * Note: this should be done before the check for needs_demote @@ -1654,6 +1635,7 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh) void gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&gl->gl_lockref.lock); if (!gfs2_holder_queued(gh)) { @@ -1663,6 +1645,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) */ goto out; } + if (list_is_first(&gh->gh_list, &gl->gl_holders) && !test_bit(HIF_HOLDER, &gh->gh_iflags)) { spin_unlock(&gl->gl_lockref.lock); @@ -1671,6 +1654,24 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_lockref.lock); } + /* + * If we're in the process of file system withdraw, we cannot just + * dequeue any glocks until our journal is recovered, lest we introduce + * file system corruption. We need two exceptions to this rule: We need + * to allow unlocking of nondisk glocks and the glock for our own + * journal that needs recovery. + */ + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + __gfs2_glock_dq(gh); out: spin_unlock(&gl->gl_lockref.lock);