From 484d4fbfdafea581fdc2fd04df6781c3a59122c9 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 08:55:57 +0200 Subject: [PATCH 01/40] ovl: stop using d_alloc_anon()/d_instantiate_anon() Commit f9c34674bc60 ("vfs: factor out helpers d_instantiate_anon() and d_alloc_anon()") was introduced so overlayfs could initialize a non-dir disconnected overlay dentry before overlay inode is attached to it. Since commit ("0af950f57fef ovl: move ovl_entry into ovl_inode"), all ovl_obtain_alias() can do is set DCACHE_OP_*REVALIDATE flags in ->d_flags and OVL_E_UPPER_ALIAS flag in ->d_fsdata. The DCACHE_OP_*REVALIDATE flags and OVL_E_UPPER_ALIAS flag are irrelevant for a disconnected non-dir dentry, so it is better to use d_obtain_alias() instead of open coding it. Suggested-by: Al Viro Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- fs/overlayfs/export.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 7e16bbcad95e..9e316d5f936e 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -289,7 +289,6 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, { struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; struct dentry *upper = upper_alias ?: index; - struct dentry *dentry; struct inode *inode = NULL; struct ovl_entry *oe; struct ovl_inode_params oip = { @@ -320,27 +319,7 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, if (upper) ovl_set_flag(OVL_UPPERDATA, inode); - dentry = d_find_any_alias(inode); - if (dentry) - goto out_iput; - - dentry = d_alloc_anon(inode->i_sb); - if (unlikely(!dentry)) - goto nomem; - - if (upper_alias) - ovl_dentry_set_upper_alias(dentry); - - ovl_dentry_init_reval(dentry, upper, OVL_I_E(inode)); - - return d_instantiate_anon(dentry, inode); - -nomem: - dput(dentry); - dentry = ERR_PTR(-ENOMEM); -out_iput: - iput(inode); - return dentry; + return d_obtain_alias(inode); } /* Get the upper or lower dentry in stack whose on layer @idx */ From acfde6e8abee6b23e53b08606f861d9124288030 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 4 Nov 2023 00:10:18 -0400 Subject: [PATCH 02/40] struct dentry: get rid of randomize_layout idiocy This is beyond ridiculous. There is a reason why that thing is cacheline-aligned... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3da2f0545d5d..1d9f7f132055 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -111,7 +111,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -} __randomize_layout; +}; /* * dentry->d_lock spinlock nesting subclasses: From 6d73c9ce0285adff18b54a5acadfbbbf8ba40dd5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Oct 2023 13:41:06 -0400 Subject: [PATCH 03/40] get rid of __dget() fold into the sole remaining caller Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index c82ae731df9a..b8f1b54a1492 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -948,11 +948,6 @@ static inline void __dget_dlock(struct dentry *dentry) dentry->d_lockref.count++; } -static inline void __dget(struct dentry *dentry) -{ - lockref_get(&dentry->d_lockref); -} - struct dentry *dget_parent(struct dentry *dentry) { int gotref; @@ -1002,7 +997,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode) if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - __dget(alias); + lockref_get(&alias->d_lockref); return alias; } From 641c3ef5cb68a1426d42e6d3aba16db9bdfbe94f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 12:46:30 -0500 Subject: [PATCH 04/40] DCACHE_... ->d_flags bits: switch to BIT() For bits 20..22 (inode type cached in ->d_flags) turn the definitions into expressions like (5 << 20); everything else turns into straight use of BIT() Signed-off-by: Al Viro --- include/linux/dcache.h | 70 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1d9f7f132055..d9c314cc93b8 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -151,13 +151,13 @@ struct dentry_operations { */ /* d_flags entries */ -#define DCACHE_OP_HASH 0x00000001 -#define DCACHE_OP_COMPARE 0x00000002 -#define DCACHE_OP_REVALIDATE 0x00000004 -#define DCACHE_OP_DELETE 0x00000008 -#define DCACHE_OP_PRUNE 0x00000010 +#define DCACHE_OP_HASH BIT(0) +#define DCACHE_OP_COMPARE BIT(1) +#define DCACHE_OP_REVALIDATE BIT(2) +#define DCACHE_OP_DELETE BIT(3) +#define DCACHE_OP_PRUNE BIT(4) -#define DCACHE_DISCONNECTED 0x00000020 +#define DCACHE_DISCONNECTED BIT(5) /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first @@ -168,50 +168,50 @@ struct dentry_operations { * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ -#define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ +#define DCACHE_REFERENCED BIT(6) /* Recently used, don't discard. */ -#define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ +#define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ -#define DCACHE_CANT_MOUNT 0x00000100 -#define DCACHE_GENOCIDE 0x00000200 -#define DCACHE_SHRINK_LIST 0x00000400 +#define DCACHE_CANT_MOUNT BIT(8) +#define DCACHE_GENOCIDE BIT(9) +#define DCACHE_SHRINK_LIST BIT(10) -#define DCACHE_OP_WEAK_REVALIDATE 0x00000800 +#define DCACHE_OP_WEAK_REVALIDATE BIT(11) -#define DCACHE_NFSFS_RENAMED 0x00001000 +#define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000 +#define DCACHE_COOKIE BIT(13) /* For use by dcookie subsystem */ +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED 0x00008000 +#define DCACHE_DENTRY_KILLED BIT(15) -#define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST 0x00080000 +#define DCACHE_LRU_LIST BIT(19) -#define DCACHE_ENTRY_TYPE 0x00700000 -#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */ -#define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ -#define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */ -#define DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */ -#define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */ +#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry (maybe fallthru to nowhere) */ +#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type (or fallthru to such) */ +#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type (or fallthru to such) */ +#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink (or fallthru to such) */ -#define DCACHE_MAY_FREE 0x00800000 -#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ -#define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL 0x04000000 +#define DCACHE_MAY_FREE BIT(23) +#define DCACHE_FALLTHRU BIT(24) /* Fall through to lower layer */ +#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(26) -#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR 0x20000000 -#define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */ +#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(29) +#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ extern seqlock_t rename_lock; From 0bec65a80f1b1ebcda05286e539a204713b70353 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 12:50:29 -0500 Subject: [PATCH 05/40] DCACHE_COOKIE: RIP the last user gone in 2021... Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d9c314cc93b8..92c0b2a1ae2e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -181,7 +181,6 @@ struct dentry_operations { #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_COOKIE BIT(13) /* For use by dcookie subsystem */ #define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) /* Parent inode is watched by some fsnotify listener */ From 8219cb58feddcf28909072015f4e17e29f68c41a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 14:32:05 -0500 Subject: [PATCH 06/40] kill d_{is,set}_fallthru() Introduced in 2015 and never had any in-tree users... Signed-off-by: Al Viro --- fs/dcache.c | 20 ++------------------ include/linux/dcache.h | 17 ++++------------- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index b8f1b54a1492..9f5b2b5c1e6d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -344,7 +344,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } @@ -353,7 +353,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; if (dentry->d_flags & DCACHE_LRU_LIST) @@ -1936,22 +1936,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); - -/* - * d_set_fallthru - Mark a dentry as falling through to a lower layer - * @dentry - The dentry to mark - * - * Mark a dentry as falling through to the lower layer (as set with - * d_pin_lower()). This flag may be recorded on the medium. - */ -void d_set_fallthru(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_FALLTHRU; - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(d_set_fallthru); - static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c0b2a1ae2e..8cd937bb2292 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -195,16 +195,15 @@ struct dentry_operations { #define DCACHE_LRU_LIST BIT(19) #define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry (maybe fallthru to nowhere) */ +#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ #define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ #define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ #define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type (or fallthru to such) */ -#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type (or fallthru to such) */ -#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink (or fallthru to such) */ +#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ +#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ #define DCACHE_MAY_FREE BIT(23) -#define DCACHE_FALLTHRU BIT(24) /* Fall through to lower layer */ #define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ #define DCACHE_OP_REAL BIT(26) @@ -489,14 +488,6 @@ static inline int simple_positive(const struct dentry *dentry) return d_really_is_positive(dentry) && !d_unhashed(dentry); } -extern void d_set_fallthru(struct dentry *dentry); - -static inline bool d_is_fallthru(const struct dentry *dentry) -{ - return dentry->d_flags & DCACHE_FALLTHRU; -} - - extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) From 0d486510f86eb8162022ed61e6dc424a10909a10 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 15:22:40 -0500 Subject: [PATCH 07/40] dentry.h: trim externs d_instantiate_unique() had been gone for 7 years; __d_lookup...() and shrink_dcache_for_umount() are fs/internal.h fodder. Signed-off-by: Al Viro --- fs/internal.h | 4 ++++ include/linux/dcache.h | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 58e43341aebf..9e9fc629f935 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -215,6 +215,10 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *) extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); +extern void shrink_dcache_for_umount(struct super_block *); +extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); +extern struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, unsigned *seq); /* * pipe.c diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8cd937bb2292..9706bf1dc5de 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -218,7 +218,6 @@ extern seqlock_t rename_lock; */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); -extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); @@ -240,7 +239,6 @@ extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_for_umount(struct super_block *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ @@ -275,9 +273,6 @@ extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); -extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); -extern struct dentry *__d_lookup_rcu(const struct dentry *parent, - const struct qstr *name, unsigned *seq); static inline unsigned d_count(const struct dentry *dentry) { From 2fcd38f4de7256e2b5cb23ad22a6e3ebfea7dd18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 15:24:45 -0500 Subject: [PATCH 08/40] [software coproarchaeology] dentry.h: kill a mysterious comment there's a strange comment in front of d_lookup() declaration: /* appendix may either be NULL or be used for transname suffixes */ Looks like nobody had been curious enough to track its history; it predates git, it predates bitkeeper and if you look through the pre-BK trees, you finally arrive at this in 2.1.44-for-davem: /* appendix may either be NULL or be used for transname suffixes */ -extern struct dentry * d_lookup(struct inode * dir, struct qstr * name, - struct qstr * appendix); +extern struct dentry * d_lookup(struct dentry * dir, struct qstr * name); In other words, it refers to the third argument d_lookup() used to have back then. It had been introduced in 2.1.43-pre, on June 12 1997, along with d_lookup(), only to be removed by July 4 1997, presumably when the Cthulhu-awful thing it used to be used for (look for CONFIG_TRANS_NAMES in 2.1.43-pre, and keep a heavy-duty barfbag ready) had been, er, noticed and recognized for what it had been. Despite the appendectomy, the comment remained. Some things really need to be put out of their misery... Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9706bf1dc5de..a5e5e274eee0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -270,7 +270,6 @@ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); -/* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); From 698f1e2b71736977b04f951e2e2ef1c9a80696ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 16:19:59 -0500 Subject: [PATCH 09/40] kill d_backing_dentry() no users left Signed-off-by: Al Viro --- include/linux/dcache.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a5e5e274eee0..fa0414cff85c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -530,21 +530,6 @@ static inline struct inode *d_backing_inode(const struct dentry *upper) return inode; } -/** - * d_backing_dentry - Get upper or lower dentry we should be using - * @upper: The upper layer - * - * This is the helper that should be used to get the dentry of the inode that - * will be used if this dentry were opened as a file. It may be the upper - * dentry or it may be a lower dentry pinned by the upper. - * - * Normal filesystems should not use this to access their own dentries. - */ -static inline struct dentry *d_backing_dentry(struct dentry *upper) -{ - return upper; -} - /** * d_real - Return the real dentry * @dentry: the dentry to query From b7a14708aafeb12730ab23de8ecedd14f401a30c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Nov 2023 14:35:16 -0400 Subject: [PATCH 10/40] switch nfsd_client_rmdir() to use of simple_recursive_removal() nfsd_client_rmdir() open-codes a subset of simple_recursive_removal(). Conversion to calling simple_recursive_removal() allows to clean things up quite a bit. While we are at it, nfsdfs_create_files() doesn't need to mess with "pick the reference to struct nfsdfs_client from the already created parent" - the caller already knows it (that's where the parent got it from, after all), so we might as well just pass it as an explicit argument. So __get_nfsdfs_client() is only needed in get_nfsdfs_client() and can be folded in there. Incidentally, the locking in get_nfsdfs_client() is too heavy - we don't need ->i_rwsem for that, ->i_lock serves just fine. Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/nfsd/nfsctl.c | 70 ++++++++++-------------------------------------- 1 file changed, 14 insertions(+), 56 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3e15b72f421d..50df5e9d0069 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1236,63 +1236,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name, #endif -static void clear_ncl(struct inode *inode) +static void clear_ncl(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct nfsdfs_client *ncl = inode->i_private; + spin_lock(&inode->i_lock); inode->i_private = NULL; + spin_unlock(&inode->i_lock); kref_put(&ncl->cl_ref, ncl->cl_release); } -static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode) -{ - struct nfsdfs_client *nc = inode->i_private; - - if (nc) - kref_get(&nc->cl_ref); - return nc; -} - struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; - inode_lock_shared(inode); - nc = __get_nfsdfs_client(inode); - inode_unlock_shared(inode); + spin_lock(&inode->i_lock); + nc = inode->i_private; + if (nc) + kref_get(&nc->cl_ref); + spin_unlock(&inode->i_lock); return nc; } -/* from __rpc_unlink */ -static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) -{ - int ret; - - clear_ncl(d_inode(dentry)); - dget(dentry); - ret = simple_unlink(dir, dentry); - d_drop(dentry); - fsnotify_unlink(dir, dentry); - dput(dentry); - WARN_ON_ONCE(ret); -} - -static void nfsdfs_remove_files(struct dentry *root) -{ - struct dentry *dentry, *tmp; - - list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) { - if (!simple_positive(dentry)) { - WARN_ON_ONCE(1); /* I think this can't happen? */ - continue; - } - nfsdfs_remove_file(d_inode(root), dentry); - } -} /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, + struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); @@ -1311,8 +1282,9 @@ static int nfsdfs_create_files(struct dentry *root, dput(dentry); goto out; } + kref_get(&ncl->cl_ref); inode->i_fop = files->ops; - inode->i_private = __get_nfsdfs_client(dir); + inode->i_private = ncl; d_add(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) @@ -1321,7 +1293,6 @@ static int nfsdfs_create_files(struct dentry *root, inode_unlock(dir); return 0; out: - nfsdfs_remove_files(root); inode_unlock(dir); return -ENOMEM; } @@ -1341,7 +1312,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? */ return NULL; - ret = nfsdfs_create_files(dentry, files, fdentries); + ret = nfsdfs_create_files(dentry, files, ncl, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; @@ -1352,20 +1323,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, /* Taken from __rpc_rmdir: */ void nfsd_client_rmdir(struct dentry *dentry) { - struct inode *dir = d_inode(dentry->d_parent); - struct inode *inode = d_inode(dentry); - int ret; - - inode_lock(dir); - nfsdfs_remove_files(dentry); - clear_ncl(inode); - dget(dentry); - ret = simple_rmdir(dir, dentry); - WARN_ON_ONCE(ret); - d_drop(dentry); - fsnotify_rmdir(dir, dentry); - dput(dentry); - inode_unlock(dir); + simple_recursive_removal(dentry, clear_ncl); } static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) From b31559f8e471f402cd71117f35b9cde52d192138 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Nov 2023 12:35:37 -0500 Subject: [PATCH 11/40] coda_flag_children(): cope with dentries turning negative ->d_lock on parent does not stabilize ->d_inode of child. We don't do much with that inode in there, but we need at least to avoid struct inode getting freed under us... [rcu_read_lock() is not needed here, since parent's ->d_lock provides an rcu-critical area] Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/coda/cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 3b8c4513118f..d6254cf9f397 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -94,12 +94,12 @@ static void coda_flag_children(struct dentry *parent, int flag) spin_lock(&parent->d_lock); list_for_each_entry(de, &parent->d_subdirs, d_child) { + struct inode *inode = d_inode_rcu(de); /* don't know what to do with negative dentries */ - if (d_inode(de) ) - coda_flag_inode(d_inode(de), flag); + if (inode) + coda_flag_inode(inode, flag); } spin_unlock(&parent->d_lock); - return; } void coda_flag_inode_children(struct inode *inode, int flag) From da549bdd15c295c24b2ee7ffe7ad0f3877fa8a87 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Nov 2023 02:00:39 -0500 Subject: [PATCH 12/40] dentry: switch the lists of children to hlist Saves a pointer per struct dentry and actually makes the things less clumsy. Cleaned the d_walk() and dcache_readdir() a bit by use of hlist_for_... iterators. A couple of new helpers - d_first_child() and d_next_sibling(), to make the expressions less awful. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 9 +++ arch/powerpc/platforms/cell/spufs/inode.c | 5 +- fs/afs/dynroot.c | 5 +- fs/autofs/expire.c | 7 +-- fs/ceph/dir.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/coda/cache.c | 2 +- fs/dcache.c | 76 +++++++++++------------ fs/libfs.c | 45 +++++++------- fs/notify/fsnotify.c | 2 +- fs/tracefs/inode.c | 34 +++++----- include/linux/dcache.h | 20 ++++-- 12 files changed, 108 insertions(+), 101 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 878e72b2f8b7..331405f4b29f 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1061,3 +1061,12 @@ export_operations ->encode_fh() no longer has a default implementation to encode FILEID_INO32_GEN* file handles. Filesystems that used the default implementation may use the generic helper generic_encode_ino32_fh() explicitly. + +--- + +**mandatory** + +The list of children anchored in parent dentry got turned into hlist now. +Field names got changed (->d_children/->d_sib instead of ->d_subdirs/->d_child +for anchor/entries resp.), so any affected places will be immediately caught +by compiler. diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 10c1320adfd0..030de2b8c145 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -145,10 +145,11 @@ spufs_evict_inode(struct inode *inode) static void spufs_prune_dir(struct dentry *dir) { - struct dentry *dentry, *tmp; + struct dentry *dentry; + struct hlist_node *n; inode_lock(d_inode(dir)); - list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { + hlist_for_each_entry_safe(dentry, n, &dir->d_children, d_sib) { spin_lock(&dentry->d_lock); if (simple_positive(dentry)) { dget_dlock(dentry); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 4d04ef2d3ae7..fe45462834cc 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -370,7 +370,7 @@ error: void afs_dynroot_depopulate(struct super_block *sb) { struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir, *tmp; + struct dentry *root = sb->s_root, *subdir; /* Prevent more subdirs from being created */ mutex_lock(&net->proc_cells_lock); @@ -379,10 +379,11 @@ void afs_dynroot_depopulate(struct super_block *sb) mutex_unlock(&net->proc_cells_lock); if (root) { + struct hlist_node *n; inode_lock(root->d_inode); /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) { if (subdir->d_fsdata) { subdir->d_fsdata = NULL; dput(subdir); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 038b3d2d9f57..39d8c84c16f4 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -73,12 +73,9 @@ done: /* p->d_lock held */ static struct dentry *positive_after(struct dentry *p, struct dentry *child) { - if (child) - child = list_next_entry(child, d_child); - else - child = list_first_entry(&p->d_subdirs, struct dentry, d_child); + child = child ? d_next_sibling(child) : d_first_child(p); - list_for_each_entry_from(child, &p->d_subdirs, d_child) { + hlist_for_each_entry_from(child, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { dget_dlock(child); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 91709934c8b1..678596684596 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on - * d_child when we initially get results back from the MDS, and + * d_children when we initially get results back from the MDS, and * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d95eb525519a..02ebfabfc8ee 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2128,7 +2128,7 @@ static bool drop_negative_children(struct dentry *dentry) goto out; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) { + hlist_for_each_entry(child, &dentry->d_children, d_sib) { if (d_really_is_positive(child)) { all_negative = false; break; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index d6254cf9f397..970f0022ec52 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag) struct dentry *de; spin_lock(&parent->d_lock); - list_for_each_entry(de, &parent->d_subdirs, d_child) { + hlist_for_each_entry(de, &parent->d_children, d_sib) { struct inode *inode = d_inode_rcu(de); /* don't know what to do with negative dentries */ if (inode) diff --git a/fs/dcache.c b/fs/dcache.c index c82ae731df9a..59f76c9a15d1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -51,8 +51,8 @@ * - d_lru * - d_count * - d_unhashed() - * - d_parent and d_subdirs - * - childrens' d_child and d_parent + * - d_parent and d_chilren + * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: @@ -537,7 +537,7 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* @@ -545,12 +545,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (unlikely(list_empty(&dentry->d_child))) + if (unlikely(hlist_unhashed(&dentry->d_sib))) return; - __list_del_entry(&dentry->d_child); + __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been - * a normal list member, it didn't matter - ->d_child.next would've + * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - @@ -558,20 +558,20 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found - * using the value left in its ->d_child.next. And if _that_ + * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * - * Solution: make sure that the pointer left behind in ->d_child.next + * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ - while (dentry->d_child.next != &parent->d_subdirs) { - next = list_entry(dentry->d_child.next, struct dentry, d_child); + while (dentry->d_sib.next) { + next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; - dentry->d_child.next = next->d_child.next; + dentry->d_sib.next = next->d_sib.next; } } @@ -600,7 +600,7 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - dentry_unlist(dentry, parent); + dentry_unlist(dentry); if (parent) spin_unlock(&parent->d_lock); if (dentry->d_inode) @@ -1348,8 +1348,7 @@ enum d_walk_ret { static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { - struct dentry *this_parent; - struct list_head *next; + struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; @@ -1371,13 +1370,9 @@ again: break; } repeat: - next = this_parent->d_subdirs.next; + dentry = d_first_child(this_parent); resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; - + hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; @@ -1398,7 +1393,7 @@ resume: continue; } - if (!list_empty(&dentry->d_subdirs)) { + if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; @@ -1413,24 +1408,23 @@ resume: rcu_read_lock(); ascend: if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; + dentry = this_parent; + this_parent = dentry->d_parent; - spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; + hlist_for_each_entry_continue(dentry, d_sib) { + if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { + rcu_read_unlock(); + goto resume; + } + } + goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; @@ -1530,7 +1524,7 @@ out: * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level - * whenever the d_subdirs list is non-empty and continue + * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, @@ -1657,7 +1651,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ - if (!list_empty(&dentry->d_subdirs)) + if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ @@ -1814,9 +1808,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); + INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); - INIT_LIST_HEAD(&dentry->d_child); + INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { @@ -1855,7 +1849,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) */ __dget_dlock(parent); dentry->d_parent = parent; - list_add(&dentry->d_child, &parent->d_subdirs); + hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; @@ -2993,11 +2987,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target, } else { target->d_parent = old_parent; swap_names(dentry, target); - list_move(&target->d_child, &target->d_parent->d_subdirs); + if (!hlist_unhashed(&target->d_sib)) + __hlist_del(&target->d_sib); + hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } - list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + if (!hlist_unhashed(&dentry->d_sib)) + __hlist_del(&dentry->d_sib); + hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); diff --git a/fs/libfs.c b/fs/libfs.c index e9440d55073c..46c9177769c1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -104,15 +104,16 @@ EXPORT_SYMBOL(dcache_dir_close); * If no such element exists, NULL is returned. */ static struct dentry *scan_positives(struct dentry *cursor, - struct list_head *p, + struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); - while ((p = p->next) != &dentry->d_subdirs) { - struct dentry *d = list_entry(p, struct dentry, d_child); + while (*p) { + struct dentry *d = hlist_entry(*p, struct dentry, d_sib); + p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; @@ -126,8 +127,10 @@ static struct dentry *scan_positives(struct dentry *cursor, count = 1; } if (need_resched()) { - list_move(&cursor->d_child, p); - p = &cursor->d_child; + if (!hlist_unhashed(&cursor->d_sib)) + __hlist_del(&cursor->d_sib); + hlist_add_behind(&cursor->d_sib, &d->d_sib); + p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); @@ -159,13 +162,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) inode_lock_shared(dentry->d_inode); if (offset > 2) - to = scan_positives(cursor, &dentry->d_subdirs, + to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); if (to) - list_move(&cursor->d_child, &to->d_child); - else - list_del_init(&cursor->d_child); + hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); @@ -187,19 +189,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *anchor = &dentry->d_subdirs; struct dentry *next = NULL; - struct list_head *p; + struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = anchor; - else if (!list_empty(&cursor->d_child)) - p = &cursor->d_child; + p = &dentry->d_children.first; else - return 0; + p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, @@ -207,13 +206,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; - p = &next->d_child; + p = &next->d_sib.next; } spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); if (next) - list_move_tail(&cursor->d_child, &next->d_child); - else - list_del_init(&cursor->d_child); + hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); @@ -492,12 +490,11 @@ const struct file_operations simple_offset_dir_operations = { static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { - struct dentry *child = NULL; - struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; + struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); - while ((p = p->next) != &parent->d_subdirs) { - struct dentry *d = container_of(p, struct dentry, d_child); + d = prev ? d_next_sibling(prev) : d_first_child(parent); + hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) @@ -658,7 +655,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) { + hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7974e91ffe13..8bfd690e9f10 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); - list_for_each_entry(child, &alias->d_subdirs, d_child) { + hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 5b54948514fe..61ca5fcf10f9 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -199,26 +199,21 @@ static void change_gid(struct dentry *dentry, kgid_t gid) */ static void set_gid(struct dentry *parent, kgid_t gid) { - struct dentry *this_parent; - struct list_head *next; + struct dentry *this_parent, *dentry; this_parent = parent; spin_lock(&this_parent->d_lock); change_gid(this_parent, gid); repeat: - next = this_parent->d_subdirs.next; + dentry = d_first_child(this_parent); resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; - + hlist_for_each_entry_from(dentry, d_sib) { spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); change_gid(dentry, gid); - if (!list_empty(&dentry->d_subdirs)) { + if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; @@ -233,21 +228,20 @@ resume: rcu_read_lock(); ascend: if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; + dentry = this_parent; + this_parent = dentry->d_parent; - spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; + hlist_for_each_entry_continue(dentry, d_sib) { + if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { + rcu_read_unlock(); + goto resume; + } + } + goto ascend; } rcu_read_unlock(); spin_unlock(&this_parent->d_lock); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3da2f0545d5d..0e397a0c519c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,12 +68,12 @@ extern const struct qstr dotdot_name; * large memory footprint increase). */ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 32 /* 192 bytes */ +# define DNAME_INLINE_LEN 40 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ -# else # define DNAME_INLINE_LEN 40 /* 128 bytes */ +# else +# define DNAME_INLINE_LEN 44 /* 128 bytes */ # endif #endif @@ -101,8 +101,8 @@ struct dentry { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; - struct list_head d_child; /* child of parent list */ - struct list_head d_subdirs; /* our children */ + struct hlist_node d_sib; /* child of parent list */ + struct hlist_head d_children; /* our children */ /* * d_alias and d_rcu can share memory */ @@ -600,4 +600,14 @@ struct name_snapshot { void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); +static inline struct dentry *d_first_child(const struct dentry *dentry) +{ + return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib); +} + +static inline struct dentry *d_next_sibling(const struct dentry *dentry) +{ + return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); +} + #endif /* __LINUX_DCACHE_H */ From 3fcf535626a44e107e2d536a2c43da0fb5bde121 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Nov 2023 15:21:33 -0500 Subject: [PATCH 13/40] centralize killing dentry from shrink list new helper unifying identical bits of shrink_dentry_list() and shring_dcache_for_umount() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 59f76c9a15d1..bb862a304e1b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1174,10 +1174,18 @@ out: return false; } +static inline void shrink_kill(struct dentry *victim, struct list_head *list) +{ + struct dentry *parent = victim->d_parent; + if (parent != victim) + __dput_to_list(parent, list); + __dentry_kill(victim); +} + void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { - struct dentry *dentry, *parent; + struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); @@ -1195,10 +1203,7 @@ void shrink_dentry_list(struct list_head *list) } rcu_read_unlock(); d_shrink_del(dentry); - parent = dentry->d_parent; - if (parent != dentry) - __dput_to_list(parent, list); - __dentry_kill(dentry); + shrink_kill(dentry, list); } } @@ -1629,17 +1634,13 @@ void shrink_dcache_parent(struct dentry *parent) data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { - struct dentry *parent; spin_lock(&data.victim->d_lock); if (!shrink_lock_dentry(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { rcu_read_unlock(); - parent = data.victim->d_parent; - if (parent != data.victim) - __dput_to_list(parent, &data.dispose); - __dentry_kill(data.victim); + shrink_kill(data.victim, &data.dispose); } } if (!list_empty(&data.dispose)) From cd9f84f35c2eaaf4da7e111021b604662326d8aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 13:57:12 -0400 Subject: [PATCH 14/40] shrink_dentry_list(): no need to check that dentry refcount is marked dead ... we won't see DCACHE_MAY_FREE on anything that is *not* dead and checking d_flags is just as cheap as checking refcount. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index bb862a304e1b..c1025921f8d3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1191,11 +1191,10 @@ void shrink_dentry_list(struct list_head *list) spin_lock(&dentry->d_lock); rcu_read_lock(); if (!shrink_lock_dentry(dentry)) { - bool can_free = false; + bool can_free; rcu_read_unlock(); d_shrink_del(dentry); - if (dentry->d_lockref.count < 0) - can_free = dentry->d_flags & DCACHE_MAY_FREE; + can_free = dentry->d_flags & DCACHE_MAY_FREE; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); From 15220fbf187100e1cc1ad553b7d21633bdc27e76 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 00:02:14 -0400 Subject: [PATCH 15/40] fast_dput(): having ->d_delete() is not reason to delay refcount decrement ->d_delete() is a way for filesystem to tell that dentry is not worth keeping cached. It is not guaranteed to be called every time a dentry has refcount drop down to zero; it is not guaranteed to be called before dentry gets evicted. In other words, it is not suitable for any kind of keeping track of dentry state. None of the in-tree filesystems attempt to use it that way, fortunately. So the contortions done by fast_dput() (as well as dentry_kill()) are not warranted. fast_dput() certainly should treat having ->d_delete() instance as "can't assume we'll be keeping it", but that's not different from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar to making ->d_delete() returns true when called). Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index c1025921f8d3..00c19041adf3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -768,15 +768,7 @@ static inline bool fast_dput(struct dentry *dentry) unsigned int d_flags; /* - * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put_or_lock". - */ - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) - return lockref_put_or_lock(&dentry->d_lockref); - - /* - * .. otherwise, we can try to just decrement the - * lockref optimistically. + * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); @@ -830,7 +822,7 @@ static inline bool fast_dput(struct dentry *dentry) */ smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE | DCACHE_DISCONNECTED | DCACHE_DONTCACHE; /* Nothing to do? Dropping the reference was all we needed? */ From 504e08cebe1d4e1efe25f915234f646e74a364a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Nov 2023 01:08:54 -0400 Subject: [PATCH 16/40] fast_dput(): handle underflows gracefully If refcount is less than 1, we should just warn, unlock dentry and return true, so that the caller doesn't try to do anything else. Taking care of that leaves the rest of "lockref_put_return() has failed" case equivalent to "decrement refcount and rejoin the normal slow path after the point where we grab ->d_lock". NOTE: lockref_put_return() is strictly a fastpath thing - unlike the rest of lockref primitives, it does not contain a fallback. Caller (and it looks like fast_dput() is the only legitimate one in the entire kernel) has to do that itself. Reasons for lockref_put_return() failures: * ->d_lock held by somebody * refcount <= 0 * ... or an architecture not supporting lockref use of cmpxchg - sparc, anything non-SMP, config with spinlock debugging... We could add a fallback, but it would be a clumsy API - we'd have to distinguish between: (1) refcount > 1 - decremented, lock not held on return (2) refcount < 1 - left alone, probably no sense to hold the lock (3) refcount is 1, no cmphxcg - decremented, lock held on return (4) refcount is 1, cmphxcg supported - decremented, lock *NOT* held on return. We want to return with no lock held in case (4); that's the whole point of that thing. We very much do not want to have the fallback in case (3) return without a lock, since the caller might have to retake it in that case. So it wouldn't be more convenient than doing the fallback in the caller and it would be very easy to screw up, especially since the test coverage would suck - no way to test (3) and (4) on the same kernel build. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 00c19041adf3..9edabc7e2e64 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -779,12 +779,12 @@ static inline bool fast_dput(struct dentry *dentry) */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); - if (dentry->d_lockref.count > 1) { - dentry->d_lockref.count--; + if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } - return false; + dentry->d_lockref.count--; + goto locked; } /* @@ -842,6 +842,7 @@ static inline bool fast_dput(struct dentry *dentry) * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ +locked: if (dentry->d_lockref.count) { spin_unlock(&dentry->d_lock); return true; From 15f23734a1def3b22ba790bbf9f0d01949aae231 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 00:11:58 -0400 Subject: [PATCH 17/40] fast_dput(): new rules for refcount By now there is only one place in entire fast_dput() where we return false; that happens after refcount had been decremented and found (under ->d_lock) to be zero. In that case, just prior to returning false to caller, fast_dput() forcibly changes the refcount from 0 to 1. Lift that resetting refcount to 1 into the callers; later in the series it will be massaged out of existence. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9edabc7e2e64..a00e9ba22480 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -847,13 +847,6 @@ locked: spin_unlock(&dentry->d_lock); return true; } - - /* - * Re-get the reference we optimistically dropped. We hold the - * lock, and we just tested that it was zero, so we can just - * set it to 1. - */ - dentry->d_lockref.count = 1; return false; } @@ -896,6 +889,7 @@ void dput(struct dentry *dentry) } /* Slow case: now with the dentry lock held */ + dentry->d_lockref.count = 1; rcu_read_unlock(); if (likely(retain_dentry(dentry))) { @@ -930,6 +924,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) return; } rcu_read_unlock(); + dentry->d_lockref.count = 1; if (!retain_dentry(dentry)) __dput_to_list(dentry, list); spin_unlock(&dentry->d_lock); From 6511f6be777f3b06e904c7407fd9f2a39dae8b67 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 00:43:48 -0400 Subject: [PATCH 18/40] __dput_to_list(): do decrement of refcount in the callers ... and rename it to to_shrink_list(), seeing that it no longer does dropping any references Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index a00e9ba22480..0718b3895c12 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -902,16 +902,13 @@ void dput(struct dentry *dentry) } EXPORT_SYMBOL(dput); -static void __dput_to_list(struct dentry *dentry, struct list_head *list) +static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - /* let the owner of the list it's on deal with it */ - --dentry->d_lockref.count; - } else { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); - if (!--dentry->d_lockref.count) + if (!dentry->d_lockref.count) d_shrink_add(dentry, list); } } @@ -925,8 +922,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) } rcu_read_unlock(); dentry->d_lockref.count = 1; - if (!retain_dentry(dentry)) - __dput_to_list(dentry, list); + if (!retain_dentry(dentry)) { + --dentry->d_lockref.count; + to_shrink_list(dentry, list); + } spin_unlock(&dentry->d_lock); } @@ -1165,8 +1164,10 @@ out: static inline void shrink_kill(struct dentry *victim, struct list_head *list) { struct dentry *parent = victim->d_parent; - if (parent != victim) - __dput_to_list(parent, list); + if (parent != victim) { + --parent->d_lockref.count; + to_shrink_list(parent, list); + } __dentry_kill(victim); } From e9d130d05077e71b1224ad96e419f5f5512b8574 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Oct 2023 19:09:11 -0400 Subject: [PATCH 19/40] make retain_dentry() neutral with respect to refcounting retain_dentry() used to decrement refcount if and only if it returned true. Lift those decrements into the callers. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index 0718b3895c12..2e74f3f2ce2e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry) return false; /* retain; LRU fodder */ - dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) @@ -744,6 +743,8 @@ got_locks: } else if (likely(!retain_dentry(dentry))) { __dentry_kill(dentry); return parent; + } else { + dentry->d_lockref.count--; } /* we are keeping it, after all */ if (inode) @@ -893,6 +894,7 @@ void dput(struct dentry *dentry) rcu_read_unlock(); if (likely(retain_dentry(dentry))) { + dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; } @@ -925,6 +927,8 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) if (!retain_dentry(dentry)) { --dentry->d_lockref.count; to_shrink_list(dentry, list); + } else { + --dentry->d_lockref.count; } spin_unlock(&dentry->d_lock); } From ee0c82503dcd0d14cc1ad53da18d32a04f612c4c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Oct 2023 18:38:27 -0400 Subject: [PATCH 20/40] __dentry_kill(): get consistent rules for victim's refcount Currently we call it with refcount equal to 1 when called from dentry_kill(); all other callers have it equal to 0. Make it always be called with zero refcount; on this step we just decrement it before the calls in dentry_kill(). That is safe, since all places that care about the value of refcount either do that under ->d_lock or hold a reference to dentry in question. Either is sufficient to prevent observing a dentry immediately prior to __dentry_kill() getting called from dentry_kill(). Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 2e74f3f2ce2e..b527db8e5901 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -729,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry) goto slow_positive; } } + dentry->d_lockref.count--; __dentry_kill(dentry); return parent; @@ -741,6 +742,7 @@ got_locks: if (unlikely(dentry->d_lockref.count != 1)) { dentry->d_lockref.count--; } else if (likely(!retain_dentry(dentry))) { + dentry->d_lockref.count--; __dentry_kill(dentry); return parent; } else { From b06c684d3984ef64e792ec3e89889c96cab97b5e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 31 Oct 2023 00:23:35 -0400 Subject: [PATCH 21/40] dentry_kill(): don't bother with retain_dentry() on slow path We have already checked it and dentry used to look not worthy of keeping. The only hard obstacle to evicting dentry is non-zero refcount; everything else is advisory - e.g. memory pressure could evict any dentry found with refcount zero. On the slow path in dentry_kill() we had dropped and regained ->d_lock; we must recheck the refcount, but everything else is not worth bothering with. Note that filesystem can not count upon ->d_delete() being called for dentry - not even once. Again, memory pressure (as well as d_prune_aliases(), or attempted rmdir() of ancestor, or...) will not call ->d_delete() at all. So from the correctness point of view we are fine doing the check only once. And it makes things simpler down the road. Signed-off-by: Al Viro --- fs/dcache.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index b527db8e5901..80992e49561c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -739,14 +739,10 @@ slow_positive: spin_lock(&dentry->d_lock); parent = lock_parent(dentry); got_locks: - if (unlikely(dentry->d_lockref.count != 1)) { - dentry->d_lockref.count--; - } else if (likely(!retain_dentry(dentry))) { - dentry->d_lockref.count--; + dentry->d_lockref.count--; + if (likely(dentry->d_lockref.count == 0)) { __dentry_kill(dentry); return parent; - } else { - dentry->d_lockref.count--; } /* we are keeping it, after all */ if (inode) From 2f42f1eb9093834b635991c70d0273fbe249eabf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 01:09:50 -0400 Subject: [PATCH 22/40] Call retain_dentry() with refcount 0 Instead of bumping it from 0 to 1, calling retain_dentry(), then decrementing it back to 0 (with ->d_lock held all the way through), just leave refcount at 0 through all of that. It will have a visible effect for ->d_delete() - now it can be called with refcount 0 instead of 1 and it can no longer play silly buggers with dropping/regaining ->d_lock. Not that any in-tree instances tried to (it's pretty hard to get right). Any out-of-tree ones will have to adjust (assuming they need any changes). Note that we do not need to extend rcu-critical area here - we have verified that refcount is non-negative after having grabbed ->d_lock, so nobody will be able to free dentry until they get into __dentry_kill(), which won't happen until they manage to grab ->d_lock. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 8 ++++++++ fs/dcache.c | 10 ++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 331405f4b29f..da8073b592ff 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1070,3 +1070,11 @@ The list of children anchored in parent dentry got turned into hlist now. Field names got changed (->d_children/->d_sib instead of ->d_subdirs/->d_child for anchor/entries resp.), so any affected places will be immediately caught by compiler. + +--- + +**mandatory** + +->d_delete() instances are now called for dentries with ->d_lock held +and refcount equal to 0. They are not permitted to drop/regain ->d_lock. +None of in-tree instances did anything of that sort. Make sure yours do not... diff --git a/fs/dcache.c b/fs/dcache.c index 80992e49561c..8ce0fe70f303 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -888,15 +888,14 @@ void dput(struct dentry *dentry) } /* Slow case: now with the dentry lock held */ - dentry->d_lockref.count = 1; rcu_read_unlock(); if (likely(retain_dentry(dentry))) { - dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; } + dentry->d_lockref.count = 1; dentry = dentry_kill(dentry); } } @@ -921,13 +920,8 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) return; } rcu_read_unlock(); - dentry->d_lockref.count = 1; - if (!retain_dentry(dentry)) { - --dentry->d_lockref.count; + if (!retain_dentry(dentry)) to_shrink_list(dentry, list); - } else { - --dentry->d_lockref.count; - } spin_unlock(&dentry->d_lock); } From f05441c7e16e6a720cf24e08999661a76ff77478 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 31 Oct 2023 00:45:40 -0400 Subject: [PATCH 23/40] fold the call of retain_dentry() into fast_dput() Calls of retain_dentry() happen immediately after getting false from fast_dput() and getting true from retain_dentry() is treated the same way as non-zero refcount would be treated by fast_dput() - unlock dentry and bugger off. Doing that in fast_dput() itself is simpler. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 8ce0fe70f303..b69ff3a0b30f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -757,6 +757,8 @@ got_locks: * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. + * In that case refcount is guaranteed to be zero and we have already + * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! @@ -842,7 +844,7 @@ static inline bool fast_dput(struct dentry *dentry) * don't need to do anything else. */ locked: - if (dentry->d_lockref.count) { + if (dentry->d_lockref.count || retain_dentry(dentry)) { spin_unlock(&dentry->d_lock); return true; } @@ -889,12 +891,6 @@ void dput(struct dentry *dentry) /* Slow case: now with the dentry lock held */ rcu_read_unlock(); - - if (likely(retain_dentry(dentry))) { - spin_unlock(&dentry->d_lock); - return; - } - dentry->d_lockref.count = 1; dentry = dentry_kill(dentry); } @@ -920,8 +916,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) return; } rcu_read_unlock(); - if (!retain_dentry(dentry)) - to_shrink_list(dentry, list); + to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } From 339e9e13530ba750fe0868c5e51bc23be07fd1f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 31 Oct 2023 01:23:47 -0400 Subject: [PATCH 24/40] don't try to cut corners in shrink_lock_dentry() That is to say, do *not* treat the ->d_inode or ->d_parent changes as "it's hard, return false; somebody must have grabbed it, so even if has zero refcount, we don't need to bother killing it - final dput() from whoever grabbed it would've done everything". First of all, that is not guaranteed. It might have been dropped by shrink_kill() handling of victim's parent, which would've found it already on a shrink list (ours) and decided that they don't need to put it on their shrink list. What's more, dentry_kill() is doing pretty much the same thing, cutting its own set of corners (it assumes that dentry can't go from positive to negative, so its inode can change but only once and only in one direction). Doing that right allows to get rid of that not-quite-duplication and removes the only reason for re-incrementing refcount before the call of dentry_kill(). Replacement is called lock_for_kill(); called under rcu_read_lock and with ->d_lock held. If it returns false, dentry has non-zero refcount and the same locks are held. If it returns true, dentry has zero refcount and all locks required by __dentry_kill() are taken. Part of __lock_parent() had been lifted into lock_parent() to allow its reuse. Now it's called with rcu_read_lock already held and dentry already unlocked. Note that this is not the final change - locking requirements for __dentry_kill() are going to change later in the series and the set of locks taken by lock_for_kill() will be adjusted. Both lock_parent() and __lock_parent() will be gone once that happens. Signed-off-by: Al Viro --- fs/dcache.c | 159 ++++++++++++++++++++++------------------------------ 1 file changed, 66 insertions(+), 93 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index b69ff3a0b30f..a7f99d46c41b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry) static struct dentry *__lock_parent(struct dentry *dentry) { struct dentry *parent; - rcu_read_lock(); - spin_unlock(&dentry->d_lock); again: parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); @@ -642,7 +640,6 @@ again: spin_unlock(&parent->d_lock); goto again; } - rcu_read_unlock(); if (parent != dentry) spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); else @@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry) return NULL; if (likely(spin_trylock(&parent->d_lock))) return parent; - return __lock_parent(dentry); + rcu_read_lock(); + spin_unlock(&dentry->d_lock); + parent = __lock_parent(dentry); + rcu_read_unlock(); + return parent; +} + +/* + * Lock a dentry for feeding it to __dentry_kill(). + * Called under rcu_read_lock() and dentry->d_lock; the former + * guarantees that nothing we access will be freed under us. + * Note that dentry is *not* protected from concurrent dentry_kill(), + * d_delete(), etc. + * + * Return false if dentry is busy. Otherwise, return true and have + * that dentry's inode and parent both locked. + */ + +static bool lock_for_kill(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = dentry->d_parent; + + if (unlikely(dentry->d_lockref.count)) + return false; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto slow; + if (dentry == parent) + return true; + if (likely(spin_trylock(&parent->d_lock))) + return true; + + if (inode) + spin_unlock(&inode->i_lock); +slow: + spin_unlock(&dentry->d_lock); + + for (;;) { + if (inode) + spin_lock(&inode->i_lock); + parent = __lock_parent(dentry); + if (likely(inode == dentry->d_inode)) + break; + if (inode) + spin_unlock(&inode->i_lock); + inode = dentry->d_inode; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + } + if (likely(!dentry->d_lockref.count)) + return true; + if (inode) + spin_unlock(&inode->i_lock); + if (parent) + spin_unlock(&parent->d_lock); + return false; } static inline bool retain_dentry(struct dentry *dentry) @@ -710,45 +764,16 @@ EXPORT_SYMBOL(d_mark_dontcache); static struct dentry *dentry_kill(struct dentry *dentry) __releases(dentry->d_lock) { - struct inode *inode = dentry->d_inode; - struct dentry *parent = NULL; - if (inode && unlikely(!spin_trylock(&inode->i_lock))) - goto slow_positive; - - if (!IS_ROOT(dentry)) { - parent = dentry->d_parent; - if (unlikely(!spin_trylock(&parent->d_lock))) { - parent = __lock_parent(dentry); - if (likely(inode || !dentry->d_inode)) - goto got_locks; - /* negative that became positive */ - if (parent) - spin_unlock(&parent->d_lock); - inode = dentry->d_inode; - goto slow_positive; - } - } dentry->d_lockref.count--; - __dentry_kill(dentry); - return parent; - -slow_positive: - spin_unlock(&dentry->d_lock); - spin_lock(&inode->i_lock); - spin_lock(&dentry->d_lock); - parent = lock_parent(dentry); -got_locks: - dentry->d_lockref.count--; - if (likely(dentry->d_lockref.count == 0)) { + rcu_read_lock(); + if (likely(lock_for_kill(dentry))) { + struct dentry *parent = dentry->d_parent; + rcu_read_unlock(); __dentry_kill(dentry); - return parent; + return parent != dentry ? parent : NULL; } - /* we are keeping it, after all */ - if (inode) - spin_unlock(&inode->i_lock); - if (parent) - spin_unlock(&parent->d_lock); + rcu_read_unlock(); spin_unlock(&dentry->d_lock); return NULL; } @@ -1100,58 +1125,6 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Lock a dentry from shrink list. - * Called under rcu_read_lock() and dentry->d_lock; the former - * guarantees that nothing we access will be freed under us. - * Note that dentry is *not* protected from concurrent dentry_kill(), - * d_delete(), etc. - * - * Return false if dentry has been disrupted or grabbed, leaving - * the caller to kick it off-list. Otherwise, return true and have - * that dentry's inode and parent both locked. - */ -static bool shrink_lock_dentry(struct dentry *dentry) -{ - struct inode *inode; - struct dentry *parent; - - if (dentry->d_lockref.count) - return false; - - inode = dentry->d_inode; - if (inode && unlikely(!spin_trylock(&inode->i_lock))) { - spin_unlock(&dentry->d_lock); - spin_lock(&inode->i_lock); - spin_lock(&dentry->d_lock); - if (unlikely(dentry->d_lockref.count)) - goto out; - /* changed inode means that somebody had grabbed it */ - if (unlikely(inode != dentry->d_inode)) - goto out; - } - - parent = dentry->d_parent; - if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock))) - return true; - - spin_unlock(&dentry->d_lock); - spin_lock(&parent->d_lock); - if (unlikely(parent != dentry->d_parent)) { - spin_unlock(&parent->d_lock); - spin_lock(&dentry->d_lock); - goto out; - } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (likely(!dentry->d_lockref.count)) - return true; - spin_unlock(&parent->d_lock); -out: - if (inode) - spin_unlock(&inode->i_lock); - return false; -} - static inline void shrink_kill(struct dentry *victim, struct list_head *list) { struct dentry *parent = victim->d_parent; @@ -1170,7 +1143,7 @@ void shrink_dentry_list(struct list_head *list) dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); - if (!shrink_lock_dentry(dentry)) { + if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); @@ -1614,7 +1587,7 @@ void shrink_dcache_parent(struct dentry *parent) d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); - if (!shrink_lock_dentry(data.victim)) { + if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { From 5e7a5c8d17d94216aed205cb0f20cd32adfd4487 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 31 Oct 2023 01:41:22 -0400 Subject: [PATCH 25/40] fold dentry_kill() into dput() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index a7f99d46c41b..5284b02747cd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -756,28 +756,6 @@ void d_mark_dontcache(struct inode *inode) } EXPORT_SYMBOL(d_mark_dontcache); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry *dentry_kill(struct dentry *dentry) - __releases(dentry->d_lock) -{ - - dentry->d_lockref.count--; - rcu_read_lock(); - if (likely(lock_for_kill(dentry))) { - struct dentry *parent = dentry->d_parent; - rcu_read_unlock(); - __dentry_kill(dentry); - return parent != dentry ? parent : NULL; - } - rcu_read_unlock(); - spin_unlock(&dentry->d_lock); - return NULL; -} - /* * Try to do a lockless dput(), and return whether that was successful. * @@ -915,9 +893,18 @@ void dput(struct dentry *dentry) } /* Slow case: now with the dentry lock held */ - rcu_read_unlock(); - dentry->d_lockref.count = 1; - dentry = dentry_kill(dentry); + if (likely(lock_for_kill(dentry))) { + struct dentry *parent = dentry->d_parent; + rcu_read_unlock(); + __dentry_kill(dentry); + if (dentry == parent) + return; + dentry = parent; + } else { + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); + return; + } } } EXPORT_SYMBOL(dput); From c2e5e29f3fdaff6f57795be068ce525c60954fd0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 01:06:06 -0400 Subject: [PATCH 26/40] to_shrink_list(): call only if refcount is 0 The only thing it does if refcount is not zero is d_lru_del(); no point, IMO, seeing that plain dput() does nothing of that sort... Note that 2 of 3 current callers are guaranteed that refcount is 0. Acked-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 5284b02747cd..704676bf06fd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -915,8 +915,7 @@ __must_hold(&dentry->d_lock) if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); - if (!dentry->d_lockref.count) - d_shrink_add(dentry, list); + d_shrink_add(dentry, list); } } @@ -1115,10 +1114,8 @@ EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim, struct list_head *list) { struct dentry *parent = victim->d_parent; - if (parent != victim) { - --parent->d_lockref.count; + if (parent != victim && !--parent->d_lockref.count) to_shrink_list(parent, list); - } __dentry_kill(victim); } From f5c8a8a4b6c90f2160de6b580fa672b4afc15061 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Oct 2023 21:07:34 -0400 Subject: [PATCH 27/40] switch select_collect{,2}() to use of to_shrink_list() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 704676bf06fd..f68fe7c863e0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1495,13 +1495,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - } else { - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - if (!dentry->d_lockref.count) { - d_shrink_add(dentry, &data->dispose); - data->found++; - } + } else if (!dentry->d_lockref.count) { + to_shrink_list(dentry, &data->dispose); + data->found++; } /* * We can return to the caller if we have found some (this @@ -1522,17 +1518,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - if (!dentry->d_lockref.count) { + if (!dentry->d_lockref.count) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } - } else { - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - if (!dentry->d_lockref.count) - d_shrink_add(dentry, &data->dispose); + to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this From b4cc0734d25746d402db3baa6082d19109bca951 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Nov 2023 14:46:14 -0400 Subject: [PATCH 28/40] d_prune_aliases(): use a shrink list Instead of dropping aliases one by one, restarting, etc., just collect them into a shrink list and kill them off in one pass. We don't really need the restarts - one alias can't pin another (directory has only one alias, and couldn't be its own ancestor anyway), so collecting everything that is not busy and taking it out would take care of everything evictable that had been there as we entered the function. And new aliases added while we'd been dropping old ones could just as easily have appeared right as we return to caller... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index f68fe7c863e0..a3cc612a80d5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -647,20 +647,6 @@ again: return parent; } -static inline struct dentry *lock_parent(struct dentry *dentry) -{ - struct dentry *parent = dentry->d_parent; - if (IS_ROOT(dentry)) - return NULL; - if (likely(spin_trylock(&parent->d_lock))) - return parent; - rcu_read_lock(); - spin_unlock(&dentry->d_lock); - parent = __lock_parent(dentry); - rcu_read_unlock(); - return parent; -} - /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former @@ -1090,24 +1076,18 @@ struct dentry *d_find_alias_rcu(struct inode *inode) */ void d_prune_aliases(struct inode *inode) { + LIST_HEAD(dispose); struct dentry *dentry; -restart: + spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) { - struct dentry *parent = lock_parent(dentry); - if (likely(!dentry->d_lockref.count)) { - __dentry_kill(dentry); - dput(parent); - goto restart; - } - if (parent) - spin_unlock(&parent->d_lock); - } + if (!dentry->d_lockref.count) + to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); From 1c18edd1b7a068e07fed7f00e059f22ed67c04c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Nov 2023 16:14:08 -0500 Subject: [PATCH 29/40] __dentry_kill(): new locking scheme Currently we enter __dentry_kill() with parent (along with the victim dentry and victim's inode) held locked. Then we mark dentry refcount as dead call ->d_prune() remove dentry from hash remove it from the parent's list of children unlock the parent, don't need it from that point on detach dentry from inode, unlock dentry and drop the inode (via ->d_iput()) call ->d_release() regain the lock on dentry check if it's on a shrink list (in which case freeing its empty husk has to be left to shrink_dentry_list()) or not (in which case we can free it ourselves). In the former case, mark it as an empty husk, so that shrink_dentry_list() would know it can free the sucker. drop the lock on dentry ... and usually the caller proceeds to drop a reference on the parent, possibly retaking the lock on it. That is painful for a bunch of reasons, starting with the need to take locks out of order, but not limited to that - the parent of positive dentry can change if we drop its ->d_lock, so getting these locks has to be done with care. Moreover, as soon as dentry is out of the parent's list of children, shrink_dcache_for_umount() won't see it anymore, making it appear as if the parent is inexplicably busy. We do work around that by having shrink_dentry_list() decrement the parent's refcount first and put it on shrink list to be evicted once we are done with __dentry_kill() of child, but that may in some cases lead to ->d_iput() on child called after the parent got killed. That doesn't happen in cases where in-tree ->d_iput() instances might want to look at the parent, but that's brittle as hell. Solution: do removal from the parent's list of children in the very end of __dentry_kill(). As the result, the callers do not need to lock the parent and by the time we really need the parent locked, dentry is negative and is guaranteed not to be moved around. It does mean that ->d_prune() will be called with parent not locked. It also means that we might see dentries in process of being torn down while going through the parent's list of children; those dentries will be unhashed, negative and with refcount marked dead. In practice, that's enough for in-tree code that looks through the list of children to do the right thing as-is. Out-of-tree code might need to be adjusted. Calling conventions: __dentry_kill(dentry) is called with dentry->d_lock held, along with ->i_lock of its inode (if any). It either returns the parent (locked, with refcount decremented to 0) or NULL (if there'd been no parent or if refcount decrement for parent hadn't reached 0). lock_for_kill() is adjusted for new requirements - it doesn't touch the parent's ->d_lock at all. Callers adjusted. Note that for dput() we don't need to bother with fast_dput() for the parent - we just need to check retain_dentry() for it, since its ->d_lock is still held since the moment when __dentry_kill() had taken it to remove the victim from the list of children. The kludge with early decrement of parent's refcount in shrink_dentry_list() is no longer needed - shrink_dcache_for_umount() sees the half-killed dentries in the list of children for as long as they are pinning the parent. They are easily recognized and accounted for by select_collect(), so we know we are not done yet. As the result, we always have the expected ordering for ->d_iput()/->d_release() vs. __dentry_kill() of the parent, no exceptions. Moreover, the current rules for shrink lists (one must make sure that shrink_dcache_for_umount() won't happen while any dentries from the superblock in question are on any shrink lists) are gone - shrink_dcache_for_umount() will do the right thing in all cases, taking such dentries out. Their empty husks (memory occupied by struct dentry itself + its external name, if any) will remain on the shrink lists, but they are no obstacles to filesystem shutdown. And such husks will get freed as soon as shrink_dentry_list() of the list they are on gets to them. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 17 ++++ fs/dcache.c | 129 ++++++++++---------------- 2 files changed, 65 insertions(+), 81 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index da8073b592ff..d890ef07a9fd 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1078,3 +1078,20 @@ by compiler. ->d_delete() instances are now called for dentries with ->d_lock held and refcount equal to 0. They are not permitted to drop/regain ->d_lock. None of in-tree instances did anything of that sort. Make sure yours do not... + +-- + +**mandatory** + +->d_prune() instances are now called without ->d_lock held on the parent. +->d_lock on dentry itself is still held; if you need per-parent exclusions (none +of the in-tree instances did), use your own spinlock. + +->d_iput() and ->d_release() are called with victim dentry still in the +list of parent's children. It is still unhashed, marked killed, etc., just not +removed from parent's ->d_children yet. + +Anyone iterating through the list of children needs to be aware of the +half-killed dentries that might be seen there; taking ->d_lock on those will +see them negative, unhashed and with negative refcount, which means that most +of the in-kernel users would've done the right thing anyway without any adjustment. diff --git a/fs/dcache.c b/fs/dcache.c index a3cc612a80d5..c795154ffa3a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -575,12 +575,10 @@ static inline void dentry_unlist(struct dentry *dentry) } } -static void __dentry_kill(struct dentry *dentry) +static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; - if (!IS_ROOT(dentry)) - parent = dentry->d_parent; /* * The dentry is now unrecoverably dead to the world. @@ -600,9 +598,6 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - dentry_unlist(dentry); - if (parent) - spin_unlock(&parent->d_lock); if (dentry->d_inode) dentry_unlink_inode(dentry); else @@ -611,7 +606,14 @@ static void __dentry_kill(struct dentry *dentry) if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - spin_lock(&dentry->d_lock); + cond_resched(); + /* now that it's negative, ->d_parent is stable */ + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) { dentry->d_flags |= DCACHE_MAY_FREE; can_free = false; @@ -619,31 +621,10 @@ static void __dentry_kill(struct dentry *dentry) spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); - cond_resched(); -} - -static struct dentry *__lock_parent(struct dentry *dentry) -{ - struct dentry *parent; -again: - parent = READ_ONCE(dentry->d_parent); - spin_lock(&parent->d_lock); - /* - * We can't blindly lock dentry until we are sure - * that we won't violate the locking order. - * Any changes of dentry->d_parent must have - * been done with parent->d_lock held, so - * spin_lock() above is enough of a barrier - * for checking if it's still our child. - */ - if (unlikely(parent != dentry->d_parent)) { + if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); - goto again; + return NULL; } - if (parent != dentry) - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - else - parent = NULL; return parent; } @@ -655,48 +636,32 @@ again: * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have - * that dentry's inode and parent both locked. + * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - struct dentry *parent = dentry->d_parent; if (unlikely(dentry->d_lockref.count)) return false; - if (inode && unlikely(!spin_trylock(&inode->i_lock))) - goto slow; - if (dentry == parent) - return true; - if (likely(spin_trylock(&parent->d_lock))) + if (!inode || likely(spin_trylock(&inode->i_lock))) return true; - if (inode) - spin_unlock(&inode->i_lock); -slow: - spin_unlock(&dentry->d_lock); - - for (;;) { - if (inode) - spin_lock(&inode->i_lock); - parent = __lock_parent(dentry); + do { + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; - if (inode) - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); inode = dentry->d_inode; - spin_unlock(&dentry->d_lock); - if (parent) - spin_unlock(&parent->d_lock); - } + } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); - if (parent) - spin_unlock(&parent->d_lock); return false; } @@ -869,29 +834,27 @@ locked: */ void dput(struct dentry *dentry) { - while (dentry) { - might_sleep(); - - rcu_read_lock(); - if (likely(fast_dput(dentry))) { - rcu_read_unlock(); + if (!dentry) + return; + might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) return; - } - - /* Slow case: now with the dentry lock held */ - if (likely(lock_for_kill(dentry))) { - struct dentry *parent = dentry->d_parent; - rcu_read_unlock(); - __dentry_kill(dentry); - if (dentry == parent) - return; - dentry = parent; - } else { - rcu_read_unlock(); + if (retain_dentry(dentry)) { spin_unlock(&dentry->d_lock); return; } + rcu_read_lock(); } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); @@ -1091,12 +1054,16 @@ void d_prune_aliases(struct inode *inode) } EXPORT_SYMBOL(d_prune_aliases); -static inline void shrink_kill(struct dentry *victim, struct list_head *list) +static inline void shrink_kill(struct dentry *victim) { - struct dentry *parent = victim->d_parent; - if (parent != victim && !--parent->d_lockref.count) - to_shrink_list(parent, list); - __dentry_kill(victim); + do { + rcu_read_unlock(); + victim = __dentry_kill(victim); + rcu_read_lock(); + } while (victim && lock_for_kill(victim)); + rcu_read_unlock(); + if (victim) + spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) @@ -1117,9 +1084,8 @@ void shrink_dentry_list(struct list_head *list) dentry_free(dentry); continue; } - rcu_read_unlock(); d_shrink_del(dentry); - shrink_kill(dentry, list); + shrink_kill(dentry); } } @@ -1478,6 +1444,8 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; + } else if (dentry->d_lockref.count < 0) { + data->found++; } /* * We can return to the caller if we have found some (this @@ -1547,8 +1515,7 @@ void shrink_dcache_parent(struct dentry *parent) spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { - rcu_read_unlock(); - shrink_kill(data.victim, &data.dispose); + shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) From 6367b491c17a34b28aece294bddfda1a36ec0377 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Nov 2023 17:33:21 -0500 Subject: [PATCH 30/40] retain_dentry(): introduce a trimmed-down lockless variant fast_dput() contains a small piece of code, preceded by scary comments about 5 times longer than it. What is actually done there is a trimmed-down subset of retain_dentry() - in some situations we can tell that retain_dentry() would have returned true without ever needing ->d_lock and that's what that code checks. If these checks come true fast_dput() can declare that we are done, without bothering with ->d_lock; otherwise it has to take the lock and do full variant of retain_dentry() checks. Trimmed-down variant of the checks is hard to follow and it's asking for trouble - if we ever decide to change the rules in retain_dentry(), we'll have to remember to update that code. It turns out that an equivalent variant of these checks more obviously parallel to retain_dentry() is not just possible, but easy to unify with retain_dentry() itself, passing it a new boolean argument ('locked') to distinguish between the full semantics and trimmed down one. Note that in lockless case true is returned only when locked variant would have returned true without ever needing the lock; false means "punt to the locking path and recheck there". Signed-off-by: Al Viro --- fs/dcache.c | 95 ++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index c795154ffa3a..b212a65ed190 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -665,30 +665,57 @@ static bool lock_for_kill(struct dentry *dentry) return false; } -static inline bool retain_dentry(struct dentry *dentry) +/* + * Decide if dentry is worth retaining. Usually this is called with dentry + * locked; if not locked, we are more limited and might not be able to tell + * without a lock. False in this case means "punt to locked path and recheck". + * + * In case we aren't locked, these predicates are not "stable". However, it is + * sufficient that at some point after we dropped the reference the dentry was + * hashed and the flags had the proper value. Other dentry users may have + * re-gotten a reference to the dentry and change that, but our work is done - + * we can leave the dentry around with a zero refcount. + */ +static inline bool retain_dentry(struct dentry *dentry, bool locked) { - WARN_ON(d_in_lookup(dentry)); + unsigned int d_flags; - /* Unreachable? Get rid of it */ + smp_rmb(); + d_flags = READ_ONCE(dentry->d_flags); + + // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + // Same if it's disconnected + if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { - if (dentry->d_op->d_delete(dentry)) + // ->d_delete() might tell us not to bother, but that requires + // ->d_lock; can't decide without it + if (unlikely(d_flags & DCACHE_OP_DELETE)) { + if (!locked || dentry->d_op->d_delete(dentry)) return false; } - if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + // Explicitly told not to bother + if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; - /* retain; LRU fodder */ - if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + // At this point it looks like we ought to keep it. We also might + // need to do something - put it on LRU if it wasn't there already + // and mark it referenced if it was on LRU, but not marked yet. + // Unfortunately, both actions require ->d_lock, so in lockless + // case we'd have to punt rather than doing those. + if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { + if (!locked) + return false; d_lru_add(dentry); - else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { + if (!locked) + return false; dentry->d_flags |= DCACHE_REFERENCED; + } return true; } @@ -720,7 +747,6 @@ EXPORT_SYMBOL(d_mark_dontcache); static inline bool fast_dput(struct dentry *dentry) { int ret; - unsigned int d_flags; /* * try to decrement the lockref optimistically. @@ -749,45 +775,18 @@ static inline bool fast_dput(struct dentry *dentry) return true; /* - * Careful, careful. The reference count went down - * to zero, but we don't hold the dentry lock, so - * somebody else could get it again, and do another - * dput(), and we need to not race with that. - * - * However, there is a very special and common case - * where we don't care, because there is nothing to - * do: the dentry is still hashed, it does not have - * a 'delete' op, and it's referenced and already on - * the LRU list. - * - * NOTE! Since we aren't locked, these values are - * not "stable". However, it is sufficient that at - * some point after we dropped the reference the - * dentry was hashed and the flags had the proper - * value. Other dentry users may have re-gotten - * a reference to the dentry and change that, but - * our work is done - we can leave the dentry - * around with a zero refcount. - * - * Nevertheless, there are two cases that we should kill - * the dentry anyway. - * 1. free disconnected dentries as soon as their refcount - * reached zero. - * 2. free dentries if they should not be cached. + * Can we decide that decrement of refcount is all we needed without + * taking the lock? There's a very common case when it's all we need - + * dentry looks like it ought to be retained and there's nothing else + * to do. */ - smp_rmb(); - d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE | - DCACHE_DISCONNECTED | DCACHE_DONTCACHE; - - /* Nothing to do? Dropping the reference was all we needed? */ - if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + if (retain_dentry(dentry, false)) return true; /* - * Not the fast normal case? Get the lock. We've already decremented - * the refcount, but we'll need to re-check the situation after - * getting the lock. + * Either not worth retaining or we can't tell without the lock. + * Get the lock, then. We've already decremented the refcount to 0, + * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); @@ -798,7 +797,7 @@ static inline bool fast_dput(struct dentry *dentry) * don't need to do anything else. */ locked: - if (dentry->d_lockref.count || retain_dentry(dentry)) { + if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } @@ -847,7 +846,7 @@ void dput(struct dentry *dentry) dentry = __dentry_kill(dentry); if (!dentry) return; - if (retain_dentry(dentry)) { + if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } From f2824db1b49f947ba6e208ddf02edf4b1391480a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Nov 2023 16:42:43 -0500 Subject: [PATCH 31/40] kill d_instantate_anon(), fold __d_instantiate_anon() into remaining caller now that the only user of d_instantiate_anon() is gone... [braino fix folded - kudos to Dan Carpenter] Signed-off-by: Al Viro --- fs/dcache.c | 92 +++++++++++++++++------------------------- include/linux/dcache.h | 1 - 2 files changed, 36 insertions(+), 57 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9f5b2b5c1e6d..6c520eda131f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2054,75 +2054,55 @@ struct dentry *d_make_root(struct inode *root_inode) } EXPORT_SYMBOL(d_make_root); -static struct dentry *__d_instantiate_anon(struct dentry *dentry, - struct inode *inode, - bool disconnected) -{ - struct dentry *res; - unsigned add_flags; - - security_d_instantiate(dentry, inode); - spin_lock(&inode->i_lock); - res = __d_find_any_alias(inode); - if (res) { - spin_unlock(&inode->i_lock); - dput(dentry); - goto out_iput; - } - - /* attach a disconnected dentry */ - add_flags = d_flags_for_inode(inode); - - if (disconnected) - add_flags |= DCACHE_DISCONNECTED; - - spin_lock(&dentry->d_lock); - __d_set_inode_and_type(dentry, inode, add_flags); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); - if (!disconnected) { - hlist_bl_lock(&dentry->d_sb->s_roots); - hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots); - hlist_bl_unlock(&dentry->d_sb->s_roots); - } - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - - return dentry; - - out_iput: - iput(inode); - return res; -} - -struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode) -{ - return __d_instantiate_anon(dentry, inode, true); -} -EXPORT_SYMBOL(d_instantiate_anon); - static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { - struct dentry *tmp; - struct dentry *res; + struct super_block *sb; + struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_any_alias(inode); - if (res) - goto out_iput; + sb = inode->i_sb; - tmp = d_alloc_anon(inode->i_sb); - if (!tmp) { + res = d_find_any_alias(inode); /* existing alias? */ + if (res) + goto out; + + new = d_alloc_anon(sb); + if (!new) { res = ERR_PTR(-ENOMEM); - goto out_iput; + goto out; } - return __d_instantiate_anon(tmp, inode, disconnected); + security_d_instantiate(new, inode); + spin_lock(&inode->i_lock); + res = __d_find_any_alias(inode); /* recheck under lock */ + if (likely(!res)) { /* still no alias, attach a disconnected dentry */ + unsigned add_flags = d_flags_for_inode(inode); -out_iput: + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; + + spin_lock(&new->d_lock); + __d_set_inode_and_type(new, inode, add_flags); + hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); + if (!disconnected) { + hlist_bl_lock(&sb->s_roots); + hlist_bl_add_head(&new->d_hash, &sb->s_roots); + hlist_bl_unlock(&sb->s_roots); + } + spin_unlock(&new->d_lock); + spin_unlock(&inode->i_lock); + inode = NULL; /* consumed by new->d_inode */ + res = new; + } else { + spin_unlock(&inode->i_lock); + dput(new); + } + + out: iput(inode); return res; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index fa0414cff85c..8c5e3bdf1147 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -218,7 +218,6 @@ extern seqlock_t rename_lock; */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); -extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); From 9024b4c96576162a13e58003b0d58e93ebe3ac33 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Nov 2023 00:04:46 -0500 Subject: [PATCH 32/40] d_alloc_pseudo(): move setting ->d_op there from the (sole) caller Signed-off-by: Al Viro --- fs/dcache.c | 8 +++++++- fs/file_table.c | 5 ----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 6c520eda131f..5947556b6e90 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1890,9 +1890,15 @@ struct dentry *d_alloc_cursor(struct dentry * parent) */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { + static const struct dentry_operations anon_ops = { + .d_dname = simple_dname + }; struct dentry *dentry = __d_alloc(sb, name); - if (likely(dentry)) + if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; + if (!sb->s_d_op) + d_set_d_op(dentry, &anon_ops); + } return dentry; } diff --git a/fs/file_table.c b/fs/file_table.c index de4a2915bfd4..8889cbee13f8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -329,9 +329,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { - static const struct dentry_operations anon_ops = { - .d_dname = simple_dname - }; struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; @@ -339,8 +336,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); - if (!mnt->mnt_sb->s_d_op) - d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); From fb7945b484b8a3c1e204c36eb9e4cf99f32141ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Nov 2023 16:50:29 -0500 Subject: [PATCH 33/40] nsfs: use d_make_root() Normally d_make_root() is used to create the root dentry of superblock; here we use it for a different purpose, but... idiomatic or not, we need the same operation. Signed-off-by: Al Viro --- fs/nsfs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 9a4b228d42fa..34e1e3e36733 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -90,12 +90,9 @@ slow: inode->i_fop = &ns_file_operations; inode->i_private = ns; - dentry = d_alloc_anon(mnt->mnt_sb); - if (!dentry) { - iput(inode); + dentry = d_make_root(inode); /* not the normal use, but... */ + if (!dentry) return -ENOMEM; - } - d_instantiate(dentry, inode); dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { From 715cd66aabf96b6cf423590954326799312c6dfa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Nov 2023 15:56:55 -0500 Subject: [PATCH 34/40] simple_fill_super(): don't bother with d_genocide() on failure Failing ->fill_super() will be followed by ->kill_sb(), which should include kill_litter_super() if the call of simple_fill_super() had been asked to create anything besides the root dentry. So there's no need to empty the partially populated tree - it will be trimmed by inevitable kill_litter_super(). Signed-off-by: Al Viro --- fs/libfs.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index e9440d55073c..6fa8ad36049f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -912,7 +912,6 @@ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; - struct dentry *root; struct dentry *dentry; int i; @@ -935,8 +934,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - root = d_make_root(inode); - if (!root) + s->s_root = d_make_root(inode); + if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) @@ -948,13 +947,13 @@ int simple_fill_super(struct super_block *s, unsigned long magic, "with an index of 1!\n", __func__, s->s_type->name); - dentry = d_alloc_name(root, files->name); + dentry = d_alloc_name(s->s_root, files->name); if (!dentry) - goto out; + return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); - goto out; + return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); @@ -962,13 +961,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_ino = i; d_add(dentry, inode); } - s->s_root = root; return 0; -out: - d_genocide(root); - shrink_dcache_parent(root); - dput(root); - return -ENOMEM; } EXPORT_SYMBOL(simple_fill_super); From 8a54b38f3e5ced6cc4b246b8e54bd0f50deceaa8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Nov 2023 16:01:27 -0500 Subject: [PATCH 35/40] d_genocide(): move the extern into fs/internal.h Signed-off-by: Al Viro --- fs/internal.h | 1 + include/linux/dcache.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 9e9fc629f935..d9a920e2636e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -219,6 +219,7 @@ extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); +extern void d_genocide(struct dentry *); /* * pipe.c diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8c5e3bdf1147..b4324d47f249 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -243,9 +243,6 @@ extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); -/* - the ramfs-type tree */ -extern void d_genocide(struct dentry *); - extern void d_mark_tmpfile(struct file *, struct inode *); extern void d_tmpfile(struct file *, struct inode *); From 57851607326a2beef21e67f83f4f53a90df8445a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Nov 2023 21:38:48 -0500 Subject: [PATCH 36/40] get rid of DCACHE_GENOCIDE ... now that we never call d_genocide() other than from kill_litter_super() Signed-off-by: Al Viro --- fs/dcache.c | 5 +---- include/linux/dcache.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 5947556b6e90..8473c8f0ce22 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3198,10 +3198,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } + dentry->d_lockref.count--; } return D_WALK_CONTINUE; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b4324d47f249..981f529c6cb5 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -173,7 +173,6 @@ struct dentry_operations { #define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT BIT(8) -#define DCACHE_GENOCIDE BIT(9) #define DCACHE_SHRINK_LIST BIT(10) #define DCACHE_OP_WEAK_REVALIDATE BIT(11) From f9f677c5f723394170fa838b856602476150948d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Nov 2023 00:38:02 -0500 Subject: [PATCH 37/40] d_alloc_parallel(): in-lookup hash insertion doesn't need an RCU variant We only search in the damn thing under hlist_bl_lock(); RCU variant of insertion was, IIRC, pretty much cargo-culted - mea culpa... Signed-off-by: Al Viro --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index 8473c8f0ce22..82d086185703 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2692,7 +2692,7 @@ retry: /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; - hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); + hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: From ef69f0506d8f3a250ac5baa96746e17ae22c67b5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Nov 2023 18:11:00 -0500 Subject: [PATCH 38/40] __d_unalias() doesn't use inode argument ... and hasn't since 2015. Signed-off-by: Al Viro --- fs/dcache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 82d086185703..74b2d073ebbf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3045,8 +3045,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ -static int __d_unalias(struct inode *inode, - struct dentry *dentry, struct dentry *alias) +static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; @@ -3127,7 +3126,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); - int err = __d_unalias(inode, dentry, new); + int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); From 1b327b5ac57cf83e3d015de45d0142852f475375 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 14:07:43 -0500 Subject: [PATCH 39/40] kill DCACHE_MAY_FREE With the new ordering in __dentry_kill() it has become redundant - it's set if and only if both DCACHE_DENTRY_KILLED and DCACHE_SHRINK_LIST are set. We set it in __dentry_kill(), after having set DCACHE_DENTRY_KILLED with the only condition being that DCACHE_SHRINK_LIST is there; all of that is done without dropping ->d_lock and the only place that checks that flag (shrink_dentry_list()) does so under ->d_lock, after having found the victim on its shrink list. Since DCACHE_SHRINK_LIST is set only when placing dentry into shrink list and removed only by shrink_dentry_list() itself, a check for DCACHE_DENTRY_KILLED in there would be equivalent to check for DCACHE_MAY_FREE. Signed-off-by: Al Viro --- fs/dcache.c | 6 ++---- include/linux/dcache.h | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 475ef1edba03..523cbf0780f0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -614,10 +614,8 @@ static struct dentry *__dentry_kill(struct dentry *dentry) } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - dentry->d_flags |= DCACHE_MAY_FREE; + if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; - } spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); @@ -1072,7 +1070,7 @@ void shrink_dentry_list(struct list_head *list) bool can_free; rcu_read_unlock(); d_shrink_del(dentry); - can_free = dentry->d_flags & DCACHE_MAY_FREE; + can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b4449a1a47ff..48b393545ec2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -202,7 +202,6 @@ struct dentry_operations { #define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ #define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ -#define DCACHE_MAY_FREE BIT(23) #define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ #define DCACHE_OP_REAL BIT(26) From 1b6ae9f6e6c3e3c35aad0f11b116a81780b8aa03 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 6 Nov 2023 14:44:17 +0100 Subject: [PATCH 40/40] dcache: remove unnecessary NULL check in dget_dlock() dget_dlock() requires dentry->d_lock to be held when called, yet contains a NULL check for dentry. An audit of all calls to dget_dlock() shows that it is never called with a NULL pointer (as spin_lock()/spin_unlock() would crash in these cases): $ git grep -W '\' arch/powerpc/platforms/cell/spufs/inode.c- spin_lock(&dentry->d_lock); arch/powerpc/platforms/cell/spufs/inode.c- if (simple_positive(dentry)) { arch/powerpc/platforms/cell/spufs/inode.c: dget_dlock(dentry); fs/autofs/expire.c- spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); fs/autofs/expire.c- if (simple_positive(child)) { fs/autofs/expire.c: dget_dlock(child); fs/autofs/root.c: dget_dlock(active); fs/autofs/root.c- spin_unlock(&active->d_lock); fs/autofs/root.c: dget_dlock(expiring); fs/autofs/root.c- spin_unlock(&expiring->d_lock); fs/ceph/dir.c- if (!spin_trylock(&dentry->d_lock)) fs/ceph/dir.c- continue; [...] fs/ceph/dir.c: dget_dlock(dentry); fs/ceph/mds_client.c- spin_lock(&alias->d_lock); [...] fs/ceph/mds_client.c: dn = dget_dlock(alias); fs/configfs/inode.c- spin_lock(&dentry->d_lock); fs/configfs/inode.c- if (simple_positive(dentry)) { fs/configfs/inode.c: dget_dlock(dentry); fs/libfs.c: found = dget_dlock(d); fs/libfs.c- spin_unlock(&d->d_lock); fs/libfs.c: found = dget_dlock(child); fs/libfs.c- spin_unlock(&child->d_lock); fs/libfs.c: child = dget_dlock(d); fs/libfs.c- spin_unlock(&d->d_lock); fs/ocfs2/dcache.c: dget_dlock(dentry); fs/ocfs2/dcache.c- spin_unlock(&dentry->d_lock); include/linux/dcache.h:static inline struct dentry *dget_dlock(struct dentry *dentry) After taking out the NULL check, dget_dlock() becomes almost identical to __dget_dlock(); the only difference is that dget_dlock() returns the dentry that was passed in. These are static inline helpers, so we can rely on the compiler to discard unused return values. We can therefore also remove __dget_dlock() and replace calls to it by dget_dlock(). Also fix up and improve the kerneldoc comments while we're at it. Al Viro pointed out that we can also clean up some of the callers to make use of the returned value and provided a bit more info for the kerneldoc. While preparing v2 I also noticed that the tabs used in the kerneldoc comments were causing the kerneldoc to get parsed incorrectly so I also fixed this up (including for d_unhashed, which is otherwise unrelated). Testing: x86 defconfig build + boot; make htmldocs for the kerneldoc warning. objdump shows there are code generation changes. Link: https://lore.kernel.org/all/20231022164520.915013-1-vegard.nossum@oracle.com/ Cc: Alexander Viro Cc: Christian Brauner Cc: linux-fsdevel@vger.kernel.org Cc: Nick Piggin Cc: Waiman Long Cc: linux-doc@vger.kernel.org Signed-off-by: Vegard Nossum Signed-off-by: Al Viro --- fs/dcache.c | 16 ++++------------ include/linux/dcache.h | 41 ++++++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 523cbf0780f0..bb18ea91d717 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -877,12 +877,6 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) spin_unlock(&dentry->d_lock); } -/* This must be called with d_lock held */ -static inline void __dget_dlock(struct dentry *dentry) -{ - dentry->d_lockref.count++; -} - struct dentry *dget_parent(struct dentry *dentry) { int gotref; @@ -964,7 +958,7 @@ static struct dentry *__d_find_alias(struct inode *inode) hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { - __dget_dlock(alias); + dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } @@ -1569,8 +1563,7 @@ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { - __dget_dlock(dentry); - *victim = dentry; + *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; @@ -1715,8 +1708,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) * don't need child lock because it is not subject * to concurrency here */ - __dget_dlock(parent); - dentry->d_parent = parent; + dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); @@ -2683,7 +2675,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) spin_unlock(&alias->d_lock); alias = NULL; } else { - __dget_dlock(alias); + dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 48b393545ec2..1666c387861f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -287,20 +287,40 @@ extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** - * dget, dget_dlock - get a reference to a dentry - * @dentry: dentry to get a reference to + * dget_dlock - get a reference to a dentry + * @dentry: dentry to get a reference to * - * Given a dentry or %NULL pointer increment the reference count - * if appropriate and return the dentry. A dentry will not be - * destroyed when it has references. + * Given a live dentry, increment the reference count and return the dentry. + * Caller must hold @dentry->d_lock. Making sure that dentry is alive is + * caller's resonsibility. There are many conditions sufficient to guarantee + * that; e.g. anything with non-negative refcount is alive, so's anything + * hashed, anything positive, anyone's parent, etc. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { - if (dentry) - dentry->d_lockref.count++; + dentry->d_lockref.count++; return dentry; } + +/** + * dget - get a reference to a dentry + * @dentry: dentry to get a reference to + * + * Given a dentry or %NULL pointer increment the reference count + * if appropriate and return the dentry. A dentry will not be + * destroyed when it has references. Conversely, a dentry with + * no references can disappear for any number of reasons, starting + * with memory pressure. In other words, that primitive is + * used to clone an existing reference; using it on something with + * zero refcount is a bug. + * + * NOTE: it will spin if @dentry->d_lock is held. From the deadlock + * avoidance point of view it is equivalent to spin_lock()/increment + * refcount/spin_unlock(), so calling it under @dentry->d_lock is + * always a bug; so's calling it under ->d_lock on any of its descendents. + * + */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) @@ -311,12 +331,11 @@ static inline struct dentry *dget(struct dentry *dentry) extern struct dentry *dget_parent(struct dentry *dentry); /** - * d_unhashed - is dentry hashed - * @dentry: entry to check + * d_unhashed - is dentry hashed + * @dentry: entry to check * - * Returns true if the dentry passed is not currently hashed. + * Returns true if the dentry passed is not currently hashed. */ - static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash);