From e5c832d5558826cc6e9a24746cfdec8e7780063a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Sep 2013 18:13:49 -0700 Subject: [PATCH] vfs: fix dentry RCU to refcounting possibly sleeping dput() This is the fix that the last two commits indirectly led up to - making sure that we don't call dput() in a bad context on the dentries we've looked up in RCU mode after the sequence count validation fails. This basically expands d_rcu_to_refcount() into the callers, and then fixes the callers to delay the dput() in the failure case until _after_ we've dropped all locks and are no longer in an RCU-locked region. The case of 'complete_walk()' was trivial, since its failure case did the unlock_rcu_walk() directly after the call to d_rcu_to_refcount(), and as such that is just a pure expansion of the function with a trivial movement of the resulting dput() to after 'unlock_rcu_walk()'. In contrast, the unlazy_walk() case was much more complicated, because not only does convert two different dentries from RCU to be reference counted, but it used to not call unlock_rcu_walk() at all, and instead just returned an error and let the caller clean everything up in "terminate_walk()". Happily, one of the dentries in question (called "parent" inside unlazy_walk()) is the dentry of "nd->path", which terminate_walk() wants a refcount to anyway for the non-RCU case. So what the new and improved unlazy_walk() does is to first turn that dentry into a refcounted one, and once that is set up, the error cases can continue to use the terminate_walk() helper for cleanup, but for the non-RCU case. Which makes it possible to drop out of RCU mode if we actually hit the sequence number failure case. Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/namei.c | 102 +++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index cc4bcfaa8624..56e4f4d537d0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -494,37 +494,6 @@ static inline void unlock_rcu_walk(void) br_read_unlock(&vfsmount_lock); } -/* - * When we move over from the RCU domain to properly refcounted - * long-lived dentries, we need to check the sequence numbers - * we got before lookup very carefully. - * - * We cannot blindly increment a dentry refcount - even if it - * is not locked - if it is zero, because it may have gone - * through the final d_kill() logic already. - * - * So for a zero refcount, we need to get the spinlock (which is - * safe even for a dead dentry because the de-allocation is - * RCU-delayed), and check the sequence count under the lock. - * - * Once we have checked the sequence count, we know it is live, - * and since we hold the spinlock it cannot die from under us. - * - * In contrast, if the reference count wasn't zero, we can just - * increment the lockref without having to take the spinlock. - * Even if the sequence number ends up being stale, we haven't - * gone through the final dput() and killed the dentry yet. - */ -static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq) -{ - if (likely(lockref_get_not_dead(&dentry->d_lockref))) { - if (!read_seqcount_retry(validate, seq)) - return 0; - dput(dentry); - } - return -ECHILD; -} - /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -539,16 +508,29 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; - int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - want_root = 1; - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || - nd->root.dentry != fs->root.dentry) - goto err_root; - } + + /* + * Get a reference to the parent first: we're + * going to make "path_put(nd->path)" valid in + * non-RCU context for "terminate_walk()". + * + * If this doesn't work, return immediately with + * RCU walking still active (and then we will do + * the RCU walk cleanup in terminate_walk()). + */ + if (!lockref_get_not_dead(&parent->d_lockref)) + return -ECHILD; + + /* + * After the mntget(), we terminate_walk() will do + * the right thing for non-RCU mode, and all our + * subsequent exit cases should unlock_rcu_walk() + * before returning. + */ + mntget(nd->path.mnt); + nd->flags &= ~LOOKUP_RCU; /* * For a negative lookup, the lookup sequence point is the parents @@ -562,30 +544,39 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) * be valid if the child sequence number is still valid. */ if (!dentry) { - if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0) - goto err_root; + if (read_seqcount_retry(&parent->d_seq, nd->seq)) + goto out; BUG_ON(nd->inode != parent->d_inode); } else { - if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) - goto err_root; - if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0) - goto err_parent; + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto out; + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) + goto drop_dentry; } - if (want_root) { + + /* + * Sequence counts matched. Now make sure that the root is + * still valid and get it if required. + */ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) + goto unlock_and_drop_dentry; path_get(&nd->root); spin_unlock(&fs->lock); } - mntget(nd->path.mnt); unlock_rcu_walk(); - nd->flags &= ~LOOKUP_RCU; return 0; -err_parent: +unlock_and_drop_dentry: + spin_unlock(&fs->lock); +drop_dentry: + unlock_rcu_walk(); dput(dentry); -err_root: - if (want_root) - spin_unlock(&fs->lock); + return -ECHILD; +out: + unlock_rcu_walk(); return -ECHILD; } @@ -614,10 +605,15 @@ static int complete_walk(struct nameidata *nd) if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) { + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { unlock_rcu_walk(); return -ECHILD; } + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { + unlock_rcu_walk(); + dput(dentry); + return -ECHILD; + } mntget(nd->path.mnt); unlock_rcu_walk(); }