diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index bcf84459917f..9e38e4c221ca 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -62,7 +62,7 @@ the fdtable structure - be held. 4. To look up the file structure given an fd, a reader - must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These + must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These take care of barrier requirements due to lock-free lookup. An example:: @@ -70,43 +70,22 @@ the fdtable structure - struct file *file; rcu_read_lock(); - file = lookup_fd_rcu(fd); + file = lookup_fdget_rcu(fd); + rcu_read_unlock(); if (file) { ... + fput(file); } .... - rcu_read_unlock(); -5. Handling of the file structures is special. Since the look-up - of the fd (fget()/fget_light()) are lock-free, it is possible - that look-up may race with the last put() operation on the - file structure. This is avoided using atomic_long_inc_not_zero() - on ->f_count:: - - rcu_read_lock(); - file = files_lookup_fd_rcu(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - .... - return file; - - atomic_long_inc_not_zero() detects if refcounts is already zero or - goes to zero during increment. If it does, we fail - fget()/fget_light(). - -6. Since both fdtable and file structures can be looked up +5. Since both fdtable and file structures can be looked up lock-free, they must be installed using rcu_assign_pointer() API. If they are looked up lock-free, rcu_dereference() must be used. However it is advisable to use files_fdtable() - and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues. + and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these + issues. -7. While updating, the fdtable pointer must be looked up while +6. While updating, the fdtable pointer must be looked up while holding files->file_lock. If ->file_lock is dropped, then another thread expand the files thereby creating a new fdtable and making the earlier fdtable pointer stale. @@ -126,3 +105,19 @@ the fdtable structure - Since locate_fd() can drop ->file_lock (and reacquire ->file_lock), the fdtable pointer (fdt) must be loaded after locate_fd(). +On newer kernels rcu based file lookup has been switched to rely on +SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore +to just acquire a reference to the file in question under rcu using +atomic_long_inc_not_zero() since the file might have already been +recycled and someone else might have bumped the reference. In other +words, callers might see reference count bumps from newer users. For +this is reason it is necessary to verify that the pointer is the same +before and after the reference count increment. This pattern can be seen +in get_file_rcu() and __files_get_rcu(). + +In addition, it isn't possible to access or check fields in struct file +without first aqcuiring a reference on it under rcu lookup. Not doing +that was always very dodgy and it was only usable for non-pointer data +in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers +either first acquire a reference or they must hold the files_lock of the +fdtable. diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index d5b3ed2c58f5..c380d8c30704 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address) */ if (vma) { char buf[ARC_PATH_MAX]; - char *nm = "?"; + char *nm = "anon"; if (vma->vm_file) { - nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1); + /* XXX: can we use %pD below and get rid of buf? */ + nm = d_path(file_user_path(vma->vm_file), buf, + ARC_PATH_MAX-1); if (IS_ERR(nm)) nm = "?"; } diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 1a587618015c..18daafbe2e65 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd) */ static struct spu_context *coredump_next_context(int *fd) { - struct spu_context *ctx; + struct spu_context *ctx = NULL; struct file *file; int n = iterate_fd(current->files, *fd, match_context, NULL); if (!n) @@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd) *fd = n - 1; rcu_read_lock(); - file = lookup_fd_rcu(*fd); - ctx = SPUFS_I(file_inode(file))->i_ctx; - get_spu_context(ctx); + file = lookup_fdget_rcu(*fd); rcu_read_unlock(); + if (file) { + ctx = SPUFS_I(file_inode(file))->i_ctx; + get_spu_context(ctx); + fput(file); + } return ctx; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 310654542b42..a2195e28b625 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -916,11 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915) { struct file *file; - rcu_read_lock(); - file = READ_ONCE(i915->gem.mmap_singleton); - if (file && !get_file_rcu(file)) - file = NULL; - rcu_read_unlock(); + file = get_file_active(&i915->gem.mmap_singleton); if (file) return file; diff --git a/fs/char_dev.c b/fs/char_dev.c index 950b6919fb87..6ba032442b39 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p) struct module *owner = p->owner; struct kobject *kobj; - if (owner && !try_module_get(owner)) + if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) diff --git a/fs/file.c b/fs/file.c index 3e4a4dfa38fc..5fb0b146e79e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file) struct files_struct *files = current->files; struct fdtable *fdt; + if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) + return; + rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { @@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static struct file *__get_file_rcu(struct file __rcu **f) +{ + struct file __rcu *file; + struct file __rcu *file_reloaded; + struct file __rcu *file_reloaded_cmp; + + file = rcu_dereference_raw(*f); + if (!file) + return NULL; + + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + return ERR_PTR(-EAGAIN); + + file_reloaded = rcu_dereference_raw(*f); + + /* + * Ensure that all accesses have a dependency on the load from + * rcu_dereference_raw() above so we get correct ordering + * between reuse/allocation and the pointer check below. + */ + file_reloaded_cmp = file_reloaded; + OPTIMIZER_HIDE_VAR(file_reloaded_cmp); + + /* + * atomic_long_inc_not_zero() above provided a full memory + * barrier when we acquired a reference. + * + * This is paired with the write barrier from assigning to the + * __rcu protected file pointer so that if that pointer still + * matches the current file, we know we have successfully + * acquired a reference to the right file. + * + * If the pointers don't match the file has been reallocated by + * SLAB_TYPESAFE_BY_RCU. + */ + if (file == file_reloaded_cmp) + return file_reloaded; + + fput(file); + return ERR_PTR(-EAGAIN); +} + +/** + * get_file_rcu - try go get a reference to a file under rcu + * @f: the file to get a reference on + * + * This function tries to get a reference on @f carefully verifying that + * @f hasn't been reused. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_rcu(struct file __rcu **f) +{ + for (;;) { + struct file __rcu *file; + + file = __get_file_rcu(f); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) + continue; + + return file; + } +} +EXPORT_SYMBOL_GPL(get_file_rcu); + +/** + * get_file_active - try go get a reference to a file + * @f: the file to get a reference on + * + * In contast to get_file_rcu() the pointer itself isn't part of the + * reference counting. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_active(struct file **f) +{ + struct file __rcu *file; + + rcu_read_lock(); + file = __get_file_rcu(f); + rcu_read_unlock(); + if (IS_ERR(file)) + file = NULL; + return file; +} +EXPORT_SYMBOL_GPL(get_file_active); + static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); - file = rcu_dereference_raw(*fdentry); - if (unlikely(!file)) - return NULL; - - if (unlikely(file->f_mode & mask)) - return NULL; /* * Ok, we have a file pointer. However, because we do @@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * Such a race can take two forms: * - * (a) the file ref already went down to zero, - * and get_file_rcu() fails. Just try again: + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. */ - if (unlikely(!get_file_rcu(file))) + file = __get_file_rcu(fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) continue; /* @@ -893,12 +991,20 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || - unlikely(rcu_dereference_raw(*fdentry) != file)) { + if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } + /* + * This isn't the file we're looking for or we're not + * allowed to get a reference to it. + */ + if (unlikely(file->f_mode & mask)) { + fput(file); + return NULL; + } + /* * Ok, we have a ref to the file, and checked that it * still exists. @@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) +struct file *lookup_fdget_rcu(unsigned int fd) +{ + return __fget_files_rcu(current->files, fd, 0); + +} +EXPORT_SYMBOL_GPL(lookup_fdget_rcu); + +struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) task_lock(task); files = task->files; if (files) - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); task_unlock(task); return file; } -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); if (file) break; } @@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fd_rcu); +EXPORT_SYMBOL(task_lookup_next_fdget_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + struct file *f; int retval = oldfd; rcu_read_lock(); - if (!files_lookup_fd_rcu(files, oldfd)) + f = __fget_files_rcu(files, oldfd, 0); + if (!f) retval = -EBADF; rcu_read_unlock(); + if (f) + fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); diff --git a/fs/file_table.c b/fs/file_table.c index ee21b3da9d08..fa92743ba6a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* Container for backing file with optional real path */ +/* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path real_path; + struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) @@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f) return container_of(f, struct backing_file, file); } -struct path *backing_file_real_path(struct file *f) +struct path *backing_file_user_path(struct file *f) { - return &backing_file(f)->real_path; -} -EXPORT_SYMBOL_GPL(backing_file_real_path); - -static void file_free_rcu(struct rcu_head *head) -{ - struct file *f = container_of(head, struct file, f_rcuhead); - - put_cred(f->f_cred); - if (unlikely(f->f_mode & FMODE_BACKING)) - kfree(backing_file(f)); - else - kmem_cache_free(filp_cachep, f); + return &backing_file(f)->user_path; } +EXPORT_SYMBOL_GPL(backing_file_user_path); static inline void file_free(struct file *f) { security_file_free(f); - if (unlikely(f->f_mode & FMODE_BACKING)) - path_put(backing_file_real_path(f)); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); - call_rcu(&f->f_rcuhead, file_free_rcu); + put_cred(f->f_cred); + if (unlikely(f->f_mode & FMODE_BACKING)) { + path_put(backing_file_user_path(f)); + kfree(backing_file(f)); + } else { + kmem_cache_free(filp_cachep, f); + } +} + +void release_empty_file(struct file *f) +{ + WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); + if (atomic_long_dec_and_test(&f->f_count)) { + security_file_free(f); + put_cred(f->f_cred); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + percpu_counter_dec(&nr_files); + kmem_cache_free(filp_cachep, f); + } } /* @@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); @@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ + /* + * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While + * fget-rcu pattern users need to be able to handle spurious + * refcount bumps we should reinitialize the reused file first. + */ + atomic_long_set(&f->f_count, 1); return 0; } @@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | + SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c1af01b2c42d..1767493dffda 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -613,6 +613,24 @@ out_free: kfree(isw); } +static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, + struct list_head *list, int *nr) +{ + struct inode *inode; + + list_for_each_entry(inode, list, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[*nr] = inode; + (*nr)++; + + if (*nr >= WB_MAX_INODES_PER_ISW - 1) + return true; + } + return false; +} + /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb @@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; - struct inode *inode; int nr; bool restart = false; @@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) nr = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_attached, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) - continue; - - isw->inodes[nr++] = inode; - - if (nr >= WB_MAX_INODES_PER_ISW - 1) { - restart = true; - break; - } - } + /* + * In addition to the inodes that have completed writeback, also switch + * cgwbs for those inodes only with dirty timestamps. Otherwise, those + * inodes won't be written back for a long time when lazytime is + * enabled, and thus pinning the dying cgwbs. It won't break the + * bandwidth restrictions, as writeback of inode metadata is not + * accounted for. + */ + restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + if (!restart) + restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4a280be229a6..3772a5d9e85c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2719,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) for(;; i->fd++) { struct inode *inode; - i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } + inode = file_inode(i->file); - if (inode->i_sb != i->sb) - continue; - if (get_file_rcu(i->file)) + if (inode->i_sb == i->sb) break; + + rcu_read_unlock(); + fput(i->file); + rcu_read_lock(); } rcu_read_unlock(); return i->file; diff --git a/fs/init.c b/fs/init.c index 9684406a8416..e9387b6c4f30 100644 --- a/fs/init.c +++ b/fs/init.c @@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, @@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); if (!error) error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, diff --git a/fs/inode.c b/fs/inode.c index 84bc3c76e5cc..3bb6193f436c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2006,7 +2006,7 @@ void touch_atime(const struct path *path) if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt) != 0) + if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -2018,7 +2018,7 @@ void touch_atime(const struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } @@ -2131,9 +2131,9 @@ static int __file_update_time(struct file *file, int sync_mode) struct inode *inode = file_inode(file); /* try to update time settings */ - if (!__mnt_want_write_file(file)) { + if (!mnt_get_write_access_file(file)) { ret = inode_update_time(inode, sync_mode); - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); } return ret; diff --git a/fs/internal.h b/fs/internal.h index d64ae03998cc..58e43341aebf 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write_file(struct file *); +int mnt_get_write_access_file(struct file *file); +void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); @@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); +void release_empty_file(struct file *f); + +static inline void file_put_write_access(struct file *file) +{ + put_write_access(file->f_inode); + mnt_put_write_access(file->f_path.mnt); + if (unlikely(file->f_mode & FMODE_BACKING)) + mnt_put_write_access(backing_file_user_path(file)->mnt); +} static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { - put_write_access(file->f_inode); - __mnt_drop_write(file->f_path.mnt); + file_put_write_access(file); } } @@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb) * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in - * __mnt_want_write() before the mnt_is_readonly() check. The barrier - * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already - * cleared, it will see s_readonly_remount set. + * mnt_get_write_access() before the mnt_is_readonly() check. + * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD + * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } diff --git a/fs/namei.c b/fs/namei.c index 94565bd7e73f..71c13b2990b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3104,25 +3104,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); -/** - * mode_strip_umask - handle vfs umask stripping - * @dir: parent directory of the new inode - * @mode: mode of the new inode to be created in @dir - * - * Umask stripping depends on whether or not the filesystem supports POSIX - * ACLs. If the filesystem doesn't support it umask stripping is done directly - * in here. If the filesystem does support POSIX ACLs umask stripping is - * deferred until the filesystem calls posix_acl_create(). - * - * Returns: mode - */ -static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) -{ - if (!IS_POSIXACL(dir)) - mode &= ~current_umask(); - return mode; -} - /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from @@ -3536,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd, if (likely(dentry)) goto finish_lookup; - BUG_ON(nd->flags & LOOKUP_RCU); + if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) + return ERR_PTR(-ECHILD); } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { @@ -3803,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + if (unlikely(file->f_mode & FMODE_OPENED)) + fput(file); + else + release_empty_file(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -4387,11 +4372,9 @@ retry_deleg: if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (last.name[last.len]) + if (last.name[last.len] || d_is_negative(dentry)) goto slashes; inode = dentry->d_inode; - if (d_is_negative(dentry)) - goto slashes; ihold(inode); error = security_path_unlink(&path, dentry); if (error) diff --git a/fs/namespace.c b/fs/namespace.c index e157efc54023..6bde71735efa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt) * can determine when writes are able to occur to a filesystem. */ /** - * __mnt_want_write - get write access to a mount without freeze protection + * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being - * frozen. When the write operation is finished, __mnt_drop_write() must be + * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ -int __mnt_want_write(struct vfsmount *m) +int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m) return ret; } +EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount @@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m) int ret; sb_start_write(m->mnt_sb); - ret = __mnt_want_write(m); + ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; @@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * __mnt_want_write_file - get write access to a file's mount + * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but if the file is already open for writing it + * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be - * paired with __mnt_drop_write_file. + * paired with mnt_put_write_access_file. */ -int __mnt_want_write_file(struct file *file) +int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* @@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file) return -EROFS; return 0; } - return __mnt_want_write(file->f_path.mnt); + return mnt_get_write_access(file->f_path.mnt); } /** @@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file) int ret; sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); + ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; @@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file) EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * __mnt_drop_write - give up write access to a mount + * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * __mnt_want_write() call above. + * mnt_get_write_access() call above. */ -void __mnt_drop_write(struct vfsmount *mnt) +void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } +EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount @@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); -void __mnt_drop_write_file(struct file *file) +void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) - __mnt_drop_write(file->f_path.mnt); + mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); @@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); - /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) - m->mnt_expiry_mark = 0; + WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d6473cb00cb..9b1cfca8112a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_export_op = &nfs_export_ops; break; case 4: - sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_NOUMASK; sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f7..869b016014d2 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; - struct file *f; + struct file *f = NULL; int destroy = 0, error = 0; __u32 mask; @@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) } rcu_read_lock(); - f = lookup_fd_rcu(fd); + f = lookup_fdget_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread @@ -392,6 +392,8 @@ out_err: fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); + if (f) + fput(f); return error; } diff --git a/fs/open.c b/fs/open.c index 98f6601fbac6..02dc608d40d8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) @@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -1163,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open); /** * backing_file_open - open a backing file for kernel internal use - * @path: path of the file to open + * @user_path: path that the user reuqested to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @path may be on the stackable filesystem and backing inode on the - * underlying filesystem. In this case, we want to be able to return - * the @real_path of the backing inode. This is done by embedding the - * returned file into a container structure that also stores the path of - * the backing inode on the underlying filesystem, which can be - * retrieved using backing_file_real_path(). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). */ -struct file *backing_file_open(const struct path *path, int flags, +struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred) { @@ -1187,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - path_get(real_path); - *backing_file_real_path(f) = *real_path; + path_get(user_path); + *backing_file_user_path(f) = *user_path; + f->f_path = *real_path; error = do_dentry_open(f, d_inode(real_path->dentry), NULL); if (error) { fput(f); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 3fa2416264a4..c6c39ea267c5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct dentry *real = NULL, *lower; int err; - /* It's an overlay file */ + /* + * vfs is only expected to call d_real() with NULL from d_real_inode() + * and with overlay inode from file_dentry() on an overlay file. + * + * TODO: remove @inode argument from d_real() API, remove code in this + * function that deals with non-NULL @inode and remove d_real() call + * from file_dentry(). + */ if (inode && d_inode(dentry) == inode) return dentry; + else if (inode) + goto bug; if (!d_is_reg(dentry)) { - if (!inode || inode == d_inode(dentry)) - return dentry; - goto bug; + /* d_real_inode() is only relevant for regular files */ + return dentry; } real = ovl_dentry_upper(dentry); @@ -1488,8 +1496,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; +#ifdef CONFIG_FS_POSIX_ACL sb->s_flags |= SB_POSIXACL; +#endif sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE; + /* + * Ensure that umask handling is done by the filesystems used + * for the the upper layer instead of overlayfs as that would + * lead to unexpected results. + */ + sb->s_iflags |= SB_I_NOUMASK; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); diff --git a/fs/pipe.c b/fs/pipe.c index 139190165a1c..485e3be8903c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe) return !pipe_empty(head, tail) || !writers; } +static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, + unsigned int tail) +{ + pipe_buf_release(pipe, buf); + + /* + * If the pipe has a watch_queue, we need additional protection + * by the spinlock because notifications get posted with only + * this spinlock, no mutex + */ + if (pipe_has_watch_queue(pipe)) { + spin_lock_irq(&pipe->rd_wait.lock); +#ifdef CONFIG_WATCH_QUEUE + if (buf->flags & PIPE_BUF_FLAG_LOSS) + pipe->note_loss = true; +#endif + pipe->tail = ++tail; + spin_unlock_irq(&pipe->rd_wait.lock); + return tail; + } + + /* + * Without a watch_queue, we can simply increment the tail + * without the spinlock - the mutex is enough. + */ + pipe->tail = ++tail; + return tail; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) { - pipe_buf_release(pipe, buf); - spin_lock_irq(&pipe->rd_wait.lock); -#ifdef CONFIG_WATCH_QUEUE - if (buf->flags & PIPE_BUF_FLAG_LOSS) - pipe->note_loss = true; -#endif - tail++; - pipe->tail = tail; - spin_unlock_irq(&pipe->rd_wait.lock); - } + if (!buf->len) + tail = pipe_update_tail(pipe, buf, tail); total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe_has_watch_queue(pipe)) { ret = -EXDEV; goto out; } -#endif /* * If it wasn't empty we try to merge new data into @@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * it, either the reader will consume it or it'll still * be there for the next write. */ - spin_lock_irq(&pipe->rd_wait.lock); - - head = pipe->head; - if (pipe_full(head, pipe->tail, pipe->max_usage)) { - spin_unlock_irq(&pipe->rd_wait.lock); - continue; - } - pipe->head = head + 1; - spin_unlock_irq(&pipe->rd_wait.lock); /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; @@ -1324,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) unsigned int nr_slots, size; long ret = 0; -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) + if (pipe_has_watch_queue(pipe)) return -EBUSY; -#endif size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; @@ -1379,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) if (file->f_op != &pipefifo_fops || !pipe) return NULL; -#ifdef CONFIG_WATCH_QUEUE - if (for_splice && pipe->watch_queue) + if (for_splice && pipe_has_watch_queue(pipe)) return NULL; -#endif return pipe; } diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..20695c928ee6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6276b3938842..6e72e5ad42bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, fd); - if (file) - *mode = file->f_mode; + file = task_lookup_fdget_rcu(task, fd); rcu_read_unlock(); + if (file) { + *mode = file->f_mode; + fput(file); + } return !!file; } @@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, char name[10 + 1]; unsigned int len; - f = task_lookup_next_fd_rcu(p, &fd); + f = task_lookup_next_fdget_rcu(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; rcu_read_unlock(); + fput(f); data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 4d3493579458..c6e7ebc63756 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..1593940ca01e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (anon_name) seq_printf(m, "[anon_shmem:%s]", anon_name->name); else - seq_file_path(m, file, "\n"); + seq_path(m, file_user_path(file), "\n"); goto done; } @@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); + seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 7cebd397cc26..bce674533000 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..bc4c3287a65e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un return files_lookup_fd_raw(files, fd); } -static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd) -{ - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "suspicious rcu_dereference_check() usage"); - return files_lookup_fd_raw(files, fd); -} - -static inline struct file *lookup_fd_rcu(unsigned int fd) -{ - return files_lookup_fd_rcu(current->files, fd); -} - -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd); -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd); +struct file *lookup_fdget_rcu(unsigned int fd); +struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd); +struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd); struct task_struct; diff --git a/include/linux/fs.h b/include/linux/fs.h index fb5accebdcdf..4ba1833854c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1042,7 +1042,10 @@ static inline struct file *get_file(struct file *f) atomic_long_inc(&f->f_count); return f; } -#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) + +struct file *get_file_rcu(struct file __rcu **f); +struct file *get_file_active(struct file **f); + #define file_count(x) atomic_long_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) @@ -1119,7 +1122,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) -#define SB_POSIXACL BIT(16) /* VFS does not apply the umask */ +#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ @@ -1166,6 +1169,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ /* Possible states of 'frozen' field */ enum { @@ -2082,7 +2086,12 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) + +#ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) +#else +#define IS_POSIXACL(inode) 0 +#endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) @@ -2449,24 +2458,24 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct file *backing_file_open(const struct path *path, int flags, +struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred); -struct path *backing_file_real_path(struct file *f); +struct path *backing_file_user_path(struct file *f); /* - * file_real_path - get the path corresponding to f_inode + * file_user_path - get the path to display for memory mapped file * - * When opening a backing file for a stackable filesystem (e.g., - * overlayfs) f_path may be on the stackable filesystem and f_inode on - * the underlying filesystem. When the path associated with f_inode is - * needed, this helper should be used instead of accessing f_path - * directly. -*/ -static inline const struct path *file_real_path(struct file *f) + * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file + * stored in ->vm_file is a backing file whose f_inode is on the underlying + * filesystem. When the mapped file path is displayed to user (e.g. via + * /proc//maps), this helper should be used to get the path to display + * to the user, which is the path of the fd that user has requested to map. + */ +static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) - return backing_file_real_path(f); + return backing_file_user_path(f); return &f->f_path; } diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index ed48e4f1e755..bcb6609b54b3 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) if (file->f_mode & FMODE_NONOTIFY) return 0; - /* Overlayfs internal files have fake f_path */ - path = file_real_path(file); + path = &file->f_path; return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/mount.h b/include/linux/mount.h index 4f40b40306d0..ac3dd2876197 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -extern int __mnt_want_write(struct vfsmount *); -extern void __mnt_drop_write(struct vfsmount *); +int mnt_get_write_access(struct vfsmount *mnt); +void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); diff --git a/include/linux/namei.h b/include/linux/namei.h index 1463cbda4888..3100371b5e32 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -92,6 +92,30 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); +/** + * mode_strip_umask - handle vfs umask stripping + * @dir: parent directory of the new inode + * @mode: mode of the new inode to be created in @dir + * + * In most filesystems, umask stripping depends on whether or not the + * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask + * stripping is done directly in here. If the filesystem does support POSIX + * ACLs umask stripping is deferred until the filesystem calls + * posix_acl_create(). + * + * Some filesystems (like NFSv4) also want to avoid umask stripping by the + * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK + * to get this effect without declaring that they support POSIX ACLs. + * + * Returns: mode + */ +static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode) +{ + if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK)) + mode &= ~current_umask(); + return mode; +} + extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) @@ -112,7 +136,7 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) static inline bool retry_estale(const long error, const unsigned int flags) { - return error == -ESTALE && !(flags & LOOKUP_REVAL); + return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL)); } #endif /* _LINUX_NAMEI_H */ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 608a9eb86bff..8ff23bf5a819 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -62,9 +62,6 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; -#ifdef CONFIG_WATCH_QUEUE - bool note_loss; -#endif unsigned int nr_accounted; unsigned int readers; unsigned int writers; @@ -72,6 +69,9 @@ struct pipe_inode_info { unsigned int r_counter; unsigned int w_counter; bool poll_usage; +#ifdef CONFIG_WATCH_QUEUE + bool note_loss; +#endif struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; @@ -124,6 +124,22 @@ struct pipe_buf_operations { bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; +/** + * pipe_has_watch_queue - Check whether the pipe is a watch_queue, + * i.e. it was created with O_NOTIFICATION_PIPE + * @pipe: The pipe to check + * + * Return: true if pipe is a watch queue, false otherwise. + */ +static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) +{ +#ifdef CONFIG_WATCH_QUEUE + return pipe->watch_queue != NULL; +#else + return false; +#endif +} + /** * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index 45cd42f55d49..429c7b6afead 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -32,7 +32,7 @@ struct watch_filter { DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ - struct watch_type_filter filters[]; + struct watch_type_filter filters[] __counted_by(nr_filters); }; struct watch_queue { diff --git a/init/do_mounts.c b/init/do_mounts.c index 5dfd30b13f48..5fdef94f0864 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -244,7 +244,7 @@ retry: for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) printk(" %s", p); printk("\n"); - panic("VFS: Unable to mount root fs on %s", b); + panic("VFS: Unable to mount root fs on \"%s\" or %s", pretty_name, b); out: put_page(page); } diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3fae26e025d..fb73adb89067 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -220,7 +220,6 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; struct io_close *close = io_kiocb_to_cmd(req, struct io_close); - struct fdtable *fdt; struct file *file; int ret = -EBADF; @@ -230,13 +229,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) } spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (close->fd >= fdt->max_fds) { - spin_unlock(&files->file_lock); - goto err; - } - file = rcu_dereference_protected(fdt->fd[close->fd], - lockdep_is_held(&files->file_lock)); + file = files_lookup_fd_locked(files, close->fd); if (!file || io_is_uring_fops(file)) { spin_unlock(&files->file_lock); goto err; diff --git a/kernel/acct.c b/kernel/acct.c index 1a9f929fe629..986c8214dabf 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = __mnt_want_write(internal); + err = mnt_get_write_access(internal); if (err) { mntput(internal); kfree(acct); @@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); mntput(mnt); return 0; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c4ab9d6cdbe9..82ad23b1d257 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -308,11 +308,9 @@ again: rcu_read_lock(); for (;; curr_fd++) { struct file *f; - f = task_lookup_next_fd_rcu(curr_task, &curr_fd); + f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); if (!f) break; - if (!get_file_rcu(f)) - continue; /* set info->fd */ info->fd = curr_fd; diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..640123767726 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm) struct file *exe_file; rcu_read_lock(); - exe_file = rcu_dereference(mm->exe_file); - if (exe_file && !get_file_rcu(exe_file)) - exe_file = NULL; + exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 5353edfad8e1..b0639f21041f 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, idx); + file = task_lookup_fdget_rcu(task, idx); rcu_read_unlock(); + if (file) + fput(file); return file; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index db575094c498..d8b302d01083 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, vmstart = vma->vm_start; } if (file) { - ret = trace_seq_path(s, &file->f_path); + ret = trace_seq_path(s, file_user_path(file)); if (ret) trace_seq_printf(s, "[+0x%lx]", ip - vmstart); diff --git a/mm/readahead.c b/mm/readahead.c index e815c114de21..6925e6959fd3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -735,7 +735,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) */ ret = -EINVAL; if (!f.file->f_mapping || !f.file->f_mapping->a_ops || - !S_ISREG(file_inode(f.file)->i_mode)) + (!S_ISREG(file_inode(f.file)->i_mode) && + !S_ISBLK(file_inode(f.file)->i_mode))) goto out; ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);