linux-stable/fs
Christian Brauner a1ff418d3e mount: fix mounting of detached mounts onto targets that reside on shared mounts
commit ee2e3f5062 upstream.

Creating a series of detached mounts, attaching them to the filesystem,
and unmounting them can be used to trigger an integer overflow in
ns->mounts causing the kernel to block any new mounts in count_mounts()
and returning ENOSPC because it falsely assumes that the maximum number
of mounts in the mount namespace has been reached, i.e. it thinks it
can't fit the new mounts into the mount namespace anymore.

Depending on the number of mounts in your system, this can be reproduced
on any kernel that supportes open_tree() and move_mount() by compiling
and running the following program:

  /* SPDX-License-Identifier: LGPL-2.1+ */

  #define _GNU_SOURCE
  #include <errno.h>
  #include <fcntl.h>
  #include <getopt.h>
  #include <limits.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mount.h>
  #include <sys/stat.h>
  #include <sys/syscall.h>
  #include <sys/types.h>
  #include <unistd.h>

  /* open_tree() */
  #ifndef OPEN_TREE_CLONE
  #define OPEN_TREE_CLONE 1
  #endif

  #ifndef OPEN_TREE_CLOEXEC
  #define OPEN_TREE_CLOEXEC O_CLOEXEC
  #endif

  #ifndef __NR_open_tree
          #if defined __alpha__
                  #define __NR_open_tree 538
          #elif defined _MIPS_SIM
                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
                          #define __NR_open_tree 4428
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
                          #define __NR_open_tree 6428
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
                          #define __NR_open_tree 5428
                  #endif
          #elif defined __ia64__
                  #define __NR_open_tree (428 + 1024)
          #else
                  #define __NR_open_tree 428
          #endif
  #endif

  /* move_mount() */
  #ifndef MOVE_MOUNT_F_EMPTY_PATH
  #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
  #endif

  #ifndef __NR_move_mount
          #if defined __alpha__
                  #define __NR_move_mount 539
          #elif defined _MIPS_SIM
                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
                          #define __NR_move_mount 4429
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
                          #define __NR_move_mount 6429
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
                          #define __NR_move_mount 5429
                  #endif
          #elif defined __ia64__
                  #define __NR_move_mount (428 + 1024)
          #else
                  #define __NR_move_mount 429
          #endif
  #endif

  static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
  {
          return syscall(__NR_open_tree, dfd, filename, flags);
  }

  static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
                                   const char *to_pathname, unsigned int flags)
  {
          return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
  }

  static bool is_shared_mountpoint(const char *path)
  {
          bool shared = false;
          FILE *f = NULL;
          char *line = NULL;
          int i;
          size_t len = 0;

          f = fopen("/proc/self/mountinfo", "re");
          if (!f)
                  return 0;

          while (getline(&line, &len, f) > 0) {
                  char *slider1, *slider2;

                  for (slider1 = line, i = 0; slider1 && i < 4; i++)
                          slider1 = strchr(slider1 + 1, ' ');

                  if (!slider1)
                          continue;

                  slider2 = strchr(slider1 + 1, ' ');
                  if (!slider2)
                          continue;

                  *slider2 = '\0';
                  if (strcmp(slider1 + 1, path) == 0) {
                          /* This is the path. Is it shared? */
                          slider1 = strchr(slider2 + 1, ' ');
                          if (slider1 && strstr(slider1, "shared:")) {
                                  shared = true;
                                  break;
                          }
                  }
          }
          fclose(f);
          free(line);

          return shared;
  }

  static void usage(void)
  {
          const char *text = "mount-new [--recursive] <base-dir>\n";
          fprintf(stderr, "%s", text);
          _exit(EXIT_SUCCESS);
  }

  #define exit_usage(format, ...)                              \
          ({                                                   \
                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
                  usage();                                     \
          })

  #define exit_log(format, ...)                                \
          ({                                                   \
                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
                  exit(EXIT_FAILURE);                          \
          })

  static const struct option longopts[] = {
          {"help",        no_argument,            0,      'a'},
          { NULL,         no_argument,            0,       0 },
  };

  int main(int argc, char *argv[])
  {
          int exit_code = EXIT_SUCCESS, index = 0;
          int dfd, fd_tree, new_argc, ret;
          char *base_dir;
          char *const *new_argv;
          char target[PATH_MAX];

          while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
                  switch (ret) {
                  case 'a':
                          /* fallthrough */
                  default:
                          usage();
                  }
          }

          new_argv = &argv[optind];
          new_argc = argc - optind;
          if (new_argc < 1)
                  exit_usage("Missing base directory\n");
          base_dir = new_argv[0];

          if (*base_dir != '/')
                  exit_log("Please specify an absolute path");

          /* Ensure that target is a shared mountpoint. */
          if (!is_shared_mountpoint(base_dir))
                  exit_log("Please ensure that \"%s\" is a shared mountpoint", base_dir);

          dfd = open(base_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
          if (dfd < 0)
                  exit_log("%m - Failed to open base directory \"%s\"", base_dir);

          ret = mkdirat(dfd, "detached-move-mount", 0755);
          if (ret < 0)
                  exit_log("%m - Failed to create required temporary directories");

          ret = snprintf(target, sizeof(target), "%s/detached-move-mount", base_dir);
          if (ret < 0 || (size_t)ret >= sizeof(target))
                  exit_log("%m - Failed to assemble target path");

          /*
           * Having a mount table with 10000 mounts is already quite excessive
           * and shoult account even for weird test systems.
           */
          for (size_t i = 0; i < 10000; i++) {
                  fd_tree = sys_open_tree(dfd, "detached-move-mount",
                                          OPEN_TREE_CLONE |
                                          OPEN_TREE_CLOEXEC |
                                          AT_EMPTY_PATH);
                  if (fd_tree < 0) {
                          fprintf(stderr, "%m - Failed to open %d(detached-move-mount)", dfd);
                          exit_code = EXIT_FAILURE;
                          break;
                  }

                  ret = sys_move_mount(fd_tree, "", dfd, "detached-move-mount", MOVE_MOUNT_F_EMPTY_PATH);
                  if (ret < 0) {
                          if (errno == ENOSPC)
                                  fprintf(stderr, "%m - Buggy mount counting");
                          else
                                  fprintf(stderr, "%m - Failed to attach mount to %d(detached-move-mount)", dfd);
                          exit_code = EXIT_FAILURE;
                          break;
                  }
                  close(fd_tree);

                  ret = umount2(target, MNT_DETACH);
                  if (ret < 0) {
                          fprintf(stderr, "%m - Failed to unmount %s", target);
                          exit_code = EXIT_FAILURE;
                          break;
                  }
          }

          (void)unlinkat(dfd, "detached-move-mount", AT_REMOVEDIR);
          close(dfd);

          exit(exit_code);
  }

and wait for the kernel to refuse any new mounts by returning ENOSPC.
How many iterations are needed depends on the number of mounts in your
system. Assuming you have something like 50 mounts on a standard system
it should be almost instantaneous.

The root cause of this is that detached mounts aren't handled correctly
when source and target mount are identical and reside on a shared mount
causing a broken mount tree where the detached source itself is
propagated which propagation prevents for regular bind-mounts and new
mounts. This ultimately leads to a miscalculation of the number of
mounts in the mount namespace.

Detached mounts created via
open_tree(fd, path, OPEN_TREE_CLONE)
are essentially like an unattached new mount, or an unattached
bind-mount. They can then later on be attached to the filesystem via
move_mount() which calls into attach_recursive_mount(). Part of
attaching it to the filesystem is making sure that mounts get correctly
propagated in case the destination mountpoint is MS_SHARED, i.e. is a
shared mountpoint. This is done by calling into propagate_mnt() which
walks the list of peers calling propagate_one() on each mount in this
list making sure it receives the propagation event.
The propagate_one() functions thereby skips both new mounts and bind
mounts to not propagate them "into themselves". Both are identified by
checking whether the mount is already attached to any mount namespace in
mnt->mnt_ns. The is what the IS_MNT_NEW() helper is responsible for.

However, detached mounts have an anonymous mount namespace attached to
them stashed in mnt->mnt_ns which means that IS_MNT_NEW() doesn't
realize they need to be skipped causing the mount to propagate "into
itself" breaking the mount table and causing a disconnect between the
number of mounts recorded as being beneath or reachable from the target
mountpoint and the number of mounts actually recorded/counted in
ns->mounts ultimately causing an overflow which in turn prevents any new
mounts via the ENOSPC issue.

So teach propagation to handle detached mounts by making it aware of
them. I've been tracking this issue down for the last couple of days and
then verifying that the fix is correct by
unmounting everything in my current mount table leaving only /proc and
/sys mounted and running the reproducer above overnight verifying the
number of mounts counted in ns->mounts. With this fix the counts are
correct and the ENOSPC issue can't be reproduced.

This change will only have an effect on mounts created with the new
mount API since detached mounts cannot be created with the old mount API
so regressions are extremely unlikely.

Link: https://lore.kernel.org/r/20210306101010.243666-1-christian.brauner@ubuntu.com
Fixes: 2db154b3ea ("vfs: syscall: Add move_mount(2) to move mounts around")
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: <stable@vger.kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2021-03-17 17:03:33 +01:00
..
9p 9P: Cast to loff_t before multiplying 2020-11-05 11:43:34 +01:00
adfs
affs fs/affs: release old buffer head on error path 2021-03-04 10:26:48 +01:00
afs rxrpc: Fix deadlock around release of dst cached on udp tunnel 2021-02-10 09:25:27 +01:00
autofs autofs: fix a leak in autofs_expire_indirect() 2019-10-25 00:03:11 -04:00
befs
bfs bfs: don't use WARNING: string when it's just info. 2021-01-06 14:48:39 +01:00
btrfs btrfs: fix warning when creating a directory with smack enabled 2021-03-09 11:09:37 +01:00
cachefiles cachefiles: Handle readpage error correctly 2020-11-05 11:43:36 +01:00
ceph ceph: fix race in concurrent __ceph_remove_cap invocations 2020-12-30 11:51:40 +01:00
cifs cifs: Set CIFS_MOUNT_USE_PREFIX_PATH flag on setting cifs_sb->prepath. 2021-02-26 10:10:28 +01:00
coda y2038: add inode timestamp clamping 2019-09-19 09:42:37 -07:00
configfs configfs: fix config_item refcnt leak in configfs_rmdir() 2020-05-27 17:46:30 +02:00
cramfs cramfs: fix usage on non-MTD device 2019-11-23 21:44:49 -05:00
crypto fscrypt: remove kernel-internal constants from UAPI header 2021-01-06 14:48:35 +01:00
debugfs debugfs: do not attempt to create a new file before the filesystem is initalized 2021-03-04 10:26:10 +01:00
devpts
dlm fs: dlm: fix configfs memory leak 2020-10-29 09:58:03 +01:00
ecryptfs ecryptfs: replace BUG_ON with error handling code 2020-02-28 17:22:26 +01:00
efivarfs efivarfs: revert "fix memory leak in efivarfs_create()" 2020-12-02 08:49:53 +01:00
efs
erofs erofs: fix shift-out-of-bounds of blkszbits 2021-03-07 12:20:42 +01:00
exportfs exportfs_decode_fh(): negative pinned may become positive without the parent locked 2019-11-10 11:56:05 -05:00
ext2 ext2: don't update mtime on COW faults 2020-09-09 19:12:30 +02:00
ext4 ext4: fix potential htree index checksum corruption 2021-03-04 10:26:37 +01:00
f2fs f2fs: fix to set/clear I_LINKABLE under i_lock 2021-03-07 12:20:46 +01:00
fat fat: don't allow to mount if the FAT length == 0 2020-06-17 16:40:36 +02:00
freevxfs
fscache
fuse fuse: fix bad inode 2021-01-09 13:44:54 +01:00
gfs2 gfs2: Recursive gfs2_quota_hold in gfs2_iomap_end 2021-03-04 10:26:51 +01:00
hfs
hfsplus hfsplus: fix crash and filesystem corruption when deleting files 2020-04-17 10:50:22 +02:00
hostfs
hpfs
hugetlbfs mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page 2021-02-10 09:25:31 +01:00
iomap iomap: fix WARN_ON_ONCE() from unprivileged users 2020-10-29 09:58:06 +01:00
isofs isofs: release buffer head before return 2021-03-04 10:26:30 +01:00
jbd2 jbd2: fix up sparse warnings in checkpoint code 2020-11-18 19:20:30 +01:00
jffs2 jffs2: fix use after free in jffs2_sum_write_data() 2021-03-04 10:26:25 +01:00
jfs JFS: more checks for invalid superblock 2021-03-07 12:20:41 +01:00
kernfs kernfs: do not call fsnotify() with name without a parent 2020-08-19 08:16:12 +02:00
lockd lockd: don't use interval-based rebinding over TCP 2020-12-30 11:51:16 +01:00
minix fs/minix: remove expected error message in block_to_path() 2020-08-21 13:05:37 +02:00
nfs pNFS/NFSv4: Try to return invalid layout in pnfs_layout_process() 2021-02-13 13:52:55 +01:00
nfs_common nfs_common: need lock during iterate through the list 2020-12-30 11:51:22 +01:00
nfsd nfsd: register pernet ops last, unregister first 2021-03-04 10:26:34 +01:00
nilfs2 nilfs2: fix null pointer dereference at nilfs_segctor_do_construct() 2020-06-17 16:40:29 +02:00
nls
notify fanotify: fix ignore mask logic for events on child and on dir 2020-06-17 16:40:24 +02:00
ntfs ntfs: check for valid standard information attribute 2021-02-26 10:10:27 +01:00
ocfs2 ocfs2: fix a use after free on error 2021-03-04 10:26:39 +01:00
omfs
openpromfs
orangefs orangefs: get rid of knob code... 2020-08-21 13:05:29 +02:00
overlayfs ovl: expand warning in ovl_d_real() 2021-02-17 10:35:19 +01:00
proc proc: fix lookup in /proc/net subdirectories after setns(2) 2021-01-12 20:16:10 +01:00
pstore pstore: Fix typo in compression option name 2021-03-04 10:26:45 +01:00
qnx4
qnx6
quota quota: Fix memory leak when handling corrupted quota file 2021-03-04 10:26:26 +01:00
ramfs ramfs: fix nommu mmap with gaps in the page cache 2020-10-29 09:57:53 +01:00
reiserfs reiserfs: add check for an invalid ih_entry_count 2021-01-06 14:48:38 +01:00
romfs romfs: fix uninitialized memory leak in romfs_dev_read() 2020-08-26 10:40:51 +02:00
squashfs squashfs: add more sanity checks in xattr id lookup 2021-02-13 13:52:57 +01:00
sysfs sysfs: Add sysfs_emit and sysfs_emit_at to format sysfs output 2021-03-07 12:20:48 +01:00
sysv
tracefs tracing: Do not create tracefs files if tracefs lockdown is in effect 2019-10-12 20:49:07 -04:00
ubifs ubifs: Fix error return code in alloc_wbufs() 2021-03-04 10:26:26 +01:00
udf udf: fix the problem that the disc content is not displayed 2021-02-07 15:35:49 +01:00
ufs fs/ufs: avoid potential u32 multiplication overflow 2020-08-21 13:05:37 +02:00
unicode unicode: make array 'token' static const, makes object smaller 2019-09-17 11:48:24 -04:00
verity
xfs xfs: Fix assert failure in xfs_setattr_size() 2021-03-07 12:20:42 +01:00
aio.c aio: fix async fsync creds 2020-06-17 16:40:24 +02:00
anon_inodes.c
attr.c utimes: Clamp the timestamps in notify_change() 2020-02-11 04:35:12 -08:00
bad_inode.c
binfmt_aout.c
binfmt_elf.c fs/binfmt_elf.c: allocate initialized memory in fill_thread_core_info() 2020-06-03 08:21:27 +02:00
binfmt_elf_fdpic.c
binfmt_em86.c
binfmt_flat.c binfmt_flat: revert "binfmt_flat: don't offset the data start" 2020-09-03 11:26:39 +02:00
binfmt_misc.c
binfmt_script.c
block_dev.c bdev: Reduce time holding bd_mutex in sync in blkdev_close() 2020-10-01 13:17:55 +02:00
buffer.c fs: Don't invalidate page buffers in block_write_full_page() 2020-11-05 11:43:24 +01:00
char_dev.c chardev: Avoid potential use-after-free in 'chrdev_open()' 2020-01-14 20:08:18 +01:00
compat.c
compat_binfmt_elf.c
compat_ioctl.c fix compat handling of FICLONERANGE, FIDEDUPERANGE and FS_IOC_FIEMAP 2020-01-09 10:20:05 +01:00
coredump.c coredump: fix core_pattern parse error 2020-12-11 13:23:30 +01:00
d_path.c fs: fix NULL dereference due to data race in prepend_path() 2020-10-29 09:57:45 +01:00
dax.c mm: provide a saner PTE walking API for modules 2021-02-26 10:10:28 +01:00
dcache.c fix dget_parent() fastpath race 2020-10-01 13:17:19 +02:00
dcookies.c
direct-io.c fs/direct-io.c: fix kernel-doc warning 2019-10-14 15:04:01 -07:00
drop_caches.c fs: avoid softlockups in s_inodes iterators 2020-01-12 12:21:37 +01:00
eventfd.c eventfd: track eventfd_signal() recursion depth 2020-02-11 04:35:37 -08:00
eventpoll.c ep_create_wakeup_source(): dentry name can change under you... 2020-10-07 08:01:31 +02:00
exec.c exec: Transform exec_update_mutex into a rw_semaphore 2021-01-09 13:44:55 +01:00
fcntl.c fcntl: Fix potential deadlock in send_sig{io, urg}() 2021-01-06 14:48:39 +01:00
fhandle.c
file.c fix multiplication overflow in copy_fdtable() 2020-05-27 17:46:12 +02:00
file_table.c
filesystems.c fs/filesystems.c: downgrade user-reachable WARN_ONCE() to pr_warn_once() 2020-04-17 10:50:21 +02:00
fs-writeback.c fs: fix lazytime expiration handling in __writeback_single_inode() 2021-01-30 13:54:11 +01:00
fs_context.c vfs: subtype handling moved to fuse 2019-09-06 21:28:49 +02:00
fs_parser.c vfs: Make fs_parse() handle fs_param_is_fd-type params better 2019-09-12 21:06:14 -04:00
fs_pin.c
fs_struct.c
fs_types.c
fsopen.c
inode.c futex: Fix inode life-time issue 2020-03-25 08:25:58 +01:00
internal.h fs: move guard_bio_eod() after bio_set_op_attrs 2020-01-17 19:48:21 +01:00
io_uring.c io_uring: Fix current->fs handling in io_sq_wq_submit_work() 2021-01-30 13:54:10 +01:00
ioctl.c compat_ioctl: add compat_ptr_ioctl() 2019-12-17 19:55:30 +01:00
Kconfig fs-verity for 5.4 2019-09-18 16:59:14 -07:00
Kconfig.binfmt
libfs.c libfs: fix error cast of negative value in simple_attr_write() 2020-11-24 13:29:19 +01:00
locks.c locks: reinstate locks_delete_block optimization 2020-03-25 08:25:41 +01:00
Makefile fs-verity for 5.4 2019-09-18 16:59:14 -07:00
mbcache.c
mount.h
mpage.c fs: move guard_bio_eod() after bio_set_op_attrs 2020-01-17 19:48:21 +01:00
namei.c namei: only return -ECHILD from follow_dotdot_rcu() 2020-03-05 16:43:48 +01:00
namespace.c fs/namespace.c: WARN if mnt_count has become negative 2021-01-06 14:48:40 +01:00
no-block.c
nsfs.c
open.c cifs_atomic_open(): fix double-put on late allocation failure 2020-03-18 07:17:51 +01:00
pipe.c
pnode.c propagate_one(): mnt_set_mountpoint() needs mount_lock 2020-05-02 08:48:44 +02:00
pnode.h mount: fix mounting of detached mounts onto targets that reside on shared mounts 2021-03-17 17:03:33 +01:00
posix_acl.c
proc_namespace.c vfs: subtype handling moved to fuse 2019-09-06 21:28:49 +02:00
read_write.c fs: allow deduplication of eof block into the end of the destination file 2020-02-11 04:35:23 -08:00
readdir.c readdir: be more conservative with directory entry names 2020-01-29 16:45:31 +01:00
select.c
seq_file.c
signalfd.c fs/signalfd.c: fix inconsistent return codes for signalfd4 2020-08-26 10:40:58 +02:00
splice.c splice: only read in as much information as there is pipe buffer space 2019-12-17 19:56:52 +01:00
stack.c
stat.c
statfs.c vfs: Fix EOVERFLOW testing in put_compat_statfs64 2019-10-03 14:21:35 -07:00
super.c vfs: remove lockdep bogosity in __sb_start_write 2020-11-24 13:29:01 +01:00
sync.c
timerfd.c
userfaultfd.c userfaultfd: require CAP_SYS_PTRACE for UFFD_FEATURE_EVENT_FORK 2020-01-04 19:18:32 +01:00
utimes.c utimes: Clamp the timestamps in notify_change() 2020-02-11 04:35:12 -08:00
xattr.c xattr: break delegations in {set,remove}xattr 2020-08-11 15:33:39 +02:00