close-range-cloexec-unshare-v5.11

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCX94a9QAKCRCRxhvAZXjc
 ovWlAQCazbbYOeb2hHwBq8iMX+jS2ZDyOXIQgZvQtR1e8UOChwD7BNIZnAzCMbCq
 kONk43x+aIrIRknKnY5oT8w7fD4h9QQ=
 =c1y6
 -----END PGP SIGNATURE-----

Merge tag 'close-range-cloexec-unshare-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull close_range fix from Christian Brauner:
 "syzbot reported a bug when asking close_range() to unshare the file
  descriptor table and making all fds close-on-exec.

  If CLOSE_RANGE_UNSHARE the caller will receive a private file
  descriptor table in case their file descriptor table is currently
  shared before operating on the requested file descriptor range.

  For the case where the caller has requested all file descriptors to be
  actually closed via e.g. close_range(3, ~0U, CLOSE_RANGE_UNSHARE) the
  kernel knows that the caller does not need any of the file descriptors
  anymore and will optimize the close operation by only copying all
  files in the range from 0 to 3 and no others.

  However, if the caller requested CLOSE_RANGE_CLOEXEC together with
  CLOSE_RANGE_UNSHARE the caller wants to still make use of the file
  descriptors so the kernel needs to copy all of them and can't
  optimize.

  The original patch didn't account for this and thus could cause oopses
  as evidenced by the syzbot report because it assumed that all fds had
  been copied. Fix this by handling the CLOSE_RANGE_CLOEXEC case and
  copying all fds if the two flags are specified together.

  This should've been caught in the selftests but the original patch
  didn't cover this case and I didn't catch it during review. So in
  addition to the bugfix I'm also adding selftests. They will reliably
  reproduce the bug on a non-fixed kernel and allows us to catch
  regressions and verify correct behavior.

  Note, the kernel selftest tree contained a bunch of changes that made
  the original selftest fail to compile so there are small fixups in
  here make them compile without warnings"

* tag 'close-range-cloexec-unshare-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  selftests/core: add regression test for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
  selftests/core: add test for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
  selftests/core: handle missing syscall number for close_range
  selftests/core: fix close_range_test build after XFAIL removal
  close_range: unshare all fds for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
This commit is contained in:
Linus Torvalds 2020-12-19 13:03:12 -08:00
commit 467f8165a2
2 changed files with 278 additions and 7 deletions

View File

@ -694,8 +694,10 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
* If the requested range is greater than the current maximum,
* we're closing everything so only copy all file descriptors
* beneath the lowest file descriptor.
* If the caller requested all fds to be made cloexec copy all
* of the file descriptors since they still want to use them.
*/
if (max_fd >= cur_max)
if (!(flags & CLOSE_RANGE_CLOEXEC) && (max_fd >= cur_max))
max_unshare_fds = fd;
ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);

View File

@ -17,7 +17,23 @@
#include "../clone3/clone3_selftests.h"
#ifndef __NR_close_range
#define __NR_close_range -1
#if defined __alpha__
#define __NR_close_range 546
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_close_range (436 + 4000)
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_close_range (436 + 6000)
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_close_range (436 + 5000)
#endif
#elif defined __ia64__
#define __NR_close_range (436 + 1024)
#else
#define __NR_close_range 436
#endif
#endif
#ifndef CLOSE_RANGE_UNSHARE
@ -102,7 +118,7 @@ TEST(close_range_unshare)
int i, ret, status;
pid_t pid;
int open_fds[101];
struct clone_args args = {
struct __clone_args args = {
.flags = CLONE_FILES,
.exit_signal = SIGCHLD,
};
@ -191,7 +207,7 @@ TEST(close_range_unshare_capped)
int i, ret, status;
pid_t pid;
int open_fds[101];
struct clone_args args = {
struct __clone_args args = {
.flags = CLONE_FILES,
.exit_signal = SIGCHLD,
};
@ -241,7 +257,7 @@ TEST(close_range_cloexec)
fd = open("/dev/null", O_RDONLY);
ASSERT_GE(fd, 0) {
if (errno == ENOENT)
XFAIL(return, "Skipping test since /dev/null does not exist");
SKIP(return, "Skipping test since /dev/null does not exist");
}
open_fds[i] = fd;
@ -250,9 +266,9 @@ TEST(close_range_cloexec)
ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC);
if (ret < 0) {
if (errno == ENOSYS)
XFAIL(return, "close_range() syscall not supported");
SKIP(return, "close_range() syscall not supported");
if (errno == EINVAL)
XFAIL(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC");
SKIP(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC");
}
/* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. */
@ -297,5 +313,258 @@ TEST(close_range_cloexec)
}
}
TEST(close_range_cloexec_unshare)
{
int i, ret;
int open_fds[101];
struct rlimit rlimit;
for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
int fd;
fd = open("/dev/null", O_RDONLY);
ASSERT_GE(fd, 0) {
if (errno == ENOENT)
SKIP(return, "Skipping test since /dev/null does not exist");
}
open_fds[i] = fd;
}
ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC);
if (ret < 0) {
if (errno == ENOSYS)
SKIP(return, "close_range() syscall not supported");
if (errno == EINVAL)
SKIP(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC");
}
/* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. */
ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit));
rlimit.rlim_cur = 25;
ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit));
/* Set close-on-exec for two ranges: [0-50] and [75-100]. */
ret = sys_close_range(open_fds[0], open_fds[50],
CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE);
ASSERT_EQ(0, ret);
ret = sys_close_range(open_fds[75], open_fds[100],
CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE);
ASSERT_EQ(0, ret);
for (i = 0; i <= 50; i++) {
int flags = fcntl(open_fds[i], F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
}
for (i = 51; i <= 74; i++) {
int flags = fcntl(open_fds[i], F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
}
for (i = 75; i <= 100; i++) {
int flags = fcntl(open_fds[i], F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
}
/* Test a common pattern. */
ret = sys_close_range(3, UINT_MAX,
CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE);
for (i = 0; i <= 100; i++) {
int flags = fcntl(open_fds[i], F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
}
}
/*
* Regression test for syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com
*/
TEST(close_range_cloexec_syzbot)
{
int fd1, fd2, fd3, flags, ret, status;
pid_t pid;
struct __clone_args args = {
.flags = CLONE_FILES,
.exit_signal = SIGCHLD,
};
/* Create a huge gap in the fd table. */
fd1 = open("/dev/null", O_RDWR);
EXPECT_GT(fd1, 0);
fd2 = dup2(fd1, 1000);
EXPECT_GT(fd2, 0);
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
ret = sys_close_range(3, ~0U, CLOSE_RANGE_CLOEXEC);
if (ret)
exit(EXIT_FAILURE);
/*
* We now have a private file descriptor table and all
* our open fds should still be open but made
* close-on-exec.
*/
flags = fcntl(fd1, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
flags = fcntl(fd2, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
fd3 = dup2(fd1, 42);
EXPECT_GT(fd3, 0);
/*
* Duplicating the file descriptor must remove the
* FD_CLOEXEC flag.
*/
flags = fcntl(fd3, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
exit(EXIT_SUCCESS);
}
EXPECT_EQ(waitpid(pid, &status, 0), pid);
EXPECT_EQ(true, WIFEXITED(status));
EXPECT_EQ(0, WEXITSTATUS(status));
/*
* We had a shared file descriptor table before along with requesting
* close-on-exec so the original fds must not be close-on-exec.
*/
flags = fcntl(fd1, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
flags = fcntl(fd2, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
fd3 = dup2(fd1, 42);
EXPECT_GT(fd3, 0);
flags = fcntl(fd3, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
EXPECT_EQ(close(fd1), 0);
EXPECT_EQ(close(fd2), 0);
EXPECT_EQ(close(fd3), 0);
}
/*
* Regression test for syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com
*/
TEST(close_range_cloexec_unshare_syzbot)
{
int i, fd1, fd2, fd3, flags, ret, status;
pid_t pid;
struct __clone_args args = {
.flags = CLONE_FILES,
.exit_signal = SIGCHLD,
};
/*
* Create a huge gap in the fd table. When we now call
* CLOSE_RANGE_UNSHARE with a shared fd table and and with ~0U as upper
* bound the kernel will only copy up to fd1 file descriptors into the
* new fd table. If the kernel is buggy and doesn't handle
* CLOSE_RANGE_CLOEXEC correctly it will not have copied all file
* descriptors and we will oops!
*
* On a buggy kernel this should immediately oops. But let's loop just
* to be sure.
*/
fd1 = open("/dev/null", O_RDWR);
EXPECT_GT(fd1, 0);
fd2 = dup2(fd1, 1000);
EXPECT_GT(fd2, 0);
for (i = 0; i < 100; i++) {
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
ret = sys_close_range(3, ~0U, CLOSE_RANGE_UNSHARE |
CLOSE_RANGE_CLOEXEC);
if (ret)
exit(EXIT_FAILURE);
/*
* We now have a private file descriptor table and all
* our open fds should still be open but made
* close-on-exec.
*/
flags = fcntl(fd1, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
flags = fcntl(fd2, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
fd3 = dup2(fd1, 42);
EXPECT_GT(fd3, 0);
/*
* Duplicating the file descriptor must remove the
* FD_CLOEXEC flag.
*/
flags = fcntl(fd3, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
EXPECT_EQ(close(fd1), 0);
EXPECT_EQ(close(fd2), 0);
EXPECT_EQ(close(fd3), 0);
exit(EXIT_SUCCESS);
}
EXPECT_EQ(waitpid(pid, &status, 0), pid);
EXPECT_EQ(true, WIFEXITED(status));
EXPECT_EQ(0, WEXITSTATUS(status));
}
/*
* We created a private file descriptor table before along with
* requesting close-on-exec so the original fds must not be
* close-on-exec.
*/
flags = fcntl(fd1, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
flags = fcntl(fd2, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
fd3 = dup2(fd1, 42);
EXPECT_GT(fd3, 0);
flags = fcntl(fd3, F_GETFD);
EXPECT_GT(flags, -1);
EXPECT_EQ(flags & FD_CLOEXEC, 0);
EXPECT_EQ(close(fd1), 0);
EXPECT_EQ(close(fd2), 0);
EXPECT_EQ(close(fd3), 0);
}
TEST_HARNESS_MAIN