From fec8a6a691033f2538cd46848f17f337f0739923 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Dec 2020 22:33:03 +0100 Subject: [PATCH 1/5] close_range: unshare all fds for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC After introducing CLOSE_RANGE_CLOEXEC syzbot reported a crash when CLOSE_RANGE_CLOEXEC is specified in conjunction with CLOSE_RANGE_UNSHARE. When CLOSE_RANGE_UNSHARE is specified the caller will receive a private file descriptor table in case their file descriptor table is currently shared. For the case where the caller has requested all file descriptors to be actually closed via e.g. close_range(3, ~0U, 0) the kernel knows that the caller does not need any of the file descriptors anymore and will optimize the close operation by only copying all files in the range from 0 to 3 and no others. However, if the caller requested CLOSE_RANGE_CLOEXEC together with CLOSE_RANGE_UNSHARE the caller wants to still make use of the file descriptors so the kernel needs to copy all of them and can't optimize. The original patch didn't account for this and thus could cause oopses as evidenced by the syzbot report because it assumed that all fds had been copied. Fix this by handling the CLOSE_RANGE_CLOEXEC case. 
syzbot reported ================================================================== BUG: KASAN: null-ptr-deref in instrument_atomic_read include/linux/instrumented.h:71 [inline] BUG: KASAN: null-ptr-deref in atomic64_read include/asm-generic/atomic-instrumented.h:837 [inline] BUG: KASAN: null-ptr-deref in atomic_long_read include/asm-generic/atomic-long.h:29 [inline] BUG: KASAN: null-ptr-deref in filp_close+0x22/0x170 fs/open.c:1274 Read of size 8 at addr 0000000000000077 by task syz-executor511/8522 CPU: 1 PID: 8522 Comm: syz-executor511 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 __kasan_report mm/kasan/report.c:549 [inline] kasan_report.cold+0x5/0x37 mm/kasan/report.c:562 check_memory_region_inline mm/kasan/generic.c:186 [inline] check_memory_region+0x13d/0x180 mm/kasan/generic.c:192 instrument_atomic_read include/linux/instrumented.h:71 [inline] atomic64_read include/asm-generic/atomic-instrumented.h:837 [inline] atomic_long_read include/asm-generic/atomic-long.h:29 [inline] filp_close+0x22/0x170 fs/open.c:1274 close_files fs/file.c:402 [inline] put_files_struct fs/file.c:417 [inline] put_files_struct+0x1cc/0x350 fs/file.c:414 exit_files+0x12a/0x170 fs/file.c:435 do_exit+0xb4f/0x2a00 kernel/exit.c:818 do_group_exit+0x125/0x310 kernel/exit.c:920 get_signal+0x428/0x2100 kernel/signal.c:2792 arch_do_signal_or_restart+0x2a8/0x1eb0 arch/x86/kernel/signal.c:811 handle_signal_work kernel/entry/common.c:147 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x124/0x200 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x447039 Code: Unable to access opcode bytes at RIP 0x44700f. 
RSP: 002b:00007f1b1225cdb8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: 0000000000000001 RBX: 00000000006dbc28 RCX: 0000000000447039 RDX: 00000000000f4240 RSI: 0000000000000081 RDI: 00000000006dbc2c RBP: 00000000006dbc20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c R13: 00007fff223b6bef R14: 00007f1b1225d9c0 R15: 00000000006dbc2c ================================================================== syzbot has tested the proposed patch and the reproducer did not trigger any issue: Reported-and-tested-by: syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com Tested on: commit: 10f7cddd selftests/core: add regression test for CLOSE_RAN.. git tree: git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git vfs kernel config: https://syzkaller.appspot.com/x/.config?x=5d42216b510180e3 dashboard link: https://syzkaller.appspot.com/bug?extid=96cfd2b22b3213646a93 compiler: gcc (GCC) 10.1.0-syz 20200507 Reported-by: syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com Fixes: 582f1fb6b721 ("fs, close_range: add flag CLOSE_RANGE_CLOEXEC") Cc: Giuseppe Scrivano Cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20201217213303.722643-1-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 8434e0afecc7..c0b60961c672 100644 --- a/fs/file.c +++ b/fs/file.c @@ -694,8 +694,10 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) * If the requested range is greater than the current maximum, * we're closing everything so only copy all file descriptors * beneath the lowest file descriptor. + * If the caller requested all fds to be made cloexec copy all + * of the file descriptors since they still want to use them. 
*/ - if (max_fd >= cur_max) + if (!(flags & CLOSE_RANGE_CLOEXEC) && (max_fd >= cur_max)) max_unshare_fds = fd; ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); From ca202504ea6f04b2e724741100ab63f8f018a8af Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Dec 2020 15:54:12 +0100 Subject: [PATCH 2/5] selftests/core: fix close_range_test build after XFAIL removal XFAIL was removed in commit 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP") and its use in close_range_test was already replaced by commit 1d44d0dd61b6 ("selftests: core: use SKIP instead of XFAIL in close_range_test.c"). However, commit 23afeaeff3d9 ("selftests: core: add tests for CLOSE_RANGE_CLOEXEC") introduced usage of XFAIL in TEST(close_range_cloexec). Use SKIP there as well. Fixes: 23afeaeff3d9 ("selftests: core: add tests for CLOSE_RANGE_CLOEXEC") Cc: Giuseppe Scrivano Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Tobias Klauser Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20201218112428.13662-1-tklauser@distanz.ch Link: https://lore.kernel.org/r/20201218145415.801063-1-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- tools/testing/selftests/core/close_range_test.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 87e16d65d9d7..c97dd1a7abd6 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -102,7 +102,7 @@ TEST(close_range_unshare) int i, ret, status; pid_t pid; int open_fds[101]; - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_FILES, .exit_signal = SIGCHLD, }; @@ -191,7 +191,7 @@ TEST(close_range_unshare_capped) int i, ret, status; pid_t pid; int open_fds[101]; - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_FILES, .exit_signal = SIGCHLD, }; @@ -241,7 +241,7 @@ TEST(close_range_cloexec) 
fd = open("/dev/null", O_RDONLY); ASSERT_GE(fd, 0) { if (errno == ENOENT) - XFAIL(return, "Skipping test since /dev/null does not exist"); + SKIP(return, "Skipping test since /dev/null does not exist"); } open_fds[i] = fd; @@ -250,9 +250,9 @@ TEST(close_range_cloexec) ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC); if (ret < 0) { if (errno == ENOSYS) - XFAIL(return, "close_range() syscall not supported"); + SKIP(return, "close_range() syscall not supported"); if (errno == EINVAL) - XFAIL(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); + SKIP(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); } /* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. */ From ae78ba8d3bb66dfe8c0f7b7ec5ffe3f6a13feb86 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Dec 2020 15:54:13 +0100 Subject: [PATCH 3/5] selftests/core: handle missing syscall number for close_range This improves the syscall number handling in the close_range() selftests. This should handle any architecture. 
Cc: Giuseppe Scrivano Cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20201218145415.801063-2-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- .../testing/selftests/core/close_range_test.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index c97dd1a7abd6..bc592a1372bb 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -17,7 +17,23 @@ #include "../clone3/clone3_selftests.h" #ifndef __NR_close_range -#define __NR_close_range -1 + #if defined __alpha__ + #define __NR_close_range 546 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_close_range (436 + 4000) + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_close_range (436 + 6000) + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_close_range (436 + 5000) + #endif + #elif defined __ia64__ + #define __NR_close_range (436 + 1024) + #else + #define __NR_close_range 436 + #endif #endif #ifndef CLOSE_RANGE_UNSHARE From fe325c3ff3188d551668c5847bac58463b9f3437 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Dec 2020 15:54:14 +0100 Subject: [PATCH 4/5] selftests/core: add test for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC Add a test to verify that CLOSE_RANGE_UNSHARE works correctly when combined with CLOSE_RANGE_CLOEXEC for the single-threaded case. 
Cc: Giuseppe Scrivano Cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20201218145415.801063-3-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- .../testing/selftests/core/close_range_test.c | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index bc592a1372bb..862444f1c244 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -313,5 +313,75 @@ TEST(close_range_cloexec) } } +TEST(close_range_cloexec_unshare) +{ + int i, ret; + int open_fds[101]; + struct rlimit rlimit; + + for (i = 0; i < ARRAY_SIZE(open_fds); i++) { + int fd; + + fd = open("/dev/null", O_RDONLY); + ASSERT_GE(fd, 0) { + if (errno == ENOENT) + SKIP(return, "Skipping test since /dev/null does not exist"); + } + + open_fds[i] = fd; + } + + ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "close_range() syscall not supported"); + if (errno == EINVAL) + SKIP(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); + } + + /* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. */ + ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit)); + rlimit.rlim_cur = 25; + ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit)); + + /* Set close-on-exec for two ranges: [0-50] and [75-100]. 
*/ + ret = sys_close_range(open_fds[0], open_fds[50], + CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE); + ASSERT_EQ(0, ret); + ret = sys_close_range(open_fds[75], open_fds[100], + CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE); + ASSERT_EQ(0, ret); + + for (i = 0; i <= 50; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + } + + for (i = 51; i <= 74; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + } + + for (i = 75; i <= 100; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + } + + /* Test a common pattern. */ + ret = sys_close_range(3, UINT_MAX, + CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE); + for (i = 0; i <= 100; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + } +} TEST_HARNESS_MAIN From 6abc20f8f879d891930f37186b19c9dc3ecc34dd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Dec 2020 15:54:15 +0100 Subject: [PATCH 5/5] selftests/core: add regression test for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC This test is a minimalized version of the reproducer given by syzbot (cf. [1]). After introducing CLOSE_RANGE_CLOEXEC syzbot reported a crash when CLOSE_RANGE_CLOEXEC is specified in conjunction with CLOSE_RANGE_UNSHARE. When CLOSE_RANGE_UNSHARE is specified the caller will receive a private file descriptor table in case their file descriptor table is currently shared. For the case where the caller has requested all file descriptors to be actually closed via e.g. close_range(3, ~0U, 0) the kernel knows that the caller does not need any of the file descriptors anymore and will optimize the close operation by only copying all files in the range from 0 to 3 and no others. 
However, if the caller requested CLOSE_RANGE_CLOEXEC together with CLOSE_RANGE_UNSHARE the caller wants to still make use of the file descriptors so the kernel needs to copy all of them and can't optimize. The original patch didn't account for this and thus could cause oopses as evidenced by the syzbot report. Add tests for this regression. We first create a huge gap in the fd table. When we now call CLOSE_RANGE_UNSHARE with a shared fd table and with ~0U as upper bound the kernel will only copy up to fd1 file descriptors into the new fd table. If the kernel is buggy and doesn't handle CLOSE_RANGE_CLOEXEC correctly it will not have copied all file descriptors and we will oops! This test passes on a fixed kernel and will trigger an oops on a buggy kernel. [1]: https://syzkaller.appspot.com/text?tag=KernelConfig&x=db720fe37a6a41d8 Cc: Giuseppe Scrivano Cc: linux-fsdevel@vger.kernel.org Link: syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20201218145415.801063-4-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- .../testing/selftests/core/close_range_test.c | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 862444f1c244..73eb29c916d1 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -384,4 +384,187 @@ TEST(close_range_cloexec_unshare) } } +/* + * Regression test for syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com + */ +TEST(close_range_cloexec_syzbot) +{ + int fd1, fd2, fd3, flags, ret, status; + pid_t pid; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + /* Create a huge gap in the fd table. 
*/ + fd1 = open("/dev/null", O_RDWR); + EXPECT_GT(fd1, 0); + + fd2 = dup2(fd1, 1000); + EXPECT_GT(fd2, 0); + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = sys_close_range(3, ~0U, CLOSE_RANGE_CLOEXEC); + if (ret) + exit(EXIT_FAILURE); + + /* + * The file descriptor table is still shared with the parent; + * all our open fds should still be open but made + * close-on-exec. + */ + flags = fcntl(fd1, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + + flags = fcntl(fd2, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + + fd3 = dup2(fd1, 42); + EXPECT_GT(fd3, 0); + + /* + * Duplicating the file descriptor must remove the + * FD_CLOEXEC flag. + */ + flags = fcntl(fd3, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + /* + * We had a shared file descriptor table before along with requesting + * close-on-exec so the original fds must be close-on-exec as well. + */ + flags = fcntl(fd1, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + + flags = fcntl(fd2, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + + fd3 = dup2(fd1, 42); + EXPECT_GT(fd3, 0); + + flags = fcntl(fd3, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + + EXPECT_EQ(close(fd1), 0); + EXPECT_EQ(close(fd2), 0); + EXPECT_EQ(close(fd3), 0); +} + +/* + * Regression test for syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com + */ +TEST(close_range_cloexec_unshare_syzbot) +{ + int i, fd1, fd2, fd3, flags, ret, status; + pid_t pid; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + /* + * Create a huge gap in the fd table. 
When we now call + * CLOSE_RANGE_UNSHARE with a shared fd table and with ~0U as upper + * bound the kernel will only copy up to fd1 file descriptors into the + * new fd table. If the kernel is buggy and doesn't handle + * CLOSE_RANGE_CLOEXEC correctly it will not have copied all file + * descriptors and we will oops! + * + * On a buggy kernel this should immediately oops. But let's loop just + * to be sure. + */ + fd1 = open("/dev/null", O_RDWR); + EXPECT_GT(fd1, 0); + + fd2 = dup2(fd1, 1000); + EXPECT_GT(fd2, 0); + + for (i = 0; i < 100; i++) { + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = sys_close_range(3, ~0U, CLOSE_RANGE_UNSHARE | + CLOSE_RANGE_CLOEXEC); + if (ret) + exit(EXIT_FAILURE); + + /* + * We now have a private file descriptor table and all + * our open fds should still be open but made + * close-on-exec. + */ + flags = fcntl(fd1, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + + flags = fcntl(fd2, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + + fd3 = dup2(fd1, 42); + EXPECT_GT(fd3, 0); + + /* + * Duplicating the file descriptor must remove the + * FD_CLOEXEC flag. + */ + flags = fcntl(fd3, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + + EXPECT_EQ(close(fd1), 0); + EXPECT_EQ(close(fd2), 0); + EXPECT_EQ(close(fd3), 0); + + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + } + + /* + * We created a private file descriptor table before along with + * requesting close-on-exec so the original fds must not be + * close-on-exec. 
+ */ + flags = fcntl(fd1, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + + flags = fcntl(fd2, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + + fd3 = dup2(fd1, 42); + EXPECT_GT(fd3, 0); + + flags = fcntl(fd3, F_GETFD); + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + + EXPECT_EQ(close(fd1), 0); + EXPECT_EQ(close(fd2), 0); + EXPECT_EQ(close(fd3), 0); +} + TEST_HARNESS_MAIN