diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 324352787aea..b606cded90cd 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -923,7 +923,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
 
 	/* Do seccomp after ptrace; syscall may have changed. */
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-	if (secure_computing(NULL) == -1)
+	if (secure_computing() == -1)
 		return -1;
 #else
 	/* XXX: remove this once OABI gets fixed */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 21176d02e21a..6771c399d40c 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1816,7 +1816,7 @@ int syscall_trace_enter(struct pt_regs *regs)
 	}
 
 	/* Do the secure computing after ptrace; failures should be fast. */
-	if (secure_computing(NULL) == -1)
+	if (secure_computing() == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 9f6ff7bc06f9..f8c07dcbfb49 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -342,7 +342,7 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 	}
 
 	/* Do the secure computing check after ptrace. */
-	if (secure_computing(NULL) == -1)
+	if (secure_computing() == -1)
 		return -1;
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 0f84628b9385..407464201b91 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -159,7 +159,7 @@ __visible void do_syscall_trace_enter(struct pt_regs *regs)
 	 * If this fails we might have return value in a0 from seccomp
 	 * (via SECCOMP_RET_ERRNO/TRACE).
 	 */
-	if (secure_computing(NULL) == -1) {
+	if (secure_computing() == -1) {
 		syscall_set_nr(current, regs, -1);
 		return;
 	}
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index ad71132374f0..58faa12542a1 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -856,7 +856,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	}
 
 	/* Do the secure computing check after ptrace. */
-	if (secure_computing(NULL)) {
+	if (secure_computing()) {
 		/* seccomp failures shouldn't expose any additional code. */
 		return -1;
 	}
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index f574b1856bc6..40d90dddf3f1 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -35,7 +35,7 @@ void handle_syscall(struct uml_pt_regs *r)
 		goto out;
 
 	/* Do the seccomp check after ptrace; failures should be fast. */
-	if (secure_computing(NULL) == -1)
+	if (secure_computing() == -1)
 		goto out;
 
 	syscall = UPT_SYSCALL_NR(r);
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e7c596dea947..b10cbf71a8cc 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -222,7 +222,7 @@ bool emulate_vsyscall(unsigned long error_code,
 	 */
 	regs->orig_ax = syscall_nr;
 	regs->ax = -ENOSYS;
-	tmp = secure_computing(NULL);
+	tmp = secure_computing();
 	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
 		warn_bad_vsyscall(KERN_DEBUG, regs,
 				  "seccomp tried to change syscall nr or ip");
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 84868d37b35d..03583b6d1416 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -33,10 +33,10 @@ struct seccomp {
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 extern int __secure_computing(const struct seccomp_data *sd);
-static inline int secure_computing(const struct seccomp_data *sd)
+static inline int secure_computing(void)
 {
 	if (unlikely(test_thread_flag(TIF_SECCOMP)))
-		return __secure_computing(sd);
+		return __secure_computing(NULL);
 	return 0;
 }
 #else
@@ -59,7 +59,7 @@ struct seccomp { };
 struct seccomp_filter { };
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-static inline int secure_computing(struct seccomp_data *sd) { return 0; }
+static inline int secure_computing(void) { return 0; }
 #else
 static inline void secure_computing_strict(int this_syscall) { return; }
 #endif
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 90734aa5aa36..be84d87f1f46 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -76,6 +76,35 @@ struct seccomp_notif {
 	struct seccomp_data data;
 };
 
+/*
+ * Valid flags for struct seccomp_notif_resp
+ *
+ * Note, the SECCOMP_USER_NOTIF_FLAG_CONTINUE flag must be used with caution!
+ * If set by the process supervising the syscalls of another process the
+ * syscall will continue. This is problematic because of an inherent TOCTOU.
+ * An attacker can exploit the time while the supervised process is waiting on
+ * a response from the supervising process to rewrite syscall arguments which
+ * are passed as pointers of the intercepted syscall.
+ * It should be absolutely clear that this means that the seccomp notifier
+ * _cannot_ be used to implement a security policy! It should only ever be used
+ * in scenarios where a more privileged process supervises the syscalls of a
+ * lesser privileged process to get around kernel-enforced security
+ * restrictions when the privileged process deems this safe. In other words,
+ * in order to continue a syscall the supervising process should be sure that
+ * another security mechanism or the kernel itself will sufficiently block
+ * syscalls if arguments are rewritten to something unsafe.
+ *
+ * Similar precautions should be applied when stacking SECCOMP_RET_USER_NOTIF
+ * or SECCOMP_RET_TRACE. For SECCOMP_RET_USER_NOTIF filters acting on the
+ * same syscall, the most recently added filter takes precedence. This means
+ * that the new SECCOMP_RET_USER_NOTIF filter can override any
+ * SECCOMP_IOCTL_NOTIF_SEND from earlier filters, essentially allowing all
+ * such filtered syscalls to be executed by sending the response
+ * SECCOMP_USER_NOTIF_FLAG_CONTINUE. Note that SECCOMP_RET_TRACE can equally
+ * be overridden by SECCOMP_USER_NOTIF_FLAG_CONTINUE.
+ */
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE (1UL << 0)
+
 struct seccomp_notif_resp {
 	__u64 id;
 	__s64 val;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..12d2227e5786 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -75,6 +75,7 @@ struct seccomp_knotif {
 	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
 	int error;
 	long val;
+	u32 flags;
 
 	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
 	struct completion ready;
@@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
 	return filter->notif->next_id++;
 }
 
-static void seccomp_do_user_notification(int this_syscall,
-					 struct seccomp_filter *match,
-					 const struct seccomp_data *sd)
+static int seccomp_do_user_notification(int this_syscall,
+					struct seccomp_filter *match,
+					const struct seccomp_data *sd)
 {
 	int err;
+	u32 flags = 0;
 	long ret = 0;
 	struct seccomp_knotif n = {};
 
@@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall,
 	if (err == 0) {
 		ret = n.val;
 		err = n.error;
+		flags = n.flags;
 	}
 
 	/*
@@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall,
 	list_del(&n.list);
 out:
 	mutex_unlock(&match->notify_lock);
+
+	/* Userspace requests to continue the syscall. */
+	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+		return 0;
+
 	syscall_set_return_value(current, task_pt_regs(current),
 				 err, ret);
+	return -1;
 }
 
 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
@@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		return 0;
 
 	case SECCOMP_RET_USER_NOTIF:
-		seccomp_do_user_notification(this_syscall, match, sd);
-		goto skip;
+		if (seccomp_do_user_notification(this_syscall, match, sd))
+			goto skip;
+
+		return 0;
 
 	case SECCOMP_RET_LOG:
 		seccomp_log(this_syscall, 0, action, true);
@@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
 	if (copy_from_user(&resp, buf, sizeof(resp)))
 		return -EFAULT;
 
-	if (resp.flags)
+	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+		return -EINVAL;
+
+	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
+	    (resp.error || resp.val))
 		return -EINVAL;
 
 	ret = mutex_lock_interruptible(&filter->notify_lock);
@@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
 	knotif->state = SECCOMP_NOTIFY_REPLIED;
 	knotif->error = resp.error;
 	knotif->val = resp.val;
+	knotif->flags = resp.flags;
 	complete(&knotif->ready);
 out:
 	mutex_unlock(&filter->notify_lock);
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index aeb0fc37a654..6944b898bb53 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include <limits.h>
 #include
 #include
 #include
@@ -43,6 +44,7 @@
 #include
 #include
 #include
+#include <linux/kcmp.h>
 
 #include
 #include
@@ -206,6 +208,10 @@ struct seccomp_notif_sizes {
 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
 #endif
 
+#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
+#endif
+
 #ifndef seccomp
 int seccomp(unsigned int op, unsigned int flags, void *args)
 {
@@ -3083,7 +3089,7 @@ static int user_trap_syscall(int nr, unsigned int flags)
 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
 }
 
-#define USER_NOTIF_MAGIC 116983961184613L
+#define USER_NOTIF_MAGIC INT_MAX
 TEST(user_notification_basic)
 {
 	pid_t pid;
@@ -3491,6 +3497,108 @@ TEST(seccomp_get_notif_sizes)
 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
 }
 
+static int filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
+{
+#ifdef __NR_kcmp
+	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+TEST(user_notification_continue)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	struct pollfd pollfd;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int dup_fd, pipe_fds[2];
+		pid_t self;
+
+		ret = pipe(pipe_fds);
+		if (ret < 0)
+			exit(1);
+
+		dup_fd = dup(pipe_fds[0]);
+		if (dup_fd < 0)
+			exit(1);
+
+		self = getpid();
+
+		ret = filecmp(self, self, pipe_fds[0], dup_fd);
+		if (ret)
+			exit(2);
+
+		exit(0);
+	}
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	EXPECT_GT(poll(&pollfd, 1, -1), 0);
+	EXPECT_EQ(pollfd.revents, POLLIN);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	EXPECT_GT(poll(&pollfd, 1, -1), 0);
+	EXPECT_EQ(pollfd.revents, POLLOUT);
+
+	EXPECT_EQ(req.data.nr, __NR_dup);
+
+	resp.id = req.id;
+	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+
+	/*
+	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
+	 * args be set to 0.
+	 */
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	resp.error = USER_NOTIF_MAGIC;
+	resp.val = 0;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	resp.error = 0;
+	resp.val = 0;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
+		if (errno == EINVAL)
+			XFAIL(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
+	}
+
+skip:
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status)) {
+		if (WEXITSTATUS(status) == 2) {
+			XFAIL(return, "Kernel does not support kcmp() syscall");
+			return;
+		}
+	}
+}
+
 /*
  * TODO:
  * - add microbenchmarks
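
Not part of the patch: for orientation, a userspace supervisor replies to a notification roughly like the sketch below. It assumes a listener fd obtained from a filter installed with SECCOMP_FILTER_FLAG_NEW_LISTENER (as user_trap_syscall() does in the selftest above) and uapi headers that already carry this patch; the helper name continue_one_syscall() is made up for the example.

#include <errno.h>
#include <linux/seccomp.h>
#include <sys/ioctl.h>

/*
 * Illustrative helper (hypothetical name): receive one notification from the
 * listener fd and ask the kernel to run the trapped syscall natively instead
 * of injecting a return value.
 */
static int continue_one_syscall(int listener)
{
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
		return -errno;

	resp.id = req.id;
	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
	/* With the flag set, the kernel rejects the reply with EINVAL unless both stay zero. */
	resp.error = 0;
	resp.val = 0;

	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp) < 0)
		return -errno;

	return 0;
}

This mirrors what the new selftest exercises: unknown flag bits, or SECCOMP_USER_NOTIF_FLAG_CONTINUE combined with a non-zero error or val, make SECCOMP_IOCTL_NOTIF_SEND fail with EINVAL.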