diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 6efb41cc8072..d61219889e49 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -259,6 +259,18 @@ and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a response, indicating what should be returned to userspace. The ``id`` member of ``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct seccomp_notif``. +Userspace can also add file descriptors to the notifying process via +``ioctl(SECCOMP_IOCTL_NOTIF_ADDFD)``. The ``id`` member of +``struct seccomp_notif_addfd`` should be the same ``id`` as in +``struct seccomp_notif``. The ``newfd_flags`` flag may be used to set flags +like O_EXEC on the file descriptor in the notifying process. If the supervisor +wants to inject the file descriptor with a specific number, the +``SECCOMP_ADDFD_FLAG_SETFD`` flag can be used, and set the ``newfd`` member to +the specific number to use. If that file descriptor is already open in the +notifying process it will be replaced. The supervisor can also add an FD, and +respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return +value will be the injected file descriptor number. + It is worth noting that ``struct seccomp_data`` contains the values of register arguments to the syscall, but does not contain pointers to memory. The task's memory is accessible to suitably privileged traces via ``ptrace()`` or diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 6ba18b82a02e..78074254ab98 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -115,6 +115,7 @@ struct seccomp_notif_resp { /* valid flags for seccomp_notif_addfd */ #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ +#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ /** * struct seccomp_notif_addfd diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 9f58049ac16d..057e17f3215d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -107,6 +107,7 @@ struct seccomp_knotif { * installing process should allocate the fd as normal. * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. + * @ioctl_flags: The flags used for the seccomp_addfd ioctl. * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd @@ -118,6 +119,7 @@ struct seccomp_kaddfd { struct file *file; int fd; unsigned int flags; + __u32 ioctl_flags; union { bool setfd; @@ -1065,18 +1067,37 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) { + int fd; + /* * Remove the notification, and reset the list pointers, indicating * that it has been handled. */ list_del_init(&addfd->list); if (!addfd->setfd) - addfd->ret = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, addfd->flags); else - addfd->ret = receive_fd_replace(addfd->fd, addfd->file, - addfd->flags); + fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + addfd->ret = fd; + + if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) { + /* If we fail reset and return an error to the notifier */ + if (fd < 0) { + n->state = SECCOMP_NOTIFY_SENT; + } else { + /* Return the FD we just added */ + n->flags = 0; + n->error = 0; + n->val = fd; + } + } + + /* + * Mark the notification as completed. From this point, addfd mem + * might be invalidated and we can't safely read it anymore. + */ complete(&addfd->completion); } @@ -1120,7 +1141,7 @@ static int seccomp_do_user_notification(int this_syscall, struct seccomp_kaddfd, list); /* Check if we were woken up by a addfd message */ if (addfd) - seccomp_handle_addfd(addfd); + seccomp_handle_addfd(addfd, &n); } while (n.state != SECCOMP_NOTIFY_REPLIED); @@ -1581,7 +1602,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, if (addfd.newfd_flags & ~O_CLOEXEC) return -EINVAL; - if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND)) return -EINVAL; if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) @@ -1591,6 +1612,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, if (!kaddfd.file) return -EBADF; + kaddfd.ioctl_flags = addfd.flags; kaddfd.flags = addfd.newfd_flags; kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; kaddfd.fd = addfd.newfd; @@ -1616,6 +1638,23 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, goto out_unlock; } + if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) { + /* + * Disallow queuing an atomic addfd + send reply while there are + * some addfd requests still to process. + * + * There is no clear reason to support it and allows us to keep + * the loop on the other side straight-forward. + */ + if (!list_empty(&knotif->addfd)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Allow exactly only one reply */ + knotif->state = SECCOMP_NOTIFY_REPLIED; + } + list_add(&kaddfd.list, &knotif->addfd); complete(&knotif->ready); mutex_unlock(&filter->notify_lock); diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index fcc806585266..6e5102a7d7c9 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -143,9 +143,15 @@ int main(int argc, char *argv[]) unsigned long long native, filter1, filter2, bitmap1, bitmap2; unsigned long long entry, per_filter1, per_filter2; + setbuf(stdout, NULL); + + printf("Running on:\n"); + system("uname -a"); + printf("Current BPF sysctl settings:\n"); - system("sysctl net.core.bpf_jit_enable"); - system("sysctl net.core.bpf_jit_harden"); + /* Avoid using "sysctl" which may not be installed. */ + system("grep -H . /proc/sys/net/core/bpf_jit_enable"); + system("grep -H . /proc/sys/net/core/bpf_jit_harden"); if (argc > 1) samples = strtoull(argv[1], NULL, 0); diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e3d5c77a8612..1d64891e6492 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -235,6 +235,10 @@ struct seccomp_notif_addfd { }; #endif +#ifndef SECCOMP_ADDFD_FLAG_SEND +#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ +#endif + struct seccomp_notif_addfd_small { __u64 id; char weird[4]; @@ -3959,7 +3963,7 @@ TEST(user_notification_addfd) { pid_t pid; long ret; - int status, listener, memfd, fd; + int status, listener, memfd, fd, nextfd; struct seccomp_notif_addfd addfd = {}; struct seccomp_notif_addfd_small small = {}; struct seccomp_notif_addfd_big big = {}; @@ -3968,25 +3972,34 @@ TEST(user_notification_addfd) /* 100 ms */ struct timespec delay = { .tv_nsec = 100000000 }; + /* There may be arbitrary already-open fds at test start. */ memfd = memfd_create("test", 0); ASSERT_GE(memfd, 0); + nextfd = memfd + 1; ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } + /* fd: 4 */ /* Check that the basic notification machinery works */ listener = user_notif_syscall(__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER); - ASSERT_GE(listener, 0); + ASSERT_EQ(listener, nextfd++); pid = fork(); ASSERT_GE(pid, 0); if (pid == 0) { + /* fds will be added and this value is expected */ if (syscall(__NR_getppid) != USER_NOTIF_MAGIC) exit(1); + + /* Atomic addfd+send is received here. Check it is a valid fd */ + if (fcntl(syscall(__NR_getppid), F_GETFD) == -1) + exit(1); + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); } @@ -4028,14 +4041,14 @@ TEST(user_notification_addfd) /* Verify we can set an arbitrary remote fd */ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); - EXPECT_GE(fd, 0); + EXPECT_EQ(fd, nextfd++); EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); /* Verify we can set an arbitrary remote fd with large size */ memset(&big, 0x0, sizeof(big)); big.addfd = addfd; fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big); - EXPECT_GE(fd, 0); + EXPECT_EQ(fd, nextfd++); /* Verify we can set a specific remote fd */ addfd.newfd = 42; @@ -4065,6 +4078,32 @@ TEST(user_notification_addfd) ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); ASSERT_EQ(addfd.id, req.id); + /* Verify we can do an atomic addfd and send */ + addfd.newfd = 0; + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + /* + * Child has earlier "low" fds and now 42, so we expect the next + * lowest available fd to be assigned here. + */ + EXPECT_EQ(fd, nextfd++); + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); + + /* + * This sets the ID of the ADD FD to the last request plus 1. The + * notification ID increments 1 per notification. + */ + addfd.id = req.id + 1; + + /* This spins until the underlying notification is generated */ + while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 && + errno != -EINPROGRESS) + nanosleep(&delay, NULL); + + memset(&req, 0, sizeof(req)); + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + ASSERT_EQ(addfd.id, req.id); + resp.id = req.id; resp.error = 0; resp.val = USER_NOTIF_MAGIC; @@ -4125,6 +4164,10 @@ TEST(user_notification_addfd_rlimit) EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); EXPECT_EQ(errno, EMFILE); + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EMFILE); + addfd.newfd = 100; addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);