cosmopolitan/libc/calls/pledge.c

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/calls/calls.h"
#include "libc/calls/pledge.internal.h"
#include "libc/calls/state.internal.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/promises.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/errfuns.h"

/**
 * Permits system operations, e.g.
 *
 *     if (pledge("stdio rfile tty", 0)) {
 *       perror("pledge");
 *       exit(1);
 *     }
 *
 * Pledging causes most system calls to become unavailable. Your system
 * call policy is enforced by the kernel (which means it can propagate
 * across execve() if permitted). Root access is not required. Support
 * is limited to Linux 2.6.23+ (c. RHEL6) and OpenBSD. If your kernel
 * isn't supported, then pledge() will return 0 and do nothing rather
 * than raising ENOSYS. We don't consider lack of system support to be
 * an error, because the specified operations will be permitted.
 *
 * The promises you give pledge() define which system calls are allowed.
 * Error messages are logged when sandbox violations occur that well you
 * which promise was needed, to stderr on Linux and /var/log/messages on
 * OpenBSD, and the unwatchable termination signal should be SIGABRT.
 *
 * Standard error logging can't happen on Linux if you use the `exec`
 * promise, since we polyfill logging in userspace which can't cross
 * execve() boundaries. However once you pledge away `exec` it works.
 * Another inconsistency that pledging `exec` causes, is your process
 * termination signal may become SIGSYS rather than SIGABRT.
 *
 * On Linux, our SECCOMP BPF polyfill offers more configurability in
 * terms of behavior. It's possible to choose different behaviors that
 * determine how sandbox violations are handled.
 *
 *     __pledge_mode = kPledgeModeKillThread;  // kill thread [default]
 *     __pledge_mode = kPledgeModeKillProcess; // kill all threads
 *     __pledge_mode = kPledgeModeErrno;       // just return EPERM
 *
 * Timing is everything with pledge. It's designed to be a voluntary
 * self-imposed security model. That works best when programs perform
 * permission-hungry operations (e.g. calling GetSymbolTable) towards
 * the beginning of execution, and then relinquish privilege afterwards
 * by calling pledge(). Here's an example of where that matters. Your
 * Cosmopolitan C Library needs to code morph your executable in memory
 * once you start using threads. But that's only possible to do if you
 * used the `prot_exec` promise. So the right thing to do here, is to
 * call __enable_threads() before calling pledge() to force it early.
 *
 *     __enable_threads();
 *     ShowCrashReports();
 *     pledge("...", 0);
 *
 * By default exit() is allowed. This is useful for processes that
 * perform pure computation and interface with the parent via shared
 * memory. On Linux we mean sys_exit (_Exit1), not sys_exit_group
 * (_Exit). The difference is effectively meaningless, since _Exit()
 * will attempt both. All it means is that, if you're using threads,
 * then a `pledge("", 0)` thread can't kill all your threads unless you
 * `pledge("stdio", 0)`.
 *
 * Once pledge is in effect, the chmod functions (if allowed) will not
 * permit the sticky/setuid/setgid bits to change. Linux will EPERM here
 * and OpenBSD should ignore those three bits rather than crashing.
 *
 * User and group IDs can't be changed once pledge is in effect. OpenBSD
 * should ignore chown without crashing; whereas Linux will just EPERM.
 *
 * Memory functions won't permit creating executable code after pledge.
 * Restrictions on origin of SYSCALL instructions will become enforced
 * on Linux (cf. msyscall) after pledge too, which means the process
 * gets killed if SYSCALL is used outside the .privileged section. One
 * exception is if the "exec" group is specified, in which case these
 * restrictions need to be loosened.
 *
 * Using pledge is irreversible. On Linux it causes PR_SET_NO_NEW_PRIVS
 * to be set on your process; however, if "id" or "recvfd" are allowed
 * then then they theoretically could permit the gaining of some new
 * privileges. You may call pledge() multiple times if "stdio" is
 * allowed. In that case, the process can only move towards a more
 * restrictive state.
 *
 * pledge() can't filter filesystem paths. See unveil() which lets you
 * do that. pledge() also can't do address firewalling. For example if
 * you use the `inet` promise then your process will be able to talk to
 * *every* internet address including public ones.
 *
 * `promises` is a string that may include any of the following groups
 * delimited by spaces.
 *
 * - "stdio" allows exit, close, dup, dup2, dup3, fchdir, fstat, fsync,
 *   fdatasync, ftruncate, getdents, getegid, getrandom, geteuid,
 *   getgid, getgroups, times, getrusage, getitimer, getpgid, getpgrp,
 *   getpid, getppid, getresgid, getresuid, getrlimit, getsid, wait4,
 *   gettimeofday, getuid, lseek, madvise, brk, arch_prctl, uname,
 *   set_tid_address, clock_getres, clock_gettime, clock_nanosleep,
 *   mremap, mmap, (PROT_EXEC and weird flags aren't allowed), mprotect
 *   (PROT_EXEC isn't allowed), msync, sync_file_range, migrate_pages,
 *   munmap, nanosleep, pipe, pipe2, read, readv, pread, recv, poll,
 *   recvfrom, preadv, write, writev, pwrite, pwritev, select, pselect6,
 *   copy_file_range, sendfile, tee, splice, vmsplice, alarm, pause,
 *   send, sendto (only if addr is null), setitimer, shutdown, sigaction
 *   (but SIGSYS is forbidden), sigaltstack, sigprocmask, sigreturn,
 *   sigsuspend, umask, mincore, socketpair, ioctl(FIONREAD),
 *   ioctl(FIONBIO), ioctl(FIOCLEX), ioctl(FIONCLEX), fcntl(F_GETFD),
 *   fcntl(F_SETFD), fcntl(F_GETFL), fcntl(F_SETFL), sched_yield,
 *   epoll_create, epoll_create1, epoll_ctl, epoll_wait, epoll_pwait,
 *   epoll_pwait2, clone(CLONE_THREAD), futex, set_robust_list,
 *   get_robust_list, sigpending.
 *
 * - "rpath" (read-only path ops) allows chdir, getcwd, open(O_RDONLY),
 *   openat(O_RDONLY), stat, fstat, lstat, fstatat, access, faccessat,
 *   faccessat2, readlink, readlinkat, statfs, fstatfs.
 *
 * - "wpath" (write path ops) allows getcwd, open(O_WRONLY),
 *   openat(O_WRONLY), stat, fstat, lstat, fstatat, access, faccessat,
 *   faccessat2, readlink, readlinkat, chmod, fchmod, fchmodat.
 *
 * - "cpath" (create path ops) allows open(O_CREAT), openat(O_CREAT),
 *   rename, renameat, renameat2, link, linkat, symlink, symlinkat,
 *   unlink, rmdir, unlinkat, mkdir, mkdirat.
 *
 * - "dpath" (create special path ops) allows mknod, mknodat, mkfifo.
 *
 * - "flock" allows flock, fcntl(F_GETLK), fcntl(F_SETLK),
 *   fcntl(F_SETLKW).
 *
 * - "tty" allows ioctl(TIOCGWINSZ), ioctl(TCGETS), ioctl(TCSETS),
 *   ioctl(TCSETSW), ioctl(TCSETSF).
 *
 * - "recvfd" allows recvmsg and recvmmsg.
 *
 * - "recvfd" allows sendmsg and sendmmsg.
 *
 * - "fattr" allows chmod, fchmod, fchmodat, utime, utimes, futimens,
 *   utimensat.
 *
 * - "inet" allows socket(AF_INET), listen, bind, connect, accept,
 *   accept4, getpeername, getsockname, setsockopt, getsockopt, sendto.
 *
 * - "unix" allows socket(AF_UNIX), listen, bind, connect, accept,
 *   accept4, getpeername, getsockname, setsockopt, getsockopt.
 *
 * - "dns" allows socket(AF_INET), sendto, recvfrom, connect.
 *
 * - "proc" allows fork, vfork, clone, kill, tgkill, getpriority,
 *   setpriority, prlimit, setrlimit, setpgid, setsid.
 *
 * - "id" allows setuid, setreuid, setresuid, setgid, setregid,
 *   setresgid, setgroups, prlimit, setrlimit, getpriority, setpriority,
 *   setfsuid, setfsgid.
 *
 * - "settime" allows settimeofday and clock_adjtime.
 *
 * - "exec" allows execve, execveat. On Linux, using this promise will
 *   cause (1) system call origin verification to be disabled; (2) error
 *   logging will be disabled; and (3) your termination signals might
 *   become SIGSYS instead of SIGABRT. Another thing to note is that
 *   `exec` alone might not be enough by itself to let your executable
 *   be executed. For dynamic, interpreted, and ape binaries, you'll
 *   usually want `rpath` and `prot_exec` too. With APE it's possible to
 *   work around this requirement, by "assimilating" your binaries
 *   beforehand. See the assimilate.com program and `--assimilate` flag
 *   which can be used to turn APE binaries into static native binaries.
 *
 * - "prot_exec" allows mmap(PROT_EXEC) and mprotect(PROT_EXEC). This is
 *   needed to (1) code morph mutexes in __enable_threads(), and it's
 *   needed to (2) launch non-static or non-native executables, e.g.
 *   non-assimilated APE binaries, or dynamic-linked executables.
 *
 * - "unveil" allows unveil() to be called, as well as the underlying
 *   landlock_create_ruleset, landlock_add_rule, landlock_restrict_self
 *   calls on Linux.
 *
 * - "vminfo" OpenBSD defines this for programs like `top`. On Linux,
 *   this is a placeholder group that lets tools like pledge.com check
 *   `__promises` and automatically unveil() a subset of files top would
 *   need, e.g. /proc/stat, /proc/meminfo.
 *
 * - "tmppath" allows unlink, unlinkat, and lstat. This is mostly a
 *   placeholder group for pledge.com, which reads the `__promises`
 *   global to determine if /tmp and $TMPPATH should be unveiled.
 *
 * `execpromises` only matters if "exec" is specified in `promises`. In
 * that case, this specifies the promises that'll apply once execve()
 * happens. If this is NULL then the default is used, which is
 * unrestricted. OpenBSD allows child processes to escape the sandbox
 * (so a pledged OpenSSH server process can do things like spawn a root
 * shell). Linux however requires monotonically decreasing privileges.
 * This function will will perform some validation on Linux to make sure
 * that `execpromises` is a subset of `promises`. Your libc wrapper for
 * execve() will then apply its SECCOMP BPF filter later. Since Linux
 * has to do this before calling sys_execve(), the executed process will
 * be weakened to have execute permissions too.
 *
 * @return 0 on success, or -1 w/ errno
 * @raise EINVAL if `execpromises` on Linux isn't a subset of `promises`
 * @raise EINVAL if `promises` allows exec and `execpromises` is null
 * @threadsafe
 * @vforksafe
 */
int pledge(const char *promises, const char *execpromises) {
  int e, rc;
  unsigned long ipromises, iexecpromises;
  if (!ParsePromises(promises, &ipromises) &&
      !ParsePromises(execpromises, &iexecpromises)) {
    if (IsLinux()) {
      // copy exec and execnative from promises to execpromises
      iexecpromises = ~(~iexecpromises | (~ipromises & (1ul << PROMISE_EXEC)));
      // if bits are missing in execpromises that exist in promises
      // then execpromises wouldn't be a monotonic access reduction
      // this check only matters when exec / execnative are allowed
      if ((ipromises & ~iexecpromises) &&
          (~ipromises & (1ul << PROMISE_EXEC))) {
        STRACE("execpromises must be a subset of promises");
        rc = einval();
      } else {
        rc = sys_pledge_linux(ipromises, __pledge_mode, true);
        if (rc > -4096u) errno = -rc, rc = -1;
      }
    } else {
      e = errno;
      rc = sys_pledge(promises, execpromises);
      if (rc && errno == ENOSYS) {
        errno = e;
        rc = 0;
      }
    }
    if (!rc && !__vforked &&
        (IsOpenbsd() || (IsLinux() && getpid() == gettid()))) {
      __promises = ipromises;
      __execpromises = iexecpromises;
    }
  } else {
    rc = einval();
  }
  STRACE("pledge(%#s, %#s) → %d% m", promises, execpromises, rc);
  return rc;
}