mirror of
				https://github.com/jart/cosmopolitan.git
				synced 2025-10-25 02:30:57 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			278 lines
		
	
	
	
		
			14 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			278 lines
		
	
	
	
		
			14 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 | |
| │vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
 | |
| ╞══════════════════════════════════════════════════════════════════════════════╡
 | |
| │ Copyright 2022 Justine Alexandra Roberts Tunney                              │
 | |
| │                                                                              │
 | |
| │ Permission to use, copy, modify, and/or distribute this software for         │
 | |
| │ any purpose with or without fee is hereby granted, provided that the         │
 | |
| │ above copyright notice and this permission notice appear in all copies.      │
 | |
| │                                                                              │
 | |
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
 | |
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
 | |
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
 | |
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
 | |
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
 | |
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
 | |
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 | |
| │ PERFORMANCE OF THIS SOFTWARE.                                                │
 | |
| ╚─────────────────────────────────────────────────────────────────────────────*/
 | |
| #include "libc/calls/calls.h"
 | |
| #include "libc/calls/pledge.internal.h"
 | |
| #include "libc/calls/state.internal.h"
 | |
| #include "libc/calls/syscall-sysv.internal.h"
 | |
| #include "libc/dce.h"
 | |
| #include "libc/errno.h"
 | |
| #include "libc/intrin/promises.internal.h"
 | |
| #include "libc/intrin/strace.internal.h"
 | |
| #include "libc/nexgen32e/vendor.internal.h"
 | |
| #include "libc/runtime/runtime.h"
 | |
| #include "libc/sysv/errfuns.h"
 | |
| 
 | |
| /**
 | |
|  * Permits system operations, e.g.
 | |
|  *
 | |
|  *     __pledge_mode = PLEDGE_PENALTY_KILL_PROCESS | PLEDGE_STDERR_LOGGING;
 | |
|  *     if (pledge("stdio rfile tty", 0)) {
 | |
|  *       perror("pledge");
 | |
|  *       exit(1);
 | |
|  *     }
 | |
|  *
 | |
|  * Pledging causes most system calls to become unavailable. Your system
 | |
|  * call policy is enforced by the kernel (which means it can propagate
 | |
|  * across execve() if permitted). Root access is not required. Support
 | |
|  * is limited to Linux 2.6.23+ (c. RHEL6) and OpenBSD. If your kernel
 | |
|  * isn't supported, then pledge() will return 0 and do nothing rather
 | |
|  * than raising ENOSYS. We don't consider lack of system support to be
 | |
|  * an error, because the specified operations will be permitted.
 | |
|  *
 | |
|  * The promises you give pledge() define which system calls are allowed.
 | |
|  * Error messages are logged when sandbox violations occur, but how that
 | |
|  * happens depends on the `__pledge_mode` setting (see below).
 | |
|  *
 | |
|  * Timing is everything with pledge. It's designed to be a voluntary
 | |
|  * self-imposed security model. That works best when programs perform
 | |
|  * permission-hungry operations (e.g. calling GetSymbolTable) towards
 | |
|  * the beginning of execution, and then relinquish privilege afterwards
 | |
|  * by calling pledge(). Here's an example of where that matters. Your
 | |
|  * Cosmopolitan C Library needs to code morph your executable in memory
 | |
|  * once you start using threads. But that's only possible to do if you
 | |
|  * used the `prot_exec` promise. So the right thing to do here, is to
 | |
|  * call __enable_threads() before calling pledge() to force it early.
 | |
|  *
 | |
|  *     __enable_threads();
 | |
|  *     ShowCrashReports();
 | |
|  *     pledge("...", 0);
 | |
|  *
 | |
|  * By default exit() is allowed. This is useful for processes that
 | |
|  * perform pure computation and interface with the parent via shared
 | |
|  * memory. On Linux we mean sys_exit (_Exit1), not sys_exit_group
 | |
|  * (_Exit). The difference is effectively meaningless, since _Exit()
 | |
|  * will attempt both. All it means is that, if you're using threads,
 | |
|  * then a `pledge("", 0)` thread can't kill all your threads unless you
 | |
|  * `pledge("stdio", 0)`.
 | |
|  *
 | |
|  * Once pledge is in effect, the chmod functions (if allowed) will not
 | |
|  * permit the sticky/setuid/setgid bits to change. Linux will EPERM here
 | |
|  * and OpenBSD should ignore those three bits rather than crashing.
 | |
|  *
 | |
|  * User and group IDs can't be changed once pledge is in effect. OpenBSD
 | |
|  * should ignore chown without crashing; whereas Linux will just EPERM.
 | |
|  *
 | |
|  * Using pledge is irreversible. On Linux it causes PR_SET_NO_NEW_PRIVS
 | |
|  * to be set on your process; however, if "id" or "recvfd" are allowed
 | |
|  * then they theoretically could permit the gaining of some new
 | |
|  * privileges. You may call pledge() multiple times if "stdio" is
 | |
|  * allowed. In that case, the process can only move towards a more
 | |
|  * restrictive state.
 | |
|  *
 | |
|  * pledge() can't filter filesystem paths. See unveil() which lets you
 | |
|  * do that. pledge() also can't do address firewalling. For example if
 | |
|  * you use the `inet` promise then your process will be able to talk to
 | |
|  * *every* internet address including public ones.
 | |
|  *
 | |
|  * `promises` is a string that may include any of the following groups
 | |
|  * delimited by spaces.
 | |
|  *
 | |
|  * - "stdio" allows exit, close, dup, dup2, dup3, fchdir, fstat, fsync,
 | |
|  *   fdatasync, ftruncate, getdents, getegid, getrandom, geteuid,
 | |
|  *   getgid, getgroups, times, getrusage, getitimer, getpgid, getpgrp,
 | |
|  *   getpid, getppid, getresgid, getresuid, getrlimit, getsid, wait4,
 | |
|  *   gettimeofday, getuid, lseek, madvise, brk, arch_prctl, uname,
 | |
|  *   set_tid_address, clock_getres, clock_gettime, clock_nanosleep,
 | |
|  *   mremap, mmap (PROT_EXEC and weird flags aren't allowed), mprotect
 | |
|  *   (PROT_EXEC isn't allowed), msync, sync_file_range, migrate_pages,
 | |
|  *   munmap, nanosleep, pipe, pipe2, read, readv, pread, recv, poll,
 | |
|  *   recvfrom, preadv, write, writev, pwrite, pwritev, select, pselect6,
 | |
|  *   copy_file_range, sendfile, tee, splice, vmsplice, alarm, pause,
 | |
|  *   send, sendto (only if addr is null), setitimer, shutdown, sigaction
 | |
|  *   (but SIGSYS is forbidden), sigaltstack, sigprocmask, sigreturn,
 | |
|  *   sigsuspend, umask, mincore, socketpair, ioctl(FIONREAD),
 | |
|  *   ioctl(FIONBIO), ioctl(FIOCLEX), ioctl(FIONCLEX), fcntl(F_GETFD),
 | |
|  *   fcntl(F_SETFD), fcntl(F_GETFL), fcntl(F_SETFL), sched_yield,
 | |
|  *   epoll_create, epoll_create1, epoll_ctl, epoll_wait, epoll_pwait,
 | |
|  *   epoll_pwait2, clone(CLONE_THREAD), futex, set_robust_list,
 | |
|  *   get_robust_list, setaffinity, sigpending.
 | |
|  *
 | |
|  * - "rpath" (read-only path ops) allows chdir, getcwd, open(O_RDONLY),
 | |
|  *   openat(O_RDONLY), stat, fstat, lstat, fstatat, access, faccessat,
 | |
|  *   faccessat2, readlink, readlinkat, statfs, fstatfs.
 | |
|  *
 | |
|  * - "wpath" (write path ops) allows getcwd, open(O_WRONLY),
 | |
|  *   openat(O_WRONLY), stat, fstat, lstat, fstatat, access, faccessat,
 | |
|  *   faccessat2, readlink, readlinkat, chmod, fchmod, fchmodat.
 | |
|  *
 | |
|  * - "cpath" (create path ops) allows open(O_CREAT), openat(O_CREAT),
 | |
|  *   rename, renameat, renameat2, link, linkat, symlink, symlinkat,
 | |
|  *   unlink, rmdir, unlinkat, mkdir, mkdirat.
 | |
|  *
 | |
|  * - "dpath" (create special path ops) allows mknod, mknodat, mkfifo.
 | |
|  *
 | |
|  * - "flock" allows flock, fcntl(F_GETLK), fcntl(F_SETLK),
 | |
|  *   fcntl(F_SETLKW).
 | |
|  *
 | |
|  * - "tty" allows ioctl(TIOCGWINSZ), ioctl(TCGETS), ioctl(TCSETS),
 | |
|  *   ioctl(TCSETSW), ioctl(TCSETSF).
 | |
|  *
 | |
|  * - "recvfd" allows recvmsg and recvmmsg.
 | |
|  *
 | |
|  * - "sendfd" allows sendmsg and sendmmsg.
 | |
|  *
 | |
|  * - "fattr" allows chmod, fchmod, fchmodat, utime, utimes, futimens,
 | |
|  *   utimensat.
 | |
|  *
 | |
|  * - "inet" allows socket(AF_INET), listen, bind, connect, accept,
 | |
|  *   accept4, getpeername, getsockname, setsockopt, getsockopt, sendto.
 | |
|  *
 | |
|  * - "unix" allows socket(AF_UNIX), listen, bind, connect, accept,
 | |
|  *   accept4, getpeername, getsockname, setsockopt, getsockopt.
 | |
|  *
 | |
|  * - "dns" allows socket(AF_INET), sendto, recvfrom, connect.
 | |
|  *
 | |
|  * - "proc" allows fork, vfork, clone, kill, tgkill, getpriority,
 | |
|  *   setpriority, prlimit, setrlimit, setpgid, setsid.
 | |
|  *
 | |
|  * - "id" allows setuid, setreuid, setresuid, setgid, setregid,
 | |
|  *   setresgid, setgroups, prlimit, setrlimit, getpriority, setpriority,
 | |
|  *   setfsuid, setfsgid.
 | |
|  *
 | |
|  * - "settime" allows settimeofday and clock_adjtime.
 | |
|  *
 | |
|  * - "exec" allows execve, execveat. Note that `exec` alone might not be
 | |
|  *   enough by itself to let your executable be executed. For dynamic,
 | |
|  *   interpreted, and ape binaries, you'll usually want `rpath` and
 | |
|  *   `prot_exec` too. With APE it's possible to work around this
 | |
|  *   requirement, by "assimilating" your binaries beforehand. See the
 | |
|  *   assimilate.com program and `--assimilate` flag which can be used to
 | |
|  *   turn APE binaries into static native binaries.
 | |
|  *
 | |
|  * - "prot_exec" allows mmap(PROT_EXEC) and mprotect(PROT_EXEC). This is
 | |
|  *   needed to (1) code morph mutexes in __enable_threads(), and it's
 | |
|  *   needed to (2) launch non-static or non-native executables, e.g.
 | |
|  *   non-assimilated APE binaries, or dynamic-linked executables.
 | |
|  *
 | |
|  * - "unveil" allows unveil() to be called, as well as the underlying
 | |
|  *   landlock_create_ruleset, landlock_add_rule, landlock_restrict_self
 | |
|  *   calls on Linux.
 | |
|  *
 | |
|  * - "vminfo" OpenBSD defines this for programs like `top`. On Linux,
 | |
|  *   this is a placeholder group that lets tools like pledge.com check
 | |
|  *   `__promises` and automatically unveil() a subset of files top would
 | |
|  *   need, e.g. /proc/stat, /proc/meminfo.
 | |
|  *
 | |
|  * - "tmppath" allows unlink, unlinkat, and lstat. This is mostly a
 | |
|  *   placeholder group for pledge.com, which reads the `__promises`
 | |
|  *   global to determine if /tmp and $TMPPATH should be unveiled.
 | |
|  *
 | |
|  * `execpromises` only matters if "exec" is specified in `promises`. In
 | |
|  * that case, this specifies the promises that'll apply once execve()
 | |
|  * happens. If this is NULL then the default is used, which is
 | |
|  * unrestricted. OpenBSD allows child processes to escape the sandbox
 | |
|  * (so a pledged OpenSSH server process can do things like spawn a root
 | |
|  * shell). Linux however requires monotonically decreasing privileges.
 | |
|  * This function will perform some validation on Linux to make sure
 | |
|  * that `execpromises` is a subset of `promises`. Your libc wrapper for
 | |
|  * execve() will then apply its SECCOMP BPF filter later. Since Linux
 | |
|  * has to do this before calling sys_execve(), the executed process will
 | |
|  * be weakened to have execute permissions too.
 | |
|  *
 | |
|  * `__pledge_mode` is available to improve the experience of pledge() on
 | |
|  * Linux. It should specify one of the following penalties:
 | |
|  *
 | |
|  * - `PLEDGE_PENALTY_KILL_THREAD` causes the violating thread to be
 | |
|  *   killed. This is the default on Linux. It's effectively the same as
 | |
|  *   killing the process, since redbean has no threads. The termination
 | |
|  *   signal can't be caught and will be either `SIGSYS` or `SIGABRT`.
 | |
|  *   Consider enabling stderr logging below so you'll know why your
 | |
|  *   program failed. Otherwise check the system log.
 | |
|  *
 | |
|  * - `PLEDGE_PENALTY_KILL_PROCESS` causes the process and all its
 | |
|  *   threads to be killed. This is always the case on OpenBSD.
 | |
|  *
 | |
|  * - `PLEDGE_PENALTY_RETURN_EPERM` causes system calls to just return an
 | |
|  *   `EPERM` error instead of killing. This is a gentler solution that
 | |
|  *   allows code to display a friendly warning. Please note this may
 | |
|  *   lead to weird behaviors if the software being sandboxed is lazy
 | |
|  *   about checking error results.
 | |
|  *
 | |
|  * `__pledge_mode` may optionally bitwise-or in the following flags:
 | |
|  *
 | |
|  * - `PLEDGE_STDERR_LOGGING` enables friendly error message logging
 | |
|  *   letting you know which promises are needed whenever violations
 | |
|  *   occur. Without this, violations will be logged to `dmesg` on Linux
 | |
|  *   if the penalty is to kill the process. You would then need to
 | |
|  *   manually look up the system call number and then cross reference it
 | |
|  *   with the cosmopolitan libc pledge() documentation. You can also use
 | |
|  *   `strace -ff` which is easier. This is ignored on OpenBSD, which
 | |
|  *   already has a good system log. Turning on stderr logging (which
 | |
|  *   uses SECCOMP trapping) also means that the `WTERMSIG()` on your
 | |
|  *   killed processes will always be `SIGABRT` on both Linux and
 | |
|  *   OpenBSD. Otherwise, Linux prefers to raise `SIGSYS`. Enabling this
 | |
|  *   option might not be a good idea if you're pledging `exec` because
 | |
|  *   subprocesses can't inherit the `SIGSYS` handler this installs.
 | |
|  *
 | |
|  * @return 0 on success, or -1 w/ errno
 | |
|  * @raise EINVAL if `execpromises` on Linux isn't a subset of `promises`
 | |
|  * @raise EINVAL if `promises` allows exec and `execpromises` is null
 | |
|  * @threadsafe
 | |
|  * @vforksafe
 | |
|  */
 | |
| int pledge(const char *promises, const char *execpromises) {
 | |
|   int e, rc;
 | |
|   unsigned long ipromises, iexecpromises;
 | |
|   if (IsGenuineBlink()) {
 | |
|     rc = 0;  // blink doesn't support seccomp
 | |
|   } else if (!ParsePromises(promises, &ipromises) &&
 | |
|              !ParsePromises(execpromises, &iexecpromises)) {
 | |
|     if (IsLinux()) {
 | |
|       // copy exec and execnative from promises to execpromises
 | |
|       iexecpromises = ~(~iexecpromises | (~ipromises & (1ul << PROMISE_EXEC)));
 | |
|       // if bits are missing in execpromises that exist in promises
 | |
|       // then execpromises wouldn't be a monotonic access reduction
 | |
|       // this check only matters when exec / execnative are allowed
 | |
|       if ((ipromises & ~iexecpromises) &&
 | |
|           (~ipromises & (1ul << PROMISE_EXEC))) {
 | |
|         STRACE("execpromises must be a subset of promises");
 | |
|         rc = einval();
 | |
|       } else {
 | |
|         rc = sys_pledge_linux(ipromises, __pledge_mode);
 | |
|         if (rc > -4096u) errno = -rc, rc = -1;
 | |
|       }
 | |
|     } else {
 | |
|       e = errno;
 | |
|       rc = sys_pledge(promises, execpromises);
 | |
|       if (rc && errno == ENOSYS) {
 | |
|         errno = e;
 | |
|         rc = 0;
 | |
|       }
 | |
|     }
 | |
|     if (!rc && !__vforked &&
 | |
|         (IsOpenbsd() || (IsLinux() && getpid() == gettid()))) {
 | |
|       __promises = ipromises;
 | |
|       __execpromises = iexecpromises;
 | |
|     }
 | |
|   } else {
 | |
|     rc = einval();
 | |
|   }
 | |
|   STRACE("pledge(%#s, %#s) → %d% m", promises, execpromises, rc);
 | |
|   return rc;
 | |
| }
 |