cosmopolitan/libc/calls/pledge.c
Justine Tunney e98514cdb7 Plug a hole in pledge()
Günther Noack points out that O_RDONLY|O_TRUNC will modify a file.
2022-07-24 23:41:59 -07:00

1580 lines
59 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/bits/likely.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/struct/bpf.h"
#include "libc/calls/struct/filter.h"
#include "libc/calls/struct/seccomp.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/intrin/promises.internal.h"
#include "libc/limits.h"
#include "libc/macros.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/audit.h"
#include "libc/sysv/consts/nrlinux.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/pr.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/errfuns.h"
#define SPECIAL 0xf000
#define ADDRLESS 0x2000
#define INET 0x8000
#define LOCK 0x4000
#define NOEXEC 0x8000
#define EXEC 0x4000
#define READONLY 0x8000
#define WRITEONLY 0x4000
#define CREATONLY 0x2000
#define STDIO 0x8000
#define THREAD 0x8000
#define TTY 0x8000
#define UNIX 0x4000
#define NOBITS 0x8000
#define NOSIGSYS 0x8000
#define RESTRICT 0x1000
// TODO(jart): fix chibicc
#ifdef __chibicc__
#define OFF(f) -1
#else
#define OFF(f) offsetof(struct seccomp_data, f)
#endif
#define PLEDGE(pledge) pledge, ARRAYLEN(pledge)
struct Filter {
size_t n;
struct sock_filter p[700];
};
static const uint16_t kPledgeLinuxDefault[] = {
__NR_linux_exit, // thread return / exit()
};
// the stdio contains all the benign system calls. openbsd makes the
// assumption that preexisting file descriptors are trustworthy. we
// implement checking for these as a simple linear scan rather than
// binary search, since there doesn't appear to be any measurable
// difference in the latency of sched_yield() if it's at the start of
// the bpf script or the end.
static const uint16_t kPledgeLinuxStdio[] = {
__NR_linux_exit_group, //
__NR_linux_sched_yield, //
__NR_linux_sched_getaffinity, //
__NR_linux_clock_getres, //
__NR_linux_clock_gettime, //
__NR_linux_clock_nanosleep, //
__NR_linux_close_range, //
__NR_linux_close, //
__NR_linux_write, //
__NR_linux_writev, //
__NR_linux_pwrite, //
__NR_linux_pwritev, //
__NR_linux_pwritev2, //
__NR_linux_read, //
__NR_linux_readv, //
__NR_linux_pread, //
__NR_linux_preadv, //
__NR_linux_preadv2, //
__NR_linux_dup, //
__NR_linux_dup2, //
__NR_linux_dup3, //
__NR_linux_fchdir, //
__NR_linux_fcntl | STDIO, //
__NR_linux_fstat, //
__NR_linux_fsync, //
__NR_linux_sysinfo, //
__NR_linux_fdatasync, //
__NR_linux_ftruncate, //
__NR_linux_getdents, //
__NR_linux_getrandom, //
__NR_linux_getgroups, //
__NR_linux_getpgid, //
__NR_linux_getpgrp, //
__NR_linux_getpid, //
__NR_linux_gettid, //
__NR_linux_getuid, //
__NR_linux_getgid, //
__NR_linux_getsid, //
__NR_linux_getppid, //
__NR_linux_geteuid, //
__NR_linux_getegid, //
__NR_linux_getrlimit, //
__NR_linux_getresgid, //
__NR_linux_getresuid, //
__NR_linux_getitimer, //
__NR_linux_setitimer, //
__NR_linux_timerfd_create, //
__NR_linux_timerfd_settime, //
__NR_linux_timerfd_gettime, //
__NR_linux_copy_file_range, //
__NR_linux_gettimeofday, //
__NR_linux_sendfile, //
__NR_linux_vmsplice, //
__NR_linux_splice, //
__NR_linux_lseek, //
__NR_linux_tee, //
__NR_linux_brk, //
__NR_linux_msync, //
__NR_linux_mmap | NOEXEC, //
__NR_linux_mremap, //
__NR_linux_munmap, //
__NR_linux_mincore, //
__NR_linux_madvise, //
__NR_linux_fadvise, //
__NR_linux_mprotect | NOEXEC, //
__NR_linux_arch_prctl, //
__NR_linux_migrate_pages, //
__NR_linux_sync_file_range, //
__NR_linux_set_tid_address, //
__NR_linux_nanosleep, //
__NR_linux_pipe, //
__NR_linux_pipe2, //
__NR_linux_poll, //
__NR_linux_ppoll, //
__NR_linux_select, //
__NR_linux_pselect6, //
__NR_linux_epoll_create, //
__NR_linux_epoll_create1, //
__NR_linux_epoll_ctl, //
__NR_linux_epoll_wait, //
__NR_linux_epoll_pwait, //
__NR_linux_epoll_pwait2, //
__NR_linux_recvfrom, //
__NR_linux_sendto | ADDRLESS, //
__NR_linux_ioctl | RESTRICT, //
__NR_linux_alarm, //
__NR_linux_pause, //
__NR_linux_shutdown, //
__NR_linux_eventfd, //
__NR_linux_eventfd2, //
__NR_linux_signalfd, //
__NR_linux_signalfd4, //
__NR_linux_sigaction | NOSIGSYS, //
__NR_linux_sigaltstack, //
__NR_linux_sigprocmask, //
__NR_linux_sigsuspend, //
__NR_linux_sigreturn, //
__NR_linux_sigpending, //
__NR_linux_socketpair, //
__NR_linux_getrusage, //
__NR_linux_times, //
__NR_linux_umask, //
__NR_linux_wait4, //
__NR_linux_uname, //
__NR_linux_prctl | STDIO, //
__NR_linux_clone | THREAD, //
__NR_linux_futex, //
__NR_linux_set_robust_list, //
__NR_linux_get_robust_list, //
__NR_linux_prlimit | STDIO, //
};
static const uint16_t kPledgeLinuxFlock[] = {
__NR_linux_flock, //
__NR_linux_fcntl | LOCK, //
};
static const uint16_t kPledgeLinuxRpath[] = {
__NR_linux_chdir, //
__NR_linux_getcwd, //
__NR_linux_open | READONLY, //
__NR_linux_openat | READONLY, //
__NR_linux_stat, //
__NR_linux_lstat, //
__NR_linux_fstat, //
__NR_linux_fstatat, //
__NR_linux_access, //
__NR_linux_faccessat, //
__NR_linux_faccessat2, //
__NR_linux_readlink, //
__NR_linux_readlinkat, //
__NR_linux_statfs, //
__NR_linux_fstatfs, //
};
static const uint16_t kPledgeLinuxWpath[] = {
__NR_linux_getcwd, //
__NR_linux_open | WRITEONLY, //
__NR_linux_openat | WRITEONLY, //
__NR_linux_stat, //
__NR_linux_fstat, //
__NR_linux_lstat, //
__NR_linux_fstatat, //
__NR_linux_access, //
__NR_linux_faccessat, //
__NR_linux_faccessat2, //
__NR_linux_readlinkat, //
__NR_linux_chmod | NOBITS, //
__NR_linux_fchmod | NOBITS, //
__NR_linux_fchmodat | NOBITS, //
};
static const uint16_t kPledgeLinuxCpath[] = {
__NR_linux_open | CREATONLY, //
__NR_linux_openat | CREATONLY, //
__NR_linux_creat | RESTRICT, //
__NR_linux_rename, //
__NR_linux_renameat, //
__NR_linux_renameat2, //
__NR_linux_link, //
__NR_linux_linkat, //
__NR_linux_symlink, //
__NR_linux_symlinkat, //
__NR_linux_rmdir, //
__NR_linux_unlink, //
__NR_linux_unlinkat, //
__NR_linux_mkdir, //
__NR_linux_mkdirat, //
};
static const uint16_t kPledgeLinuxDpath[] = {
__NR_linux_mknod, //
__NR_linux_mknodat, //
};
static const uint16_t kPledgeLinuxFattr[] = {
__NR_linux_chmod | NOBITS, //
__NR_linux_fchmod | NOBITS, //
__NR_linux_fchmodat | NOBITS, //
__NR_linux_utime, //
__NR_linux_utimes, //
__NR_linux_futimesat, //
__NR_linux_utimensat, //
};
static const uint16_t kPledgeLinuxInet[] = {
__NR_linux_socket | INET, //
__NR_linux_listen, //
__NR_linux_bind, //
__NR_linux_sendto, //
__NR_linux_connect, //
__NR_linux_accept, //
__NR_linux_accept4, //
__NR_linux_getsockopt | RESTRICT, //
__NR_linux_setsockopt | RESTRICT, //
__NR_linux_getpeername, //
__NR_linux_getsockname, //
};
static const uint16_t kPledgeLinuxUnix[] = {
__NR_linux_socket | UNIX, //
__NR_linux_listen, //
__NR_linux_bind, //
__NR_linux_connect, //
__NR_linux_sendto, //
__NR_linux_accept, //
__NR_linux_accept4, //
__NR_linux_getsockopt | RESTRICT, //
__NR_linux_setsockopt | RESTRICT, //
__NR_linux_getpeername, //
__NR_linux_getsockname, //
};
static const uint16_t kPledgeLinuxDns[] = {
__NR_linux_socket | INET, //
__NR_linux_bind, //
__NR_linux_sendto, //
__NR_linux_connect, //
__NR_linux_recvfrom, //
__NR_linux_fstatat, //
__NR_linux_openat | READONLY, //
__NR_linux_read, //
__NR_linux_close, //
};
static const uint16_t kPledgeLinuxTty[] = {
__NR_linux_ioctl | TTY, //
};
static const uint16_t kPledgeLinuxRecvfd[] = {
__NR_linux_recvmsg, //
__NR_linux_recvmmsg, //
};
static const uint16_t kPledgeLinuxSendfd[] = {
__NR_linux_sendmsg, //
__NR_linux_sendmmsg, //
};
static const uint16_t kPledgeLinuxProc[] = {
__NR_linux_fork, //
__NR_linux_vfork, //
__NR_linux_clone | RESTRICT, //
__NR_linux_kill, //
__NR_linux_setsid, //
__NR_linux_setpgid, //
__NR_linux_prlimit, //
__NR_linux_setrlimit, //
__NR_linux_getpriority, //
__NR_linux_setpriority, //
__NR_linux_ioprio_get, //
__NR_linux_ioprio_set, //
__NR_linux_sched_getscheduler, //
__NR_linux_sched_setscheduler, //
__NR_linux_sched_get_priority_min, //
__NR_linux_sched_get_priority_max, //
__NR_linux_sched_getaffinity, //
__NR_linux_sched_setaffinity, //
__NR_linux_sched_getparam, //
__NR_linux_sched_setparam, //
__NR_linux_tgkill, //
};
static const uint16_t kPledgeLinuxId[] = {
__NR_linux_setuid, //
__NR_linux_setreuid, //
__NR_linux_setresuid, //
__NR_linux_setgid, //
__NR_linux_setregid, //
__NR_linux_setresgid, //
__NR_linux_setgroups, //
__NR_linux_prlimit, //
__NR_linux_setrlimit, //
__NR_linux_getpriority, //
__NR_linux_setpriority, //
__NR_linux_setfsuid, //
__NR_linux_setfsgid, //
};
static const uint16_t kPledgeLinuxSettime[] = {
__NR_linux_settimeofday, //
__NR_linux_clock_adjtime, //
};
static const uint16_t kPledgeLinuxProtExec[] = {
__NR_linux_mmap | EXEC, //
__NR_linux_mprotect, //
};
static const uint16_t kPledgeLinuxExec[] = {
__NR_linux_execve, //
__NR_linux_execveat, //
};
static const uint16_t kPledgeLinuxUnveil[] = {
__NR_linux_landlock_create_ruleset, //
__NR_linux_landlock_add_rule, //
__NR_linux_landlock_restrict_self, //
};
// placeholder group
//
// pledge.com checks this to do auto-unveiling
static const uint16_t kPledgeLinuxVminfo[] = {
__NR_linux_sched_yield, //
};
// placeholder group
//
// pledge.com uses this to auto-unveil /tmp and $TMPPATH with rwc
// permissions. pledge() alone (without unveil() too) offers very
// little security here. consider using them together.
static const uint16_t kPledgeLinuxTmppath[] = {
__NR_linux_lstat, //
__NR_linux_unlink, //
__NR_linux_unlinkat, //
};
static const struct Pledges {
const char *name;
const uint16_t *syscalls;
const size_t len;
} kPledgeLinux[] = {
[PROMISE_STDIO] = {"stdio", PLEDGE(kPledgeLinuxStdio)}, //
[PROMISE_RPATH] = {"rpath", PLEDGE(kPledgeLinuxRpath)}, //
[PROMISE_WPATH] = {"wpath", PLEDGE(kPledgeLinuxWpath)}, //
[PROMISE_CPATH] = {"cpath", PLEDGE(kPledgeLinuxCpath)}, //
[PROMISE_DPATH] = {"dpath", PLEDGE(kPledgeLinuxDpath)}, //
[PROMISE_FLOCK] = {"flock", PLEDGE(kPledgeLinuxFlock)}, //
[PROMISE_FATTR] = {"fattr", PLEDGE(kPledgeLinuxFattr)}, //
[PROMISE_INET] = {"inet", PLEDGE(kPledgeLinuxInet)}, //
[PROMISE_UNIX] = {"unix", PLEDGE(kPledgeLinuxUnix)}, //
[PROMISE_DNS] = {"dns", PLEDGE(kPledgeLinuxDns)}, //
[PROMISE_TTY] = {"tty", PLEDGE(kPledgeLinuxTty)}, //
[PROMISE_RECVFD] = {"recvfd", PLEDGE(kPledgeLinuxRecvfd)}, //
[PROMISE_SENDFD] = {"sendfd", PLEDGE(kPledgeLinuxSendfd)}, //
[PROMISE_PROC] = {"proc", PLEDGE(kPledgeLinuxProc)}, //
[PROMISE_EXEC] = {"exec", PLEDGE(kPledgeLinuxExec)}, //
[PROMISE_ID] = {"id", PLEDGE(kPledgeLinuxId)}, //
[PROMISE_UNVEIL] = {"unveil", PLEDGE(kPledgeLinuxUnveil)}, //
[PROMISE_SETTIME] = {"settime", PLEDGE(kPledgeLinuxSettime)}, //
[PROMISE_PROT_EXEC] = {"prot_exec", PLEDGE(kPledgeLinuxProtExec)}, //
[PROMISE_VMINFO] = {"vminfo", PLEDGE(kPledgeLinuxVminfo)}, //
[PROMISE_TMPPATH] = {"tmppath", PLEDGE(kPledgeLinuxTmppath)}, //
};
static const struct sock_filter kFilterStart[] = {
// make sure this isn't an i386 binary or something
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(arch)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
// each filter assumes ordinal is already loaded into accumulator
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
// Forbid some system calls with ENOSYS (rather than EPERM)
BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, __NR_linux_memfd_secret, 5, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_rseq, 4, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_memfd_create, 3, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_openat2, 2, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_clone3, 1, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_statx, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (38 & SECCOMP_RET_DATA)),
};
static const struct sock_filter kFilterEnd[] = {
// if syscall isn't whitelisted then have it return -EPERM (-1)
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (1 & SECCOMP_RET_DATA)),
};
static void AppendFilter(struct Filter *f, struct sock_filter *p, size_t n) {
if (UNLIKELY(f->n + n > ARRAYLEN(f->p))) {
asm("hlt"); // need to increase array size
unreachable;
}
memcpy(f->p + f->n, p, n * sizeof(*f->p));
f->n += n;
}
// SYSCALL is only allowed in the .privileged section
// We assume program image is loaded in 32-bit spaces
static void AppendOriginVerification(struct Filter *f) {
intptr_t x = (intptr_t)__privileged_start;
intptr_t y = (intptr_t)__privileged_end;
assert(0 < x && x < y && y < INT_MAX);
struct sock_filter fragment[] = {
/*L0*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(instruction_pointer) + 4),
/*L1*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 5 - 2),
/*L2*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(instruction_pointer)),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, x, 0, 5 - 4),
/*L4*/ BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, y, 0, 6 - 5),
/*L5*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
/*L6*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L7*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The first argument of sys_clone_linux() must NOT have:
//
// - CLONE_NEWNS (0x00020000)
// - CLONE_PTRACE (0x00002000)
// - CLONE_UNTRACED (0x00800000)
//
static void AllowCloneRestrict(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_clone, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0x00822000),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The first argument of sys_clone_linux() must have:
//
// - CLONE_VM (0x00000100)
// - CLONE_FS (0x00000200)
// - CLONE_FILES (0x00000400)
// - CLONE_THREAD (0x00010000)
// - CLONE_SIGHAND (0x00000800)
//
// The first argument of sys_clone_linux() must NOT have:
//
// - CLONE_NEWNS (0x00020000)
// - CLONE_PTRACE (0x00002000)
// - CLONE_UNTRACED (0x00800000)
//
static void AllowCloneThread(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_clone, 0, 9 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0x00010f00),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x00010f00, 0, 8 - 4),
/*L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/*L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0x00822000),
/*L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L7*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L8*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L9*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The second argument of ioctl() must be one of:
//
// - FIONREAD (0x541b)
// - FIONBIO (0x5421)
// - FIOCLEX (0x5451)
// - FIONCLEX (0x5450)
//
static void AllowIoctlStdio(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_ioctl, 0, 8 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x541b, 3, 0),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5421, 2, 0),
/*L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5451, 1, 0),
/*L5*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5450, 0, 1),
/*L6*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L7*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L8*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The second argument of ioctl() must be one of:
//
// - TCGETS (0x5401)
// - TCSETS (0x5402)
// - TCSETSW (0x5403)
// - TCSETSF (0x5404)
// - TIOCGWINSZ (0x5413)
// - TIOCSPGRP (0x5410)
// - TIOCGPGRP (0x540f)
// - TIOCSWINSZ (0x5414)
// - TCFLSH (0x540b)
// - TCXONC (0x540a)
// - TCSBRK (0x5409)
// - TIOCSBRK (0x5427)
//
static void AllowIoctlTty(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_ioctl, 0, 16 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5401, 11, 0),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5402, 10, 0),
/* L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5403, 9, 0),
/* L5*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5404, 8, 0),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5413, 7, 0),
/* L7*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5410, 6, 0),
/* L8*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x540f, 5, 0),
/* L9*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5414, 4, 0),
/*L10*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x540b, 3, 0),
/*L11*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x540a, 2, 0),
/*L12*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5409, 1, 0),
/*L13*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x5427, 0, 1),
/*L14*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L15*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L16*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The level argument of setsockopt() must be one of:
//
// - SOL_IP (0)
// - SOL_SOCKET (1)
// - SOL_TCP (6)
//
// The optname argument of setsockopt() must be one of:
//
// - TCP_NODELAY (0x01)
// - TCP_CORK (0x03)
// - TCP_KEEPIDLE (0x04)
// - TCP_KEEPINTVL (0x05)
// - SO_TYPE (0x03)
// - SO_ERROR (0x04)
// - SO_DONTROUTE (0x05)
// - SO_REUSEPORT (0x0f)
// - SO_REUSEADDR (0x02)
// - SO_KEEPALIVE (0x09)
// - SO_RCVTIMEO (0x14)
// - SO_SNDTIMEO (0x15)
// - IP_RECVTTL (0x0c)
// - IP_RECVERR (0x0b)
// - TCP_FASTOPEN (0x17)
// - TCP_FASTOPEN_CONNECT (0x1e)
//
static void AllowSetsockoptRestrict(struct Filter *f) {
static const int nr = __NR_linux_setsockopt;
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 21 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 2, 0),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1, 1, 0),
/* L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 6, 0, 20 - 5),
/* L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0f, 13, 0),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x03, 12, 0),
/* L7*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0c, 11, 0),
/* L8*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x13, 10, 0),
/* L9*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x02, 9, 0),
/*L10*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x09, 8, 0),
/*L11*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x14, 7, 0),
/*L12*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x01, 6, 0),
/*L13*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0b, 5, 0),
/*L14*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x04, 4, 0),
/*L15*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x05, 3, 0),
/*L16*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x17, 2, 0),
/*L17*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x1e, 1, 0),
/*L18*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x15, 0, 1),
/*L19*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L20*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L21*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The level argument of getsockopt() must be one of:
//
// - SOL_SOCKET (1)
// - SOL_TCP (6)
//
// The optname argument of getsockopt() must be one of:
//
// - SO_TYPE (0x03)
// - SO_REUSEPORT (0x0f)
// - SO_REUSEADDR (0x02)
// - SO_KEEPALIVE (0x09)
// - SO_RCVTIMEO (0x14)
// - SO_SNDTIMEO (0x15)
//
static void AllowGetsockoptRestrict(struct Filter *f) {
static const int nr = __NR_linux_getsockopt;
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 13 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1, 1, 0),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 6, 0, 12 - 4),
/* L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L5*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x03, 5, 0),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0f, 4, 0),
/* L7*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x02, 3, 0),
/* L8*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x09, 2, 0),
/* L9*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x14, 1, 0),
/*L10*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x15, 0, 1),
/*L11*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L12*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L13*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The flags parameter of mmap() must not have:
//
// - MAP_LOCKED (0x02000)
// - MAP_NONBLOCK (0x10000)
// - MAP_HUGETLB (0x40000)
//
static void AllowMmapExec(struct Filter *f) {
intptr_t y = (intptr_t)__privileged_end;
assert(0 < y && y < INT_MAX);
struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_mmap, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[3])), // flags
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0x52000),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 5 - 4),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The prot parameter of mmap() may only have:
//
// - PROT_NONE (0)
// - PROT_READ (1)
// - PROT_WRITE (2)
//
// The flags parameter must not have:
//
// - MAP_LOCKED (0x02000)
// - MAP_POPULATE (0x08000)
// - MAP_NONBLOCK (0x10000)
// - MAP_HUGETLB (0x40000)
//
static void AllowMmapNoexec(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_mmap, 0, 9 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])), // prot
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, ~(PROT_READ | PROT_WRITE)),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 8 - 4),
/*L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[3])), // flags
/*L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0x5a000),
/*L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L7*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L8*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L9*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The prot parameter of mprotect() may only have:
//
// - PROT_NONE (0)
// - PROT_READ (1)
// - PROT_WRITE (2)
//
static void AllowMprotectNoexec(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_mprotect, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])), // prot
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, ~(PROT_READ | PROT_WRITE)),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The open() system call is permitted only when
//
// - (flags & O_ACCMODE) == O_RDONLY
//
// The flags parameter of open() must not have:
//
// - O_CREAT (000000100)
// - O_TRUNC (000001000)
// - __O_TMPFILE (020000000)
//
static void AllowOpenReadonly(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_open, 0, 9 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, O_ACCMODE),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_RDONLY, 0, 8 - 4),
/*L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 020001100),
/*L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L7*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L8*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L9*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The open() system call is permitted only when
//
// - (flags & O_ACCMODE) == O_RDONLY
//
// The flags parameter of open() must not have:
//
// - O_CREAT (000000100)
// - O_TRUNC (000001000)
// - __O_TMPFILE (020000000)
//
static void AllowOpenatReadonly(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_openat, 0, 9 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, O_ACCMODE),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_RDONLY, 0, 8 - 4),
/*L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/*L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 020001100),
/*L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L7*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L8*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L9*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The open() system call is permitted only when
//
// - (flags & O_ACCMODE) == O_WRONLY
// - (flags & O_ACCMODE) == O_RDWR
//
// The open() flags parameter must not contain
//
// - O_CREAT (000000100)
// - __O_TMPFILE (020000000)
//
static void AllowOpenWriteonly(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_open, 0, 10 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, O_ACCMODE),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_WRONLY, 1, 0),
/* L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_RDWR, 0, 9 - 5),
/* L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L6*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 020000100),
/* L7*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/* L8*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/* L9*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L10*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The open() system call is permitted only when
//
// - (flags & O_ACCMODE) == O_WRONLY
// - (flags & O_ACCMODE) == O_RDWR
//
// The openat() flags parameter must not contain
//
// - O_CREAT (000000100)
// - __O_TMPFILE (020000000)
//
static void AllowOpenatWriteonly(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_openat, 0, 10 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, O_ACCMODE),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_WRONLY, 1, 0),
/* L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_RDWR, 0, 9 - 5),
/* L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L6*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 020000100),
/* L7*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/* L8*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/* L9*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L10*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// If the flags parameter of open() has one of:
//
// - O_CREAT (000000100)
// - __O_TMPFILE (020000000)
//
// Then the mode parameter must not have:
//
// - S_ISVTX (01000 sticky)
// - S_ISGID (02000 setgid)
// - S_ISUID (04000 setuid)
//
static void AllowOpenCreatonly(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_open, 0, 12 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 000000100),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 000000100, 7 - 4, 0),
/* L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 020200000),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 020200000, 0, 10 - 7),
/* L7*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L8*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 07000),
/* L9*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L10*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L11*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L12*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// If the flags parameter of openat() has one of:
//
// - O_CREAT (000000100)
// - __O_TMPFILE (020000000)
//
// Then the mode parameter must not have:
//
// - S_ISVTX (01000 sticky)
// - S_ISGID (02000 setgid)
// - S_ISUID (04000 setuid)
//
static void AllowOpenatCreatonly(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_openat, 0, 12 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 000000100),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 000000100, 7 - 4, 0),
/* L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 020200000),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 020200000, 0, 10 - 7),
/* L7*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[3])),
/* L8*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 07000),
/* L9*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L10*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L11*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L12*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// Then the mode parameter must not have:
//
// - S_ISVTX (01000 sticky)
// - S_ISGID (02000 setgid)
// - S_ISUID (04000 setuid)
//
static void AllowCreatRestrict(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_creat, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 07000),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The second argument of fcntl() must be one of:
//
// - F_DUPFD (0)
// - F_DUPFD_CLOEXEC (1030)
// - F_GETFD (1)
// - F_SETFD (2)
// - F_GETFL (3)
// - F_SETFL (4)
//
static void AllowFcntlStdio(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_fcntl, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1030, 4 - 3, 0),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, 5, 5 - 4, 0),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The second argument of fcntl() must be one of:
//
// - F_GETLK (5)
// - F_SETLK (6)
// - F_SETLKW (7)
//
static void AllowFcntlLock(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_fcntl, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, 5, 0, 5 - 3),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, 8, 5 - 4, 0),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The addr parameter of sendto() must be
//
// - NULL
//
static void AllowSendtoAddrless(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_sendto, 0, 7 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[4]) + 0),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 6 - 3),
/*L3*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[4]) + 4),
/*L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 6 - 5),
/*L5*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L6*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L7*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The sig parameter of sigaction() must NOT be
//
// - SIGSYS (31)
//
static void AllowSigactionNosigsys(struct Filter *f) {
static const int nr = __NR_linux_sigaction;
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 5 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 31, 1, 0),
/*L3*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L5*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The family parameter of socket() must be one of:
//
// - AF_INET (0x02)
// - AF_INET6 (0x0a)
//
// The type parameter of socket() will ignore:
//
// - SOCK_CLOEXEC (0x80000)
// - SOCK_NONBLOCK (0x00800)
//
// The type parameter of socket() must be one of:
//
// - SOCK_STREAM (0x01)
// - SOCK_DGRAM (0x02)
//
// The protocol parameter of socket() must be one of:
//
// - 0
// - IPPROTO_ICMP (0x01)
// - IPPROTO_TCP (0x06)
// - IPPROTO_UDP (0x11)
//
static void AllowSocketInet(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_socket, 0, 15 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/* L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x02, 1, 0),
/* L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0a, 0, 14 - 4),
/* L4*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, ~0x80800),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x01, 1, 0),
/* L7*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x02, 0, 14 - 8),
/* L8*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L9*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x00, 3, 0),
/*L10*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x01, 2, 0),
/*L11*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x06, 1, 0),
/*L12*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x11, 0, 1),
/*L13*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L14*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L15*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The family parameter of socket() must be one of:
//
// - AF_UNIX (1)
// - AF_LOCAL (1)
//
// The type parameter of socket() will ignore:
//
// - SOCK_CLOEXEC (0x80000)
// - SOCK_NONBLOCK (0x00800)
//
// The type parameter of socket() must be one of:
//
// - SOCK_STREAM (1)
// - SOCK_DGRAM (2)
//
// The protocol parameter of socket() must be one of:
//
// - 0
//
static void AllowSocketUnix(struct Filter *f) {
static const struct sock_filter fragment[] = {
/* L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_socket, 0, 11 - 1),
/* L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/* L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1, 0, 10 - 3),
/* L3*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/* L5*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, ~0x80800),
/* L5*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1, 1, 0),
/* L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 2, 0, 10 - 7),
/* L7*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/* L8*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/* L9*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L10*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L11*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The first parameter of prctl() can be any of
//
// - PR_SET_NAME (15)
// - PR_GET_NAME (16)
// - PR_GET_SECCOMP (21)
// - PR_SET_SECCOMP (22)
// - PR_SET_NO_NEW_PRIVS (38)
// - PR_CAPBSET_READ (23)
//
static void AllowPrctlStdio(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_prctl, 0, 10 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[0])),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 15, 5, 0),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 16, 4, 0),
/*L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 21, 3, 0),
/*L5*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 22, 2, 0),
/*L5*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 23, 1, 0),
/*L6*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 38, 0, 1),
/*L7*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L8*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L9*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The mode parameter of chmod() can't have the following:
//
// - S_ISVTX (01000 sticky)
// - S_ISGID (02000 setgid)
// - S_ISUID (04000 setuid)
//
static void AllowChmodNobits(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_chmod, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 07000),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The mode parameter of fchmod() can't have the following:
//
// - S_ISVTX (01000 sticky)
// - S_ISGID (02000 setgid)
// - S_ISUID (04000 setuid)
//
static void AllowFchmodNobits(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_fchmod, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[1])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 07000),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The mode parameter of fchmodat() can't have the following:
//
// - S_ISVTX (01000 sticky)
// - S_ISGID (02000 setgid)
// - S_ISUID (04000 setuid)
//
static void AllowFchmodatNobits(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_fchmodat, 0, 6 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/*L2*/ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 07000),
/*L3*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L4*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L5*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L6*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
// The new_limit parameter of prlimit() must be
//
// - NULL (0)
//
static void AllowPrlimitStdio(struct Filter *f) {
static const struct sock_filter fragment[] = {
/*L0*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_prlimit, 0, 7 - 1),
/*L1*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2])),
/*L2*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 6 - 3),
/*L3*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(args[2]) + 4),
/*L4*/ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
/*L5*/ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/*L6*/ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
/*L7*/ /* next filter */
};
AppendFilter(f, PLEDGE(fragment));
}
static int CountUnspecial(const uint16_t *p, size_t len) {
int i, count;
for (count = i = 0; i < len; ++i) {
if (!(p[i] & SPECIAL)) {
++count;
}
}
return count;
}
static void AppendPledge(struct Filter *f, const uint16_t *p, size_t len) {
int i, j, count;
// handle ordinals which allow syscalls regardless of args
// we put in extra effort here to reduce num of bpf instrs
if ((count = CountUnspecial(p, len))) {
if (count < 256) {
for (j = i = 0; i < len; ++i) {
if (p[i] & SPECIAL) continue;
// jump to ALLOW rule below if accumulator equals ordinal
struct sock_filter fragment[] = {
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, // instruction
p[i], // operand
count - j - 1, // jump if true displacement
j == count - 1), // jump if false displacement
};
AppendFilter(f, PLEDGE(fragment));
++j;
}
struct sock_filter fragment[] = {
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
};
AppendFilter(f, PLEDGE(fragment));
} else {
asm("hlt"); // list of ordinals exceeds max displacement
unreachable;
}
}
// handle "special" ordinals which use hand-crafted bpf
for (i = 0; i < len; ++i) {
if (!(p[i] & SPECIAL)) continue;
switch (p[i]) {
case __NR_linux_mmap | EXEC:
AllowMmapExec(f);
break;
case __NR_linux_mmap | NOEXEC:
AllowMmapNoexec(f);
break;
case __NR_linux_mprotect | NOEXEC:
AllowMprotectNoexec(f);
break;
case __NR_linux_chmod | NOBITS:
AllowChmodNobits(f);
break;
case __NR_linux_fchmod | NOBITS:
AllowFchmodNobits(f);
break;
case __NR_linux_fchmodat | NOBITS:
AllowFchmodatNobits(f);
break;
case __NR_linux_sigaction | NOSIGSYS:
AllowSigactionNosigsys(f);
break;
case __NR_linux_prctl | STDIO:
AllowPrctlStdio(f);
break;
case __NR_linux_open | CREATONLY:
AllowOpenCreatonly(f);
break;
case __NR_linux_openat | CREATONLY:
AllowOpenatCreatonly(f);
break;
case __NR_linux_open | READONLY:
AllowOpenReadonly(f);
break;
case __NR_linux_openat | READONLY:
AllowOpenatReadonly(f);
break;
case __NR_linux_open | WRITEONLY:
AllowOpenWriteonly(f);
break;
case __NR_linux_openat | WRITEONLY:
AllowOpenatWriteonly(f);
break;
case __NR_linux_setsockopt | RESTRICT:
AllowSetsockoptRestrict(f);
break;
case __NR_linux_getsockopt | RESTRICT:
AllowGetsockoptRestrict(f);
break;
case __NR_linux_creat | RESTRICT:
AllowCreatRestrict(f);
break;
case __NR_linux_fcntl | STDIO:
AllowFcntlStdio(f);
break;
case __NR_linux_fcntl | LOCK:
AllowFcntlLock(f);
break;
case __NR_linux_ioctl | RESTRICT:
AllowIoctlStdio(f);
break;
case __NR_linux_ioctl | TTY:
AllowIoctlTty(f);
break;
case __NR_linux_socket | INET:
AllowSocketInet(f);
break;
case __NR_linux_socket | UNIX:
AllowSocketUnix(f);
break;
case __NR_linux_sendto | ADDRLESS:
AllowSendtoAddrless(f);
break;
case __NR_linux_clone | RESTRICT:
AllowCloneRestrict(f);
break;
case __NR_linux_clone | THREAD:
AllowCloneThread(f);
break;
case __NR_linux_prlimit | STDIO:
AllowPrlimitStdio(f);
break;
default:
asm("hlt"); // switch forgot to define a special ordinal
unreachable;
}
}
}
int sys_pledge_linux(unsigned long ipromises) {
int i, rc = -1;
struct Filter f;
CheckLargeStackAllocation(&f, sizeof(f));
f.n = 0;
AppendFilter(&f, PLEDGE(kFilterStart));
if (!(~ipromises & (1ul << PROMISE_EXEC))) {
AppendOriginVerification(&f);
}
AppendPledge(&f, PLEDGE(kPledgeLinuxDefault));
for (i = 0; i < ARRAYLEN(kPledgeLinux); ++i) {
if (~ipromises & (1ul << i)) {
AppendPledge(&f, kPledgeLinux[i].syscalls, kPledgeLinux[i].len);
}
}
AppendFilter(&f, PLEDGE(kFilterEnd));
if ((rc = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) != -1) {
struct sock_fprog sandbox = {.len = f.n, .filter = f.p};
rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &sandbox);
}
return rc;
}
static int FindPromise(const char *name) {
int i;
for (i = 0; i < ARRAYLEN(kPledgeLinux); ++i) {
if (!strcasecmp(name, kPledgeLinux[i].name)) {
return i;
}
}
STRACE("unknown promise %s", name);
return -1;
}
int ParsePromises(const char *promises, unsigned long *out) {
int rc = 0;
int promise;
unsigned long ipromises;
char *tok, *state, *start, buf[256];
if (promises) {
ipromises = -1;
if (memccpy(buf, promises, 0, sizeof(buf))) {
start = buf;
while ((tok = strtok_r(start, " \t\r\n", &state))) {
if ((promise = FindPromise(tok)) != -1) {
ipromises &= ~(1ULL << promise);
} else {
rc = einval();
break;
}
start = 0;
}
} else {
rc = einval();
}
} else {
ipromises = 0;
}
if (!rc) {
*out = ipromises;
}
return rc;
}
/**
* Restricts system operations, e.g.
*
* pledge("stdio rfile tty", 0);
*
* Pledging causes most system calls to become unavailable. Your system
* call policy is enforced by the kernel, which means it can propagate
* across execve() if permitted. This system call is supported on
* OpenBSD and Linux where it's polyfilled using SECCOMP BPF. The way it
* works on Linux is verboten system calls will raise EPERM whereas
* OpenBSD just kills the process while logging a helpful message to
* /var/log/messages explaining which promise category you needed.
*
* Timing is everything with pledge. For example, if you're using
* threads, then you may want to enable them explicitly *before* calling
* pledge(), since otherwise you'd need "prot_exec":
*
* __enable_threads();
* pledge("...", 0);
*
* If you want crash reports, then you can avoid needing "rpath" with:
*
* ShowCrashReports();
* pledge("...", 0);
*
* By default exit() is allowed. This is useful for processes that
* perform pure computation and interface with the parent via shared
* memory. On Linux we mean sys_exit (_Exit1), not sys_exit_group
* (_Exit). The difference is effectively meaningless, since _Exit()
* will attempt both. All it means is that, if you're using threads,
* then a `pledge("", 0)` thread can't kill all your threads unless you
* `pledge("stdio", 0)`.
*
* Once pledge is in effect, the chmod functions (if allowed) will not
* permit the sticky/setuid/setgid bits to change. Linux will EPERM here
* and OpenBSD should ignore those three bits rather than crashing.
*
* User and group IDs can't be changed once pledge is in effect. OpenBSD
* should ignore chown without crashing; whereas Linux will just EPERM.
*
* Memory functions won't permit creating executable code after pledge.
* Restrictions on origin of SYSCALL instructions will become enforced
* on Linux (cf. msyscall) after pledge too, which means the process
* gets killed if SYSCALL is used outside the .privileged section. One
* exception is if the "exec" group is specified, in which case these
* restrictions need to be loosened.
*
* Using pledge is irreversible. On Linux it causes PR_SET_NO_NEW_PRIVS
* to be set on your process; however, if "id" or "recvfd" are allowed
* then then they theoretically could permit the gaining of some new
* privileges. You may call pledge() multiple times if "stdio" is
* allowed. In that case, the process can only move towards a more
* restrictive state.
*
* pledge() can't filter file system paths or internet addresses. For
* example, if you enable a category like "inet" then your process will
* be able to talk to any internet address. The same applies to
* categories like "wpath" and "cpath"; if enabled, any path the
* effective user id is permitted to change will be changeable.
*
* `promises` is a string that may include any of the following groups
* delimited by spaces.
*
* - "stdio" allows exit, close, dup, dup2, dup3, fchdir, fstat, fsync,
* fdatasync, ftruncate, getdents, getegid, getrandom, geteuid,
* getgid, getgroups, times, getrusage, getitimer, getpgid, getpgrp,
* getpid, getppid, getresgid, getresuid, getrlimit, getsid, wait4,
* gettimeofday, getuid, lseek, madvise, brk, arch_prctl, uname,
* set_tid_address, clock_getres, clock_gettime, clock_nanosleep,
* mremap, mmap, (PROT_EXEC and weird flags aren't allowed), mprotect
* (PROT_EXEC isn't allowed), msync, sync_file_range, migrate_pages,
* munmap, nanosleep, pipe, pipe2, read, readv, pread, recv, poll,
* recvfrom, preadv, write, writev, pwrite, pwritev, select, pselect6,
* copy_file_range, sendfile, tee, splice, vmsplice, alarm, pause,
* send, sendto (only if addr is null), setitimer, shutdown, sigaction
* (but SIGSYS is forbidden), sigaltstack, sigprocmask, sigreturn,
* sigsuspend, umask, mincore, socketpair, ioctl(FIONREAD),
* ioctl(FIONBIO), ioctl(FIOCLEX), ioctl(FIONCLEX), fcntl(F_GETFD),
* fcntl(F_SETFD), fcntl(F_GETFL), fcntl(F_SETFL), sched_yield,
* epoll_create, epoll_create1, epoll_ctl, epoll_wait, epoll_pwait,
* epoll_pwait2, clone(CLONE_THREAD), futex, set_robust_list,
* get_robust_list, sigpending.
*
* - "rpath" (read-only path ops) allows chdir, getcwd, open(O_RDONLY),
* openat(O_RDONLY), stat, fstat, lstat, fstatat, access, faccessat,
* faccessat2, readlink, readlinkat, statfs, fstatfs.
*
* - "wpath" (write path ops) allows getcwd, open(O_WRONLY),
* openat(O_WRONLY), stat, fstat, lstat, fstatat, access, faccessat,
* faccessat2, readlink, readlinkat, chmod, fchmod, fchmodat.
*
* - "cpath" (create path ops) allows open(O_CREAT), openat(O_CREAT),
* rename, renameat, renameat2, link, linkat, symlink, symlinkat,
* unlink, rmdir, unlinkat, mkdir, mkdirat.
*
* - "dpath" (create special path ops) allows mknod, mknodat, mkfifo.
*
* - "flock" allows flock, fcntl(F_GETLK), fcntl(F_SETLK),
* fcntl(F_SETLKW).
*
* - "tty" allows ioctl(TIOCGWINSZ), ioctl(TCGETS), ioctl(TCSETS),
* ioctl(TCSETSW), ioctl(TCSETSF).
*
* - "recvfd" allows recvmsg and recvmmsg.
*
* - "recvfd" allows sendmsg and sendmmsg.
*
* - "fattr" allows chmod, fchmod, fchmodat, utime, utimes, futimens,
* utimensat.
*
* - "inet" allows socket(AF_INET), listen, bind, connect, accept,
* accept4, getpeername, getsockname, setsockopt, getsockopt, sendto.
*
* - "unix" allows socket(AF_UNIX), listen, bind, connect, accept,
* accept4, getpeername, getsockname, setsockopt, getsockopt.
*
* - "dns" allows socket(AF_INET), sendto, recvfrom, connect.
*
* - "proc" allows fork, vfork, clone, kill, tgkill, getpriority,
* setpriority, prlimit, setrlimit, setpgid, setsid.
*
* - "id" allows setuid, setreuid, setresuid, setgid, setregid,
* setresgid, setgroups, prlimit, setrlimit, getpriority, setpriority,
* setfsuid, setfsgid.
*
* - "settime" allows settimeofday and clock_adjtime.
*
* - "exec" allows execve, execveat. If the executable in question needs
* a loader, then you'll need rpath and prot_exec too. However that's
* not needed if you assimilate your APE binary beforehand, because
* security is strongest for static binaries; use the --assimilate
* flag or o//tool/build/assimilate.com program.
*
* - "prot_exec" allows mmap(PROT_EXEC) and mprotect(PROT_EXEC). This is
* needed to (1) code morph mutexes in __enable_threads(), and it's
* needed to (2) launch non-static or non-native executables, e.g.
* non-assimilated APE binaries, or dynamic-linked executables.
*
* - "unveil" allows unveil() to be called, as well as the underlying
* landlock_create_ruleset, landlock_add_rule, landlock_restrict_self
* calls on Linux.
*
* - "vminfo" OpenBSD defines this for programs like `top`. On Linux,
* this is a placeholder group that lets tools like pledge.com check
* `__promises` and automatically unveil() a subset of files top would
* need, e.g. /proc/stat, /proc/meminfo.
*
* - "tmppath" allows unlink, unlinkat, and lstat. This is mostly a
* placeholder group for pledge.com, which reads the `__promises`
* global to determine if /tmp and $TMPPATH should be unveiled.
*
* `execpromises` only matters if "exec" is specified in `promises`. In
* that case, this specifies the promises that'll apply once execve()
* happens. If this is NULL then the default is used, which is
* unrestricted. OpenBSD allows child processes to escape the sandbox
* (so a pledged OpenSSH server process can do things like spawn a root
* shell). Linux however requires monotonically decreasing privileges.
* This function will will perform some validation on Linux to make sure
* that `execpromises` is a subset of `promises`. Your libc wrapper for
* execve() will then apply its SECCOMP BPF filter later. Since Linux
* has to do this before calling sys_execve(), the executed process will
* be weakened to have execute permissions too.
*
* @return 0 on success, or -1 w/ errno
* @raise ENOSYS if host os isn't Linux or OpenBSD
* @raise EINVAL if `execpromises` on Linux isn't a subset of `promises`
* @raise EINVAL if `promises` allows exec and `execpromises` is null
*/
int pledge(const char *promises, const char *execpromises) {
int rc;
unsigned long ipromises, iexecpromises;
if (!(rc = ParsePromises(promises, &ipromises)) &&
!(rc = ParsePromises(execpromises, &iexecpromises))) {
if (IsLinux()) {
// copy exec and execnative from promises to execpromises
iexecpromises = ~(~iexecpromises | (~ipromises & (1ul << PROMISE_EXEC)));
// if bits are missing in execpromises that exist in promises
// then execpromises wouldn't be a monotonic access reduction
// this check only matters when exec / execnative are allowed
if ((ipromises & ~iexecpromises) &&
(~ipromises & (1ul << PROMISE_EXEC))) {
STRACE("execpromises must be a subset of promises");
rc = einval();
} else {
rc = sys_pledge_linux(ipromises);
}
} else {
rc = sys_pledge(promises, execpromises);
}
if (!rc && (IsOpenbsd() || (IsLinux() && getpid() == gettid()))) {
__promises = ipromises;
__execpromises = iexecpromises;
}
}
STRACE("pledge(%#s, %#s) → %d% m", promises, execpromises, rc);
return rc;
}