diff --git a/arch.h b/arch.h new file mode 100644 index 0000000..dca7ede --- /dev/null +++ b/arch.h @@ -0,0 +1,21 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#ifndef ARCH_H_ +# define ARCH_H_ + +# include "config.h" + +# define ARCH_STR_(x) #x +# define ARCH_STR(x) ARCH_STR_(x) + +/* *INDENT-OFF* - formatters try to add spaces here */ +# define ARCH_HEADER_BASE arch/ARCH +/* *INDENT-ON* */ + +# include ARCH_STR(ARCH_HEADER_BASE/syscall.h) + +#endif /* !ARCH_H_ */ diff --git a/arch/x86/syscall.h b/arch/x86/syscall.h new file mode 100644 index 0000000..36bff83 --- /dev/null +++ b/arch/x86/syscall.h @@ -0,0 +1,26 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#include +#include + +/* The following is the x86-64-specific BPF boilerplate code for checking + that the BPF program is running on the right architecture + ABI. At + completion of these instructions, the accumulator contains the system + call number. */ + +/* For the x32 ABI, all system call numbers have bit 30 set */ + +#define X32_SYSCALL_BIT 0x40000000 + +#define CHECK_ARCH_AND_LOAD_SYSCALL_NR \ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \ + (offsetof(struct seccomp_data, arch))), \ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \ + (offsetof(struct seccomp_data, nr))), \ + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS) diff --git a/arch/x86_64 b/arch/x86_64 new file mode 120000 index 0000000..f4bad79 --- /dev/null +++ b/arch/x86_64 @@ -0,0 +1 @@ +x86 \ No newline at end of file diff --git a/capable.h b/capable.h index 92e62e5..7a3d598 100644 --- a/capable.h +++ b/capable.h @@ -20,6 +20,7 @@ # define BST_CAP_SETUID ((uint64_t) 1 << CAP_SETUID) # define BST_CAP_SETGID ((uint64_t) 1 << CAP_SETGID) # define BST_CAP_SYS_CHROOT ((uint64_t) 1 << CAP_SYS_CHROOT) +# define BST_CAP_MKNOD ((uint64_t) 1 << CAP_MKNOD) extern int deny_new_capabilities; diff --git a/config.h.in b/config.h.in index d6ff2bc..a4207ae 100644 --- a/config.h.in +++ b/config.h.in @@ -12,4 +12,10 @@ # define LIBEXECDIR "@libexecdir@" # define VERSION "@version@" +#mesondefine ARCH +#mesondefine ARCH_X86 +#mesondefine ARCH_X86_64 + +#mesondefine HAVE_SECCOMP_UNOTIFY + #endif /* !CONFIG_H_ */ diff --git a/enter.c b/enter.c index ac2a2f8..e3647fb 100644 --- a/enter.c +++ b/enter.c @@ -26,8 +26,10 @@ #include "bst_limits.h" #include "capable.h" +#include "config.h" #include "enter.h" #include "errutil.h" +#include "fd.h" #include "mount.h" #include "net.h" #include "ns.h" @@ -38,6 +40,10 @@ #include "sig.h" #include "util.h" +#ifdef HAVE_SECCOMP_UNOTIFY +# include "sec.h" +#endif + static inline size_t append_argv(char **argv, size_t argc, char *arg) { if (argc >= ARG_MAX) { @@ -411,6 +417,15 @@ int enter(struct entry_settings *opts) } outer_helper_sync(&outer_helper); + +#ifdef HAVE_SECCOMP_UNOTIFY + int seccomp_fd = sec_seccomp_install_filter(); + if (seccomp_fd != -1) { + send_fd(outer_helper.fd, seccomp_fd); + close(seccomp_fd); + } +#endif + outer_helper_close(&outer_helper); int rtnl = init_rtnetlink_socket(); diff --git a/meson.build b/meson.build index a97d8aa..d4d5b25 100644 --- a/meson.build +++ b/meson.build @@ -44,12 +44,20 @@ add_project_arguments( '-D_GNU_SOURCE', language: ['c']) +arch = host_machine.cpu_family() + config = configuration_data() config.set('package', meson.project_name()) config.set('bindir', bindir) config.set('libexecdir', libexecdir) config.set('version', version) +config.set('ARCH', arch) +config.set('ARCH_@0@'.format(arch.to_upper()), 1) + +has_seccomp_unotify = cc.has_header_symbol('linux/seccomp.h', 'SECCOMP_FILTER_FLAG_NEW_LISTENER') +config.set('HAVE_SECCOMP_UNOTIFY', has_seccomp_unotify) + configure_file(input: 'config.h.in', output: 'config.h', configuration: config) bst_init_sources = [ @@ -94,6 +102,13 @@ bst_sources = [ 'userns.c', ] +if has_seccomp_unotify + bst_sources += [ + 'proc.c', + 'sec.c', + ] +endif + executable('bst', bst_sources, install: true) if not get_option('no-setcap-or-suid') @@ -106,6 +121,7 @@ if not get_option('no-setcap-or-suid') 'cap_sys_admin', 'cap_sys_chroot', 'cap_sys_ptrace', + 'cap_mknod', ], 'bst-unpersist': [ 'cap_sys_admin', diff --git a/outer.c b/outer.c index a0a6f27..06c19dc 100644 --- a/outer.c +++ b/outer.c @@ -20,12 +20,18 @@ #include #include "capable.h" +#include "config.h" #include "enter.h" +#include "fd.h" #include "outer.h" #include "path.h" #include "userns.h" #include "util.h" +#ifdef HAVE_SECCOMP_UNOTIFY +# include "sec.h" +#endif + enum { /* This should be enough for defining our mappings. If we assign 340 mappings, and since each line would contain at most @@ -270,7 +276,13 @@ void outer_helper_spawn(struct outer_helper *helper) ssize_t count = write(fd, &ok, sizeof (ok)); assert((ssize_t)(sizeof (ok)) == count); +#ifdef HAVE_SECCOMP_UNOTIFY + int seccomp_fd = recv_fd(fd); + sec_seccomp_supervisor(seccomp_fd); + __builtin_unreachable(); +#else _exit(0); +#endif } void outer_helper_sendpid(const struct outer_helper *helper, pid_t pid) diff --git a/proc.c b/proc.c new file mode 100644 index 0000000..af7f922 --- /dev/null +++ b/proc.c @@ -0,0 +1,31 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#include +#include +#include + +#include "proc.h" + +int proc_read_status(int procfd, struct proc_status *out) +{ + memset(out, 0, sizeof (*out)); + + int statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC); + if (statusfd == -1) { + return -1; + } + + FILE *f = fdopen(statusfd, "r"); + + char line[4096]; + while (fgets(line, sizeof (line) - 1, f)) { + sscanf(line, "Umask:\t%o\n", &out->umask); + } + + fclose(f); + return 0; +} diff --git a/proc.h b/proc.h new file mode 100644 index 0000000..c204e6b --- /dev/null +++ b/proc.h @@ -0,0 +1,16 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#ifndef PROC_H_ +# define PROC_H_ + +struct proc_status { + mode_t umask; +}; + +int proc_read_status(int procfd, struct proc_status *out); + +#endif /* !PROC_H_ */ diff --git a/sec.c b/sec.c new file mode 100644 index 0000000..64ddb73 --- /dev/null +++ b/sec.c @@ -0,0 +1,371 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arch.h" +#include "capable.h" +#include "proc.h" +#include "sec.h" +#include "util.h" + +typedef int syscall_handler_func(int, int, struct seccomp_notif *, struct seccomp_notif_resp *); + +enum { + SYSCALL_HANDLED, + SYSCALL_CONTINUE, +}; + +static int pread_string(int memfd, char *buf, uintptr_t addr, size_t sz) +{ + ssize_t nread = pread(memfd, buf, sz, addr); + if (nread == -1) { + return -1; + } + + if (nread == 0 || strnlen(buf, nread) >= (size_t) nread) { + errno = EFAULT; + return -1; + } + + return 0; +} + +static int self_mnt_nsfd(void) { + + static int fd = -1; + + if (fd == -1) { + fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); + if (fd == -1) { + err(1, "open /proc/self/ns/mnt"); + } + } + + return fd; +} + +static int check_seccomp_cookie(int seccomp_fd, __u64 *id) +{ + return ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, id); +} + +static int sec__mknodat_impl(int seccomp_fd, int procfd, + struct seccomp_notif *req, + struct seccomp_notif_resp *resp, + int dirfd, + uintptr_t pathnameaddr, + mode_t mode, + dev_t dev) +{ + if ((mode & S_IFCHR) == 0 || (mode & S_IFBLK) == 0) { + /* Fallthrough for non-privileged operations -- the caller already + has the rights to do this themselves. */ + return SYSCALL_CONTINUE; + } + + /* Is this one of the safe devices? */ + + struct devtype { + mode_t type; + dev_t dev; + }; + + const struct devtype safe_devices[] = { + { .type = S_IFCHR, .dev = makedev(0, 0) }, // whiteout device + { .type = S_IFCHR, .dev = makedev(1, 3) }, // null device + { .type = S_IFCHR, .dev = makedev(1, 5) }, // zero device + { .type = S_IFCHR, .dev = makedev(1, 7) }, // full device + { .type = S_IFCHR, .dev = makedev(1, 8) }, // random device + { .type = S_IFCHR, .dev = makedev(1, 9) }, // urandom device + { .type = S_IFCHR, .dev = makedev(5, 0) }, // tty device + }; + + for (size_t i = 0; i < lengthof(safe_devices); i++) { + if ((mode & S_IFMT) == safe_devices[i].type && dev == safe_devices[i].dev) { + goto safe; + } + } + return SYSCALL_CONTINUE; + +safe: {} + /* The device is safe to mount -- perform shenanigans */ + + struct proc_status status; + if (proc_read_status(procfd, &status) == -1) { + warn("proc_read_status /proc/%d/status", req->pid); + return -EINVAL; + } + + int selfmnt = self_mnt_nsfd(); + int realdirfd = -1; + mode_t old_umask = -1; + int rc = 0; + + int memfd = openat(procfd, "mem", O_RDONLY | O_CLOEXEC); + if (memfd == -1) { + warn("open /proc/%d/mem", req->pid); + return -EINVAL; + } + + char pathname[PATH_MAX]; + if (pread_string(memfd, pathname, pathnameaddr, PATH_MAX) == -1) { + warn("pread pid %d %lx:%u", req->pid, pathnameaddr, PATH_MAX); + close(memfd); + return -EINVAL; + } + + int mntns = openat(procfd, "ns/mnt", O_RDONLY | O_CLOEXEC); + if (mntns == -1) { + warn("open /proc/%d/ns/mnt", req->pid); + rc = -EINVAL; + goto error; + } + + if (dirfd == AT_FDCWD) { + realdirfd = openat(procfd, "cwd", O_PATH | O_CLOEXEC); + } else { + char fdpath[PATH_MAX+1]; + if ((size_t) snprintf(fdpath, PATH_MAX, "fd/%d", dirfd) >= sizeof (fdpath)) { + warnx("fd/%d takes more than PATH_MAX bytes.", dirfd); + rc = -EINVAL; + goto error; + } + realdirfd = openat(procfd, fdpath, O_PATH | O_CLOEXEC); + } + if (realdirfd == -1) { + warn("open"); + rc = -EOPNOTSUPP; + goto error; + } + + /* Check again that the process is alive and blocked on the syscall. This + handles cases where the syscall got interrupted by a signal handler + and the program state changed before we read the pathname or other + information from proc. */ + + if (check_seccomp_cookie(seccomp_fd, &req->id) == -1) { + rc = -errno; + goto error; + } + + old_umask = umask(status.umask); + + make_capable(BST_CAP_SYS_ADMIN | BST_CAP_SYS_CHROOT | BST_CAP_MKNOD); + + if (setns(mntns, CLONE_NEWNS) == -1) { + warn("setns"); + rc = -EOPNOTSUPP; + goto error; + } + + if (mknodat(realdirfd, pathname, mode, dev) == -1) { + rc = -errno; + goto error; + } + +error: + if (setns(selfmnt, CLONE_NEWNS) == -1) { + err(1, "setns"); + } + + reset_capabilities(); + + if (old_umask != (mode_t) -1) { + umask(old_umask); + } + close(mntns); + close(memfd); + if (realdirfd != -1) { + close(realdirfd); + } + return rc; +} + +static int sec__mknod(int seccomp_fd, int procfd, + struct seccomp_notif *req, + struct seccomp_notif_resp *resp) +{ + uintptr_t pathnameaddr = req->data.args[0]; + mode_t mode = req->data.args[1]; + dev_t dev = req->data.args[2]; + + return sec__mknodat_impl(seccomp_fd, procfd, req, resp, AT_FDCWD, pathnameaddr, mode, dev); +} + +static int sec__mknodat(int seccomp_fd, int procfd, + struct seccomp_notif *req, + struct seccomp_notif_resp *resp) +{ + int dirfd = req->data.args[0]; + uintptr_t pathnameaddr = req->data.args[1]; + mode_t mode = req->data.args[2]; + dev_t dev = req->data.args[3]; + + return sec__mknodat_impl(seccomp_fd, procfd, req, resp, dirfd, pathnameaddr, mode, dev); +} + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + return syscall(__NR_seccomp, op, flags, args); +} + +int sec_seccomp_install_filter(void) +{ + struct sock_filter filter[] = { + CHECK_ARCH_AND_LOAD_SYSCALL_NR, + + /* The following syscalls triggers notification to user-space supervisor. */ + + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mknod, 0, 1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF), + + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mknodat, 0, 1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF), + + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = lengthof(filter), + .filter = filter, + }; + + int fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); + if (fd == -1) { + err(1, "seccomp SECCOMP_SET_MODE_FILTER"); + } + return fd; +} + +static void sec_seccomp_dispatch_syscall(int seccomp_fd, int procfd, + struct seccomp_notif *req, + struct seccomp_notif_resp *resp) +{ + static syscall_handler_func *const syscall_table[] = { + [__NR_mknod] = sec__mknod, + [__NR_mknodat] = sec__mknodat, + }; + + resp->id = req->id; + + if (req->data.nr <= 0 || (size_t) req->data.nr >= lengthof(syscall_table)) { + goto send; + } + syscall_handler_func *fn = syscall_table[(size_t) req->data.nr]; + if (!fn) { + goto send; + } + + int rc = fn(seccomp_fd, procfd, req, resp); + if (rc < 0) { + resp->error = rc; + } else if (rc == SYSCALL_CONTINUE) { + resp->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + } + +send: + if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_SEND, resp) == -1) { + if (errno == ENOENT) { + // This is survivable -- this usually means the syscall got + // interrupted by a signal. + return; + } + warn("ioctl SECCOMP_IOCTL_NOTIF_SEND"); + } +} + +noreturn void sec_seccomp_supervisor(int seccomp_fd) +{ + /* Run the seccomp supervisor. This supervisor is a privileged helper + that runs safe syscalls on behalf of the unprivileged child in a + user namespace. + + Use-cases include: + * Allowing mknod on devices deemed "safe", like /dev/null, or the + overlayfs whiteout file. + * Allow devtmpfs mount with our custom bst_devtmpfs logic. + + For now, this is intended to be a blocking loop -- if we need other + long-running agents down the line we might need to consider using + an epoll loop or forking these into other processes. */ + + struct seccomp_notif_sizes sizes; + + if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1) + err(1, "seccomp SECCOMP_GET_NOTIF_SIZES"); + + struct seccomp_notif *req = malloc(sizes.seccomp_notif); + if (req == NULL) + err(1, "malloc"); + + /* When allocating the response buffer, we must allow for the fact + that the user-space binary may have been built with user-space + headers where 'struct seccomp_notif_resp' is bigger than the + response buffer expected by the (older) kernel. Therefore, we + allocate a buffer that is the maximum of the two sizes. This + ensures that if the supervisor places bytes into the response + structure that are past the response size that the kernel expects, + then the supervisor is not touching an invalid memory location. */ + + size_t resp_size = sizes.seccomp_notif_resp; + if (sizeof (struct seccomp_notif_resp) > resp_size) + resp_size = sizeof (struct seccomp_notif_resp); + + struct seccomp_notif_resp *resp = malloc(resp_size); + if (resp == NULL) + err(1, "malloc"); + + for (;;) { + memset(req, 0, sizes.seccomp_notif); + memset(resp, 0, resp_size); + + if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_RECV, req) == -1) { + if (errno == EINTR) { + continue; + } + err(1, "ioctl SECCOMP_IOCTL_NOTIF_RECV"); + } + + char procpath[PATH_MAX+1]; + if ((size_t) snprintf(procpath, PATH_MAX, "/proc/%d", req->pid) >= sizeof (procpath)) { + errx(1, "/proc/%d takes more than PATH_MAX bytes.", req->pid); + } + + int procfd = open(procpath, O_RDONLY | O_CLOEXEC); + if (procfd == -1) { + err(1, "open"); + } + + /* Check that the target process is still alive and blocked on the + syscall that sent the notification. */ + + if (check_seccomp_cookie(seccomp_fd, &req->id) == -1) { + goto end; + } + + sec_seccomp_dispatch_syscall(seccomp_fd, procfd, req, resp); + + end: + close(procfd); + } +} + diff --git a/sec.h b/sec.h new file mode 100644 index 0000000..1da2ce3 --- /dev/null +++ b/sec.h @@ -0,0 +1,15 @@ +/* Copyright © 2022 Arista Networks, Inc. All rights reserved. + * + * Use of this source code is governed by the MIT license that can be found + * in the LICENSE file. + */ + +#ifndef SEC_H_ +# define SEC_H_ + +# include + +int sec_seccomp_install_filter(void); +noreturn void sec_seccomp_supervisor(int); + +#endif /* !SEC_H_ */