From 157dddcac84cdb5e8500bcfc90cced9544f84f90 Mon Sep 17 00:00:00 2001
From: "Franklin \"Snaipe\" Mathieu" <snaipe@arista.com>
Date: Thu, 19 May 2022 01:15:36 +0200
Subject: [PATCH] seccomp: add syscall emulation for safe syscalls, like mknod
 of /dev/null devices.

---
 arch.h             |  21 +++
 arch/x86/syscall.h |  26 ++++
 arch/x86_64        |   1 +
 capable.h          |   1 +
 config.h.in        |   6 +
 enter.c            |  15 ++
 meson.build        |  16 ++
 outer.c            |  12 ++
 proc.c             |  31 ++++
 proc.h             |  16 ++
 sec.c              | 371 +++++++++++++++++++++++++++++++++++++++++++++
 sec.h              |  15 ++
 12 files changed, 531 insertions(+)
 create mode 100644 arch.h
 create mode 100644 arch/x86/syscall.h
 create mode 120000 arch/x86_64
 create mode 100644 proc.c
 create mode 100644 proc.h
 create mode 100644 sec.c
 create mode 100644 sec.h

diff --git a/arch.h b/arch.h
new file mode 100644
index 0000000..dca7ede
--- /dev/null
+++ b/arch.h
@@ -0,0 +1,21 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#ifndef ARCH_H_
+# define ARCH_H_
+
+# include "config.h"
+
+# define ARCH_STR_(x) #x
+# define ARCH_STR(x) ARCH_STR_(x)
+
+/* *INDENT-OFF* - formatters try to add spaces here */
+# define ARCH_HEADER_BASE arch/ARCH
+/* *INDENT-ON* */
+
+# include ARCH_STR(ARCH_HEADER_BASE/syscall.h)
+
+#endif /* !ARCH_H_ */
diff --git a/arch/x86/syscall.h b/arch/x86/syscall.h
new file mode 100644
index 0000000..36bff83
--- /dev/null
+++ b/arch/x86/syscall.h
@@ -0,0 +1,26 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#include <stddef.h>
+#include <linux/audit.h>
+
+/* The following is the x86-64-specific BPF boilerplate code for checking
+   that the BPF program is running on the right architecture + ABI. At
+   completion of these instructions, the accumulator contains the system
+   call number. */
+
+/* For the x32 ABI, all system call numbers have bit 30 set */
+
+#define X32_SYSCALL_BIT         0x40000000
+
+#define CHECK_ARCH_AND_LOAD_SYSCALL_NR \
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \
+			(offsetof(struct seccomp_data, arch))), /* A = arch */ \
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), /* mismatch: skip 2, to the kill */ \
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \
+			(offsetof(struct seccomp_data, nr))), /* A = syscall number */ \
+	BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), /* below the x32 bit: skip the kill */ \
+	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS) /* foreign arch or x32 ABI */
diff --git a/arch/x86_64 b/arch/x86_64
new file mode 120000
index 0000000..f4bad79
--- /dev/null
+++ b/arch/x86_64
@@ -0,0 +1 @@
+x86
\ No newline at end of file
diff --git a/capable.h b/capable.h
index 92e62e5..7a3d598 100644
--- a/capable.h
+++ b/capable.h
@@ -20,6 +20,7 @@
 # define BST_CAP_SETUID         ((uint64_t) 1 << CAP_SETUID)
 # define BST_CAP_SETGID         ((uint64_t) 1 << CAP_SETGID)
 # define BST_CAP_SYS_CHROOT     ((uint64_t) 1 << CAP_SYS_CHROOT)
+# define BST_CAP_MKNOD          ((uint64_t) 1 << CAP_MKNOD)
 
 extern int deny_new_capabilities;
 
diff --git a/config.h.in b/config.h.in
index d6ff2bc..a4207ae 100644
--- a/config.h.in
+++ b/config.h.in
@@ -12,4 +12,10 @@
 # define LIBEXECDIR "@libexecdir@"
 # define VERSION "@version@"
 
+#mesondefine ARCH
+#mesondefine ARCH_X86
+#mesondefine ARCH_X86_64
+
+#mesondefine HAVE_SECCOMP_UNOTIFY
+
 #endif /* !CONFIG_H_ */
diff --git a/enter.c b/enter.c
index ac2a2f8..e3647fb 100644
--- a/enter.c
+++ b/enter.c
@@ -26,8 +26,10 @@
 
 #include "bst_limits.h"
 #include "capable.h"
+#include "config.h"
 #include "enter.h"
 #include "errutil.h"
+#include "fd.h"
 #include "mount.h"
 #include "net.h"
 #include "ns.h"
@@ -38,6 +40,10 @@
 #include "sig.h"
 #include "util.h"
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+# include "sec.h"
+#endif
+
 static inline size_t append_argv(char **argv, size_t argc, char *arg)
 {
 	if (argc >= ARG_MAX) {
@@ -411,6 +417,15 @@ int enter(struct entry_settings *opts)
 	}
 
 	outer_helper_sync(&outer_helper);
+
+#ifdef HAVE_SECCOMP_UNOTIFY
+		int seccomp_fd = sec_seccomp_install_filter();
+		if (seccomp_fd != -1) {
+			send_fd(outer_helper.fd, seccomp_fd);
+			close(seccomp_fd);
+		}
+#endif
+
 	outer_helper_close(&outer_helper);
 
 	int rtnl = init_rtnetlink_socket();
diff --git a/meson.build b/meson.build
index a97d8aa..d4d5b25 100644
--- a/meson.build
+++ b/meson.build
@@ -44,12 +44,20 @@ add_project_arguments(
 	'-D_GNU_SOURCE',
 	language: ['c'])
 
+arch = host_machine.cpu_family()
+
 config = configuration_data()
 config.set('package', meson.project_name())
 config.set('bindir', bindir)
 config.set('libexecdir', libexecdir)
 config.set('version', version)
 
+config.set('ARCH', arch)
+config.set('ARCH_@0@'.format(arch.to_upper()), 1)
+
+has_seccomp_unotify = cc.has_header_symbol('linux/seccomp.h', 'SECCOMP_FILTER_FLAG_NEW_LISTENER')
+config.set('HAVE_SECCOMP_UNOTIFY', has_seccomp_unotify)
+
 configure_file(input: 'config.h.in', output: 'config.h', configuration: config)
 
 bst_init_sources = [
@@ -94,6 +102,13 @@ bst_sources = [
 	'userns.c',
 ]
 
+if has_seccomp_unotify
+  bst_sources += [
+	'proc.c',
+	'sec.c',
+  ]
+endif
+
 executable('bst', bst_sources, install: true)
 
 if not get_option('no-setcap-or-suid')
@@ -106,6 +121,7 @@ if not get_option('no-setcap-or-suid')
 			'cap_sys_admin',
 			'cap_sys_chroot',
 			'cap_sys_ptrace',
+			'cap_mknod',
 		],
 		'bst-unpersist': [
 			'cap_sys_admin',
diff --git a/outer.c b/outer.c
index a0a6f27..06c19dc 100644
--- a/outer.c
+++ b/outer.c
@@ -20,12 +20,18 @@
 #include <unistd.h>
 
 #include "capable.h"
+#include "config.h"
 #include "enter.h"
+#include "fd.h"
 #include "outer.h"
 #include "path.h"
 #include "userns.h"
 #include "util.h"
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+# include "sec.h"
+#endif
+
 enum {
 	/* This should be enough for defining our mappings. If we assign
 	   340 mappings, and since each line would contain at most
@@ -270,7 +276,13 @@ void outer_helper_spawn(struct outer_helper *helper)
 	ssize_t count = write(fd, &ok, sizeof (ok));
 	assert((ssize_t)(sizeof (ok)) == count);
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+	int seccomp_fd = recv_fd(fd);
+	sec_seccomp_supervisor(seccomp_fd);
+	__builtin_unreachable();
+#else
 	_exit(0);
+#endif
 }
 
 void outer_helper_sendpid(const struct outer_helper *helper, pid_t pid)
diff --git a/proc.c b/proc.c
new file mode 100644
index 0000000..af7f922
--- /dev/null
+++ b/proc.c
@@ -0,0 +1,47 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+/* Fill *out from the "status" file of the process directory open at procfd.
+   *out is zeroed first, so a field whose line is missing reads as 0; only
+   the Umask line is parsed at the moment.
+   Returns 0 on success, or -1 with errno set if the file cannot be opened. */
+int proc_read_status(int procfd, struct proc_status *out)
+{
+	memset(out, 0, sizeof (*out));
+
+	int statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC);
+	if (statusfd == -1) {
+		return -1;
+	}
+
+	FILE *f = fdopen(statusfd, "r");
+	if (f == NULL) {
+		/* fdopen failed, so the descriptor is still ours to release. Keep
+		   fdopen's errno for the caller; close() may overwrite it. */
+		int saved = errno;
+		close(statusfd);
+		errno = saved;
+		return -1;
+	}
+
+	char line[4096];
+	while (fgets(line, sizeof (line), f)) {
+		/* Lines other than "Umask:" simply fail to match. */
+		sscanf(line, "Umask:\t%o\n", &out->umask);
+	}
+
+	fclose(f);
+	return 0;
+}
diff --git a/proc.h b/proc.h
new file mode 100644
index 0000000..c204e6b
--- /dev/null
+++ b/proc.h
@@ -0,0 +1,16 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#ifndef PROC_H_
+# define PROC_H_
+
+struct proc_status {
+	mode_t umask; /* value of the "Umask:" line, parsed as octal */
+};
+
+int proc_read_status(int procfd, struct proc_status *out); /* 0 on success, -1 if "status" cannot be opened */
+
+#endif /* !PROC_H_ */
diff --git a/sec.c b/sec.c
new file mode 100644
index 0000000..64ddb73
--- /dev/null
+++ b/sec.c
@@ -0,0 +1,371 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "arch.h"
+#include "capable.h"
+#include "proc.h"
+#include "sec.h"
+#include "util.h"
+
+typedef int syscall_handler_func(int, int, struct seccomp_notif *, struct seccomp_notif_resp *); /* called as (seccomp_fd, procfd, req, resp) */
+
+enum {
+	SYSCALL_HANDLED,  /* handler is done; the response is sent as filled in */
+	SYSCALL_CONTINUE, /* response is flagged so the kernel runs the syscall itself */
+};
+
+/* Read a NUL-terminated string of at most sz bytes from memfd at offset addr. */
+static int pread_string(int memfd, char *buf, uintptr_t addr, size_t sz)
+{
+	ssize_t got = pread(memfd, buf, sz, addr);
+	if (got == -1) {
+		return -1; /* errno is whatever pread set */
+	}
+	/* No bytes, or no terminator within what was read: report EFAULT. */
+	if (got > 0 && memchr(buf, '\0', (size_t) got) != NULL) {
+		return 0;
+	}
+	errno = EFAULT;
+	return -1;
+}
+
+/* Cached fd on our own mount namespace; opened on first use, fatal on failure. */
+static int self_mnt_nsfd(void)
+{
+	static int cached = -1;
+	if (cached != -1) {
+		return cached;
+	}
+	cached = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
+	if (cached == -1) {
+		err(1, "open /proc/self/ns/mnt");
+	}
+	return cached;
+}
+
+static int check_seccomp_cookie(int seccomp_fd, __u64 *id)
+{
+	return ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, id); /* callers treat -1 as "notification gone" */
+}
+
+/* Emulate mknod/mknodat for an allow-list of harmless character devices.
+   Returns SYSCALL_CONTINUE, SYSCALL_HANDLED, or a negative errno value. */
+static int sec__mknodat_impl(int seccomp_fd, int procfd,
+		struct seccomp_notif *req,
+		struct seccomp_notif_resp *resp,
+		int dirfd,
+		uintptr_t pathnameaddr,
+		mode_t mode,
+		dev_t dev)
+{
+	if ((mode & S_IFMT) != S_IFCHR && (mode & S_IFMT) != S_IFBLK) {
+		/* Fallthrough for non-privileged operations -- the caller already
+		   has the rights to do this themselves. */
+		return SYSCALL_CONTINUE;
+	}
+
+	/* Is this one of the safe devices? */
+	struct devtype {
+		mode_t type;
+		dev_t  dev;
+	};
+
+	const struct devtype safe_devices[] = {
+		{ .type = S_IFCHR, .dev = makedev(0, 0) }, // whiteout device
+		{ .type = S_IFCHR, .dev = makedev(1, 3) }, // null device
+		{ .type = S_IFCHR, .dev = makedev(1, 5) }, // zero device
+		{ .type = S_IFCHR, .dev = makedev(1, 7) }, // full device
+		{ .type = S_IFCHR, .dev = makedev(1, 8) }, // random device
+		{ .type = S_IFCHR, .dev = makedev(1, 9) }, // urandom device
+		{ .type = S_IFCHR, .dev = makedev(5, 0) }, // tty device
+	};
+
+	for (size_t i = 0; i < lengthof(safe_devices); i++) {
+		if ((mode & S_IFMT) == safe_devices[i].type && dev == safe_devices[i].dev) {
+			goto safe;
+		}
+	}
+	return SYSCALL_CONTINUE;
+
+safe: {}
+	/* The device is safe to create -- perform shenanigans */
+	struct proc_status status;
+	if (proc_read_status(procfd, &status) == -1) {
+		warn("proc_read_status /proc/%d/status", req->pid);
+		return -EINVAL;
+	}
+
+	int selfmnt = self_mnt_nsfd();
+	int realdirfd = -1;
+	int mntns = -1;
+	int entered = 0; /* nonzero once we have joined the caller's mount namespace */
+	mode_t old_umask = -1;
+	int rc = 0;
+
+	int memfd = openat(procfd, "mem", O_RDONLY | O_CLOEXEC);
+	if (memfd == -1) {
+		warn("open /proc/%d/mem", req->pid);
+		return -EINVAL;
+	}
+
+	char pathname[PATH_MAX];
+	if (pread_string(memfd, pathname, pathnameaddr, PATH_MAX) == -1) {
+		warn("pread pid %d %lx:%u", req->pid, pathnameaddr, PATH_MAX);
+		close(memfd);
+		return -EINVAL;
+	}
+
+	mntns = openat(procfd, "ns/mnt", O_RDONLY | O_CLOEXEC);
+	if (mntns == -1) {
+		warn("open /proc/%d/ns/mnt", req->pid);
+		rc = -EINVAL;
+		goto error;
+	}
+
+	if (dirfd == AT_FDCWD) {
+		realdirfd = openat(procfd, "cwd", O_PATH | O_CLOEXEC);
+	} else {
+		char fdpath[PATH_MAX+1];
+		if ((size_t) snprintf(fdpath, sizeof (fdpath), "fd/%d", dirfd) >= sizeof (fdpath)) {
+			warnx("fd/%d takes more than PATH_MAX bytes.", dirfd);
+			rc = -EINVAL;
+			goto error;
+		}
+		realdirfd = openat(procfd, fdpath, O_PATH | O_CLOEXEC);
+	}
+	if (realdirfd == -1) {
+		warn("open");
+		rc = -EOPNOTSUPP;
+		goto error;
+	}
+
+	/* Check again that the process is alive and blocked on the syscall. This
+	   handles cases where the syscall got interrupted by a signal handler
+	   and the program state changed before we read the pathname or other
+	   information from proc. */
+	if (check_seccomp_cookie(seccomp_fd, &req->id) == -1) {
+		rc = -errno;
+		goto error;
+	}
+
+	old_umask = umask(status.umask);
+	make_capable(BST_CAP_SYS_ADMIN | BST_CAP_SYS_CHROOT | BST_CAP_MKNOD);
+	if (setns(mntns, CLONE_NEWNS) == -1) {
+		warn("setns");
+		rc = -EOPNOTSUPP;
+		goto error;
+	}
+	entered = 1;
+	if (mknodat(realdirfd, pathname, mode, dev) == -1) {
+		rc = -errno;
+	}
+
+error:
+	/* Only switch back if we actually left our own namespace: the earlier
+	   error paths run before make_capable(), and a failure here is fatal. */
+	if (entered && setns(selfmnt, CLONE_NEWNS) == -1) {
+		err(1, "setns");
+	}
+	reset_capabilities();
+	if (old_umask != (mode_t) -1) {
+		umask(old_umask);
+	}
+	if (mntns != -1) {
+		close(mntns);
+	}
+	close(memfd);
+	if (realdirfd != -1) {
+		close(realdirfd);
+	}
+	return rc;
+}
+
+/* mknod(path, mode, dev): same as mknodat relative to the caller's cwd. */
+static int sec__mknod(int seccomp_fd, int procfd,
+		struct seccomp_notif *req,
+		struct seccomp_notif_resp *resp)
+{
+	const __u64 *argv = req->data.args;
+
+	return sec__mknodat_impl(seccomp_fd, procfd, req, resp, AT_FDCWD,
+			(uintptr_t) argv[0], (mode_t) argv[1], (dev_t) argv[2]);
+}
+
+/* mknodat(dirfd, path, mode, dev): unpack the raw syscall arguments. */
+static int sec__mknodat(int seccomp_fd, int procfd,
+		struct seccomp_notif *req,
+		struct seccomp_notif_resp *resp)
+{
+	const __u64 *argv = req->data.args;
+	int dirfd = (int) argv[0];
+
+	return sec__mknodat_impl(seccomp_fd, procfd, req, resp, dirfd,
+			(uintptr_t) argv[1], (mode_t) argv[2], (dev_t) argv[3]);
+}
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	return syscall(__NR_seccomp, op, flags, args); /* raw syscall; result passed through unchanged */
+}
+
+/* Install the seccomp filter for the caller and return the listener fd on
+   which mknod/mknodat notifications arrive. Exits on failure. */
+int sec_seccomp_install_filter(void)
+{
+	struct sock_filter insns[] = {
+		CHECK_ARCH_AND_LOAD_SYSCALL_NR,
+
+		/* Either mknod flavour is forwarded to the user-space supervisor:
+		   a match on the first test skips straight to the notify return. */
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mknod, 1, 0),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mknodat, 0, 1),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
+
+		/* Everything else runs normally. */
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog fprog = {
+		.len    = lengthof(insns),
+		.filter = insns,
+	};
+
+	int listener = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &fprog);
+	if (listener == -1) {
+		err(1, "seccomp SECCOMP_SET_MODE_FILTER");
+	}
+	return listener;
+}
+
+/* Run the handler registered for the notified syscall and send the reply.
+   Syscalls without a handler are resumed in the kernel untouched rather
+   than answered with a zeroed response, which would fake a success. */
+static void sec_seccomp_dispatch_syscall(int seccomp_fd, int procfd,
+		struct seccomp_notif *req,
+		struct seccomp_notif_resp *resp)
+{
+	static syscall_handler_func *const syscall_table[] = {
+		[__NR_mknod]   = sec__mknod,
+		[__NR_mknodat] = sec__mknodat,
+	};
+
+	resp->id = req->id;
+
+	syscall_handler_func *fn = NULL;
+	if (req->data.nr >= 0 && (size_t) req->data.nr < lengthof(syscall_table)) {
+		fn = syscall_table[(size_t) req->data.nr];
+	}
+
+	/* No handler: behave as if the filter had allowed the call. */
+	int rc = fn ? fn(seccomp_fd, procfd, req, resp) : SYSCALL_CONTINUE;
+	if (rc < 0) {
+		resp->error = rc;
+	} else if (rc == SYSCALL_CONTINUE) {
+		resp->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	}
+
+	if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_SEND, resp) == -1) {
+		if (errno == ENOENT) {
+			// This is survivable -- this usually means the syscall got
+			// interrupted by a signal.
+			return;
+		}
+		warn("ioctl SECCOMP_IOCTL_NOTIF_SEND");
+	}
+}
+
+noreturn void sec_seccomp_supervisor(int seccomp_fd)
+{
+	/* Run the seccomp supervisor: a privileged helper that runs safe
+	   syscalls on behalf of the unprivileged child in a user namespace.
+	   Use-cases include:
+	   * Allowing mknod on devices deemed "safe", like /dev/null, or the
+	     overlayfs whiteout file.
+	   * Allow devtmpfs mount with our custom bst_devtmpfs logic.
+	   For now, this is intended to be a blocking loop -- if we need other
+	   long-running agents down the line we might need to consider using
+	   an epoll loop or forking these into other processes. */
+
+	struct seccomp_notif_sizes sizes;
+
+	if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1)
+		err(1, "seccomp SECCOMP_GET_NOTIF_SIZES");
+
+	struct seccomp_notif *req = malloc(sizes.seccomp_notif);
+	if (req == NULL)
+		err(1, "malloc");
+
+	/* The kernel may expect a smaller response than the struct
+	   seccomp_notif_resp we were compiled against. Allocate the larger of
+	   the two so that filling in any field of our struct stays inside the
+	   buffer. */
+
+	size_t resp_size = sizes.seccomp_notif_resp;
+	if (sizeof (struct seccomp_notif_resp) > resp_size)
+		resp_size = sizeof (struct seccomp_notif_resp);
+
+	struct seccomp_notif_resp *resp = malloc(resp_size);
+	if (resp == NULL)
+		err(1, "malloc");
+
+	for (;;) {
+		memset(req,  0, sizes.seccomp_notif);
+		memset(resp, 0, resp_size);
+
+		if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_RECV, req) == -1) {
+			/* ENOENT: the target went away before we could read it. */
+			if (errno == EINTR || errno == ENOENT) {
+				continue;
+			}
+			err(1, "ioctl SECCOMP_IOCTL_NOTIF_RECV");
+		}
+
+		char procpath[PATH_MAX+1];
+		if ((size_t) snprintf(procpath, sizeof (procpath), "/proc/%d", req->pid) >= sizeof (procpath)) {
+			errx(1, "/proc/%d takes more than PATH_MAX bytes.", req->pid);
+		}
+
+		int procfd = open(procpath, O_RDONLY | O_CLOEXEC);
+		if (procfd == -1) {
+			if (errno != ENOENT) {
+				err(1, "open");
+			}
+			/* The target is gone already: an ordinary race, not a reason to
+			   exit. Resume the syscall in case it is somehow still waiting. */
+			resp->id = req->id;
+			resp->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+			(void) ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_SEND, resp);
+			continue;
+		}
+
+		/* Check that the target process is still alive and blocked on the
+		   syscall that sent the notification. */
+		if (check_seccomp_cookie(seccomp_fd, &req->id) == -1) {
+			goto end;
+		}
+
+		sec_seccomp_dispatch_syscall(seccomp_fd, procfd, req, resp);
+	end:
+		close(procfd);
+	}
+}
+
diff --git a/sec.h b/sec.h
new file mode 100644
index 0000000..1da2ce3
--- /dev/null
+++ b/sec.h
@@ -0,0 +1,15 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#ifndef SEC_H_
+# define SEC_H_
+
+# include <stdnoreturn.h>
+
+int sec_seccomp_install_filter(void); /* installs the filter; returns its notification fd */
+noreturn void sec_seccomp_supervisor(int); /* answers notifications read from the given fd; never returns */
+
+#endif /* !SEC_H_ */