// SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport * for qemu/UNIX domain socket mode * * PASTA - Pack A Subtle Tap Abstraction * for network namespace/tap device mode * * isolation.c - Self isolation helpers * * Copyright Red Hat * Author: Stefano Brivio * Author: David Gibson */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include "seccomp.h" #include "passt.h" #include "isolation.h" /** * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE */ void drop_caps(void) { int i; for (i = 0; i < 64; i++) { if (i == CAP_NET_BIND_SERVICE) continue; prctl(PR_CAPBSET_DROP, i, 0, 0, 0); } } /** * isolate_user() - Switch to final UID/GID and move into userns * @uid: User ID to run as (in original userns) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { char nsmap[BUFSIZ]; /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ if (errno != EPERM) { err("Can't drop supplementary groups: %s", strerror(errno)); exit(EXIT_FAILURE); } } if (setgid(gid) != 0) { err("Can't set GID to %u: %s", gid, strerror(errno)); exit(EXIT_FAILURE); } if (setuid(uid) != 0) { err("Can't set UID to %u: %s", uid, strerror(errno)); exit(EXIT_FAILURE); } /* If we're told not to use a userns, nothing more to do */ if (!use_userns) return; /* Otherwise, if given a userns, join it */ if (*userns) { int ufd; ufd = open(userns, O_RDONLY | O_CLOEXEC); if (ufd < 0) { err("Couldn't open user namespace %s: %s", userns, strerror(errno)); exit(EXIT_FAILURE); } if (setns(ufd, CLONE_NEWUSER) != 0) { err("Couldn't enter user namespace %s: %s", userns, strerror(errno)); exit(EXIT_FAILURE); } close(ufd); return; } /* Otherwise, create our own userns */ if (unshare(CLONE_NEWUSER) != 0) { err("Couldn't create user namespace: %s", strerror(errno)); exit(EXIT_FAILURE); } /* Configure user and group mappings */ snprintf(nsmap, BUFSIZ, "0 %u 1", uid); FWRITE("/proc/self/uid_map", nsmap, "Cannot set uid_map in namespace"); FWRITE("/proc/self/setgroups", "deny", "Cannot write to setgroups in namespace"); snprintf(nsmap, BUFSIZ, "0 %u 1", gid); FWRITE("/proc/self/gid_map", nsmap, "Cannot set gid_map in namespace"); } /** * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root * * Return: negative error code on failure, zero on success */ int sandbox(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody * ever gets around seccomp profiles -- there's no harm in passing it. */ if (!c->foreground || c->mode == MODE_PASST) flags |= CLONE_NEWPID; if (unshare(flags)) { perror("unshare"); return -errno; } if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) { perror("mount /"); return -errno; } if (mount("", TMPDIR, "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, "nr_inodes=2,nr_blocks=0")) { perror("mount tmpfs"); return -errno; } if (chdir(TMPDIR)) { perror("chdir"); return -errno; } if (syscall(SYS_pivot_root, ".", ".")) { perror("pivot_root"); return -errno; } if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) { perror("umount2"); return -errno; } drop_caps(); /* Relative to the new user namespace this time. */ return 0; } /** * seccomp() - Set up seccomp filters depending on mode, won't return on failure * @c: Execution context */ void seccomp(const struct ctx *c) { struct sock_fprog prog; if (c->mode == MODE_PASST) { prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; } else { prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); prog.filter = filter_pasta; } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { perror("prctl"); exit(EXIT_FAILURE); } }