// SPDX-License-Identifier: GPL-2.0-or-later
/* nstool - maintain a namespace to be entered by other processes
*
* Copyright Red Hat
* Author: David Gibson <david@gibson.dropbear.id.au>
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <stdarg.h>
#include <limits.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <linux/un.h>
#include <sched.h>
#include <linux/capability.h>
/* Number of elements in the array a, cast to int.  Only meaningful when a is
 * a real array, not a pointer. */
#define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0])))
/* Print a printf()-style message to stderr and exit with status 1.  No
 * newline is appended: callers put one in the format string. */
#define die(...) \
do { \
fprintf(stderr, __VA_ARGS__); \
exit(1); \
} while (0)
/* Describes one kind of namespace handled by this tool */
struct ns_type {
/* CLONE_NEW* constant for this namespace type, as passed to setns() */
int flag;
/* Name of the matching entry under /proc/<pid>/ns/ */
const char *name;
};
/* Table of namespace types the tool knows about.  The for_each_nst() loops
 * visit entries in this order, so this is also the order in which cmd_exec()
 * attempts setns(). */
const struct ns_type nstypes[] = {
{ CLONE_NEWCGROUP, "cgroup" },
{ CLONE_NEWIPC, "ipc" },
{ CLONE_NEWNET, "net" },
{ CLONE_NEWNS, "mnt" },
{ CLONE_NEWPID, "pid" },
{ CLONE_NEWTIME, "time" },
{ CLONE_NEWUSER, "user" },
{ CLONE_NEWUTS, "uts" },
};
/* Iterate _nst over the entries of nstypes[] whose flag bit is set in _flags.
 * _flags is re-evaluated on every iteration, so the loop body may clear bits
 * in it to affect later iterations.  The expansion ends in a bare "if", so an
 * "else" placed directly after the loop body would attach to that "if". */
#define for_each_nst(_nst, _flags) \
for ((_nst) = &nstypes[0]; \
((_nst) - nstypes) < ARRAY_SIZE(nstypes); \
(_nst)++) \
if ((_flags) & (_nst)->flag)
/* Iterate _nst over every entry of nstypes[], by using a mask with all
 * positive int bits set */
#define for_every_nst(_nst) for_each_nst(_nst, INT_MAX)
/* Value placed in holder_info.magic by the holder; connect_ctl() rejects a
 * control socket whose info structure does not carry it */
#define NSTOOL_MAGIC 0x7570017575601d75ULL
/* Information block the holder writes, as raw bytes, to every client that
 * connects to its control socket.  All values are as seen by the holder
 * process itself. */
struct holder_info {
/* Always NSTOOL_MAGIC */
uint64_t magic;
/* Result of getpid() in the holder */
pid_t pid;
/* Result of getuid() in the holder */
uid_t uid;
/* Result of getgid() in the holder */
gid_t gid;
/* Result of getcwd() in the holder, NUL-terminated */
char cwd[PATH_MAX];
};
/* usage() - Print the command line synopsis to stderr and exit with status 1
 *
 * Never returns.
 */
static void usage(void)
{
	/* "info" takes exactly one positional argument, the socket path */
	die("Usage:\n"
	    " nstool hold SOCK\n"
	    " Run within a set of namespaces, open a Unix domain socket\n"
	    " (the \"control socket\") at SOCK and wait for requests from\n"
	    " other nstool subcommands.\n"
	    " nstool info [-pw] SOCK\n"
	    " Print information about the nstool hold process with control\n"
	    " socket at SOCK\n"
	    " -p Print just the holder's PID as seen by the caller\n"
	    " -w Retry connecting to SOCK until it is ready\n"
	    " nstool exec [--keep-caps] SOCK [COMMAND [ARGS...]]\n"
	    " Execute command or shell in the namespaces of the nstool hold\n"
	    " with control socket at SOCK\n"
	    " --keep-caps Give all possible capabilities to COMMAND via\n"
	    " the ambient capability mask\n"
	    " nstool stop SOCK\n"
	    " Instruct the nstool hold with control socket at SOCK to\n"
	    " terminate.\n");
}
/* sockaddr_from_path() - Fill in a Unix socket address from a filesystem path
 * @addr:	Address structure to initialise
 * @sockpath:	Path of the socket, at most UNIX_PATH_MAX bytes long
 *
 * Terminates via die() if @sockpath is too long.  A path of exactly
 * UNIX_PATH_MAX bytes is accepted and stored without a NUL terminator;
 * shorter paths are NUL-padded to the end of sun_path by strncpy().
 */
static void sockaddr_from_path(struct sockaddr_un *addr, const char *sockpath)
{
	size_t pathlen = strlen(sockpath);

	if (pathlen > UNIX_PATH_MAX)
		die("\"%s\" is too long for Unix socket path (%zu > %d)\n",
		    sockpath, pathlen, UNIX_PATH_MAX);

	addr->sun_family = AF_UNIX;
	strncpy(addr->sun_path, sockpath, UNIX_PATH_MAX);
}
/* connect_ctl() - Connect to a holder's control socket and read its info
 * @sockpath:	Path of the control socket
 * @wait:	If true, retry connect() for as long as it fails with ENOENT
 *		or ECONNREFUSED; any other failure, or any failure when false,
 *		is fatal
 * @info:	Receives the holder's info structure; may be NULL
 * @peercred:	Receives the peer's SO_PEERCRED credentials; may be NULL
 *
 * Return: the connected (close-on-exec) socket.  Every error terminates the
 * program via die().
 */
static int connect_ctl(const char *sockpath, bool wait,
		       struct holder_info *info,
		       struct ucred *peercred)
{
	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
	struct sockaddr_un addr;
	struct holder_info discard;
	ssize_t len;
	int rc;

	if (fd < 0)
		die("socket(): %s\n", strerror(errno));

	sockaddr_from_path(&addr, sockpath);

	/* With @wait set this retries immediately, with no delay in between */
	do {
		rc = connect(fd, (struct sockaddr *)&addr, sizeof(addr));
		if (rc < 0 &&
		    (!wait || (errno != ENOENT && errno != ECONNREFUSED)))
			die("connect() to %s: %s\n", sockpath, strerror(errno));
	} while (rc < 0);

	if (!info)
		info = &discard;

	/* Always read the info structure, even if we don't need it,
	 * so that the holder doesn't get a broken pipe error
	 */
	len = read(fd, info, sizeof(*info));
	if (len < 0)
		die("read() on control socket %s: %s\n", sockpath, strerror(errno));
	if ((size_t)len < sizeof(*info))
		die("short read() on control socket %s\n", sockpath);

	if (info->magic != NSTOOL_MAGIC)
		die("Control socket %s doesn't appear to belong to nstool\n",
		    sockpath);

	if (peercred) {
		socklen_t optlen = sizeof(*peercred);

		rc = getsockopt(fd, SOL_SOCKET, SO_PEERCRED,
				peercred, &optlen);
		if (rc < 0)
			die("getsockopt(SO_PEERCRED) %s: %s\n",
			    sockpath, strerror(errno));
	}

	return fd;
}
/* cmd_hold() - Implement "nstool hold SOCK"
 * @argc:	Number of entries in @argv, subcommand name included
 * @argv:	Subcommand name followed by the control socket path
 *
 * Binds and listens on the control socket, then serves clients one at a
 * time: each gets a copy of this process's holder_info, after which one byte
 * is read from it.  A client that closes without sending anything is simply
 * followed by the next one; a client that sends a byte makes the loop end,
 * the socket path is unlinked and the function returns.  Errors terminate
 * via die().
 */
static void cmd_hold(int argc, char *argv[])
{
	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, PF_UNIX);
	struct sockaddr_un addr;
	const char *sockpath = argv[1];
	struct holder_info info;
	int rc;

	if (argc != 2)
		usage();

	if (fd < 0)
		die("socket(): %s\n", strerror(errno));

	sockaddr_from_path(&addr, sockpath);

	rc = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (rc < 0)
		die("bind() to %s: %s\n", sockpath, strerror(errno));

	rc = listen(fd, 0);
	if (rc < 0)
		die("listen() on %s: %s\n", sockpath, strerror(errno));

	/* The whole structure is written to clients, so clear it first:
	 * otherwise padding and the unused tail of cwd[] would hand out
	 * uninitialised stack contents. */
	memset(&info, 0, sizeof(info));
	info.magic = NSTOOL_MAGIC;
	info.pid = getpid();
	info.uid = getuid();
	info.gid = getgid();
	if (!getcwd(info.cwd, sizeof(info.cwd)))
		die("getcwd(): %s\n", strerror(errno));

	do {
		int afd = accept(fd, NULL, NULL);
		char buf;

		if (afd < 0)
			die("accept(): %s\n", strerror(errno));

		rc = write(afd, &info, sizeof(info));
		if (rc < 0)
			die("write(): %s\n", strerror(errno));
		if ((size_t)rc < sizeof(info))
			die("short write() on control socket\n");

		/* 0 means the client just hung up; 1 means a stop request */
		rc = read(afd, &buf, sizeof(buf));
		if (rc < 0)
			die("read(): %s\n", strerror(errno));

		close(afd);
	} while (rc == 0);

	unlink(sockpath);
}
/* getlink() - Read the target of a symbolic link named by a format string
 * @buf:	Buffer receiving the link target (not NUL-terminated)
 * @bufsiz:	Size of @buf in bytes
 * @fmt:	printf()-style format producing the path of the link
 *
 * Return: length of the link target stored in @buf.  A path that does not
 * fit in PATH_MAX, a readlink() failure, or a target that fills the whole
 * buffer all terminate via die().
 */
static ssize_t getlink(char *buf, size_t bufsiz, const char *fmt, ...)
{
	char path[PATH_MAX];
	va_list args;
	ssize_t n;
	int written;

	va_start(args, fmt);
	written = vsnprintf(path, sizeof(path), fmt, args);
	va_end(args);

	if (written >= PATH_MAX)
		die("Truncated path \"%s\"\n", path);

	n = readlink(path, buf, bufsiz);
	if (n < 0)
		die("readlink() on %s: %s\n", path, strerror(errno));

	/* A completely filled buffer may mean the target was cut short */
	if ((size_t)n >= bufsiz)
		die("Target of symbolic link %s is too long\n", path);

	return n;
}
/* detect_namespaces() - Find which namespaces of a process differ from ours
 * @pid:	PID of the process to compare against, as seen by the caller
 *
 * For each known namespace type, compares the target of /proc/self/ns/<name>
 * with that of /proc/<pid>/ns/<name>.
 *
 * Return: bitwise OR of the nstypes[] flags whose link targets differ
 */
static int detect_namespaces(pid_t pid)
{
	const struct ns_type *nst;
	int differing = 0;

	for_every_nst(nst) {
		char ours[PATH_MAX], theirs[PATH_MAX];
		ssize_t ourlen, theirlen;
		bool same;

		ourlen = getlink(ours, sizeof(ours),
				 "/proc/self/ns/%s", nst->name);
		theirlen = getlink(theirs, sizeof(theirs),
				   "/proc/%d/ns/%s", pid, nst->name);

		/* Links are not NUL-terminated: compare length, then bytes */
		same = (ourlen == theirlen) &&
		       memcmp(ours, theirs, ourlen) == 0;
		if (!same)
			differing |= nst->flag;
	}

	return differing;
}
/* print_nstypes() - Print a comma-separated list of namespace type names
 * @flags:	Bitmask of namespace flags to describe
 *
 * Names from nstypes[] are printed for the bits that are known; any bits left
 * over are printed last as a single hexadecimal value.  Nothing is printed
 * for an empty mask and no newline is emitted.
 */
static void print_nstypes(int flags)
{
	const struct ns_type *nst;
	const char *sep = "";

	for_each_nst(nst, flags) {
		printf("%s%s", sep, nst->name);
		sep = ", ";
		/* Drop the bit so only unrecognised ones remain at the end */
		flags &= ~nst->flag;
	}

	if (flags != 0)
		printf("%s0x%x", sep, flags);
}
static void cmd_info(int argc, char *argv[])
{
const struct option options[] = {
{"pid", no_argument, NULL, 'p' },
{"wait", no_argument, NULL, 'w' },
{ 0 },
};
bool pidonly = false, waitforsock = false;
const char *optstring = "pw";
struct holder_info info;
struct ucred peercred;
const char *sockpath;
int fd, opt;
do {
opt = getopt_long(argc, argv, optstring, options, NULL);
switch (opt) {
case 'p':
pidonly = true;
break;
case 'w':
waitforsock = true;
break;
case -1:
break;
default:
usage();
}
} while (opt != -1);
if (optind != argc - 1) {
usage();
}
sockpath = argv[optind];
fd = connect_ctl(sockpath, waitforsock, &info, &peercred);
close(fd);
if (pidonly) {
printf("%d\n", peercred.pid);
} else {
int flags = detect_namespaces(peercred.pid);
printf("Namespaces: ");
print_nstypes(flags);
printf("\n");
printf("As seen from calling context:\n");
printf("\tPID:\t%d\n", peercred.pid);
printf("\tUID:\t%u\n", peercred.uid);
printf("\tGID:\t%u\n", peercred.gid);
printf("As seen from holding context:\n");
printf("\tPID:\t%d\n", info.pid);
printf("\tUID:\t%u\n", info.uid);
printf("\tGID:\t%u\n", info.gid);
printf("\tCWD:\t%s\n", info.cwd);
}
}
/* openns() - Open a namespace file named by a format string
 * @fmt:	printf()-style format producing the path to open
 *
 * Return: read-only, close-on-exec file descriptor.  A path that does not fit
 * in PATH_MAX or an open() failure terminates via die().
 */
static int openns(const char *fmt, ...)
{
	char path[PATH_MAX];
	va_list args;
	int written, nsfd;

	va_start(args, fmt);
	written = vsnprintf(path, sizeof(path), fmt, args);
	va_end(args);

	if (written >= PATH_MAX)
		die("Truncated path \"%s\"\n", path);

	nsfd = open(path, O_RDONLY | O_CLOEXEC);
	if (nsfd < 0)
		die("open() %s: %s\n", path, strerror(errno));

	return nsfd;
}
/* wait_for_child() - Wait for a child and mirror how it ended
 * @pid:	PID of the child to wait for
 *
 * Never returns.  If the child exits, exits with the same status.  If the
 * child is killed by a signal, sends that signal to this process.  If the
 * child stops, stops this process too and, once resumed, continues the child
 * and keeps waiting.
 */
static void wait_for_child(pid_t pid)
{
	int status;

	/* Match the child's exit status, if possible */
	for (;;) {
		pid_t rc;

		rc = waitpid(pid, &status, WUNTRACED);
		if (rc < 0)
			die("waitpid() on %d: %s\n", pid, strerror(errno));
		if (rc != pid)
			die("waitpid() on %d returned %d\n", pid, rc);

		if (WIFSTOPPED(status)) {
			/* Stop the parent to match */
			kill(getpid(), SIGSTOP);
			/* We must have resumed, resume the child */
			kill(pid, SIGCONT);
			continue;
		}

		break;
	}

	if (WIFEXITED(status))
		exit(WEXITSTATUS(status));
	else if (WIFSIGNALED(status))
		kill(getpid(), WTERMSIG(status));

	/* Only reached if the signal sent above did not terminate us, or
	 * the status was neither an exit nor a signal */
	die("Unexpected status for child %d\n", pid);
}
/* caps_to_ambient() - Raise every effective capability into the ambient set
 *
 * First copies the permitted set into the inheritable set, then, for every
 * capability number the kernel recognises, raises it in the ambient set if
 * it is currently effective.  Errors terminate via die().
 */
static void caps_to_ambient(void)
{
	/* Use raw system calls to avoid the overly complex caps
	 * libraries. */
	struct __user_cap_header_struct header = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = 0,
	};
	struct __user_cap_data_struct payload[_LINUX_CAPABILITY_U32S_3] =
		{{ 0 }};
	uint64_t effective;
	unsigned long cap;

	if (syscall(SYS_capget, &header, payload) < 0)
		die("capget(): %s\n", strerror(errno));

	/* First make caps inheritable */
	payload[0].inheritable = payload[0].permitted;
	payload[1].inheritable = payload[1].permitted;

	if (syscall(SYS_capset, &header, payload) < 0)
		die("capset(): %s\n", strerror(errno));

	/* payload[0] holds capabilities 0-31, payload[1] holds 32-63 */
	effective = ((uint64_t)payload[1].effective << 32) |
		    (uint64_t)payload[0].effective;

	for (cap = 0; cap < (sizeof(effective) * 8); cap++) {
		/* Skip non-existent caps */
		if (prctl(PR_CAPBSET_READ, cap, 0UL, 0UL, 0UL) < 0)
			continue;

		/* The mask must be built in 64 bits: shifting a plain int by
		 * 32 or more is undefined and would test the wrong bit for
		 * the upper capabilities. */
		if ((effective & (UINT64_C(1) << cap))
		    && prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
			     cap, 0UL, 0UL) < 0)
			die("prctl(PR_CAP_AMBIENT): %s\n", strerror(errno));
	}
}
/* cmd_exec() - Implement "nstool exec [--keep-caps] SOCK [COMMAND [ARGS...]]"
 * @argc:	Number of entries in @argv, subcommand name included
 * @argv:	Subcommand name, options, socket path, optional command
 *
 * Joins every namespace in which the holder differs from the caller, changes
 * to the holder's working directory if a mount namespace was joined, then
 * forks.  The child executes COMMAND, or $SHELL (falling back to /bin/sh)
 * when none is given; the parent waits for it and mirrors its exit status.
 * With --keep-caps the child raises its capabilities into the ambient set
 * before executing.  Never returns.
 *
 * Options are parsed with an empty short-option string and no leading '+',
 * so arguments of COMMAND that start with '-' are also seen by getopt_long()
 * unless they are preceded by "--".
 */
static void cmd_exec(int argc, char *argv[])
{
	enum {
		OPT_EXEC_KEEPCAPS = CHAR_MAX + 1,
	};
	const struct option options[] = {
		{"keep-caps", no_argument, NULL, OPT_EXEC_KEEPCAPS },
		{ 0 },
	};
	const char *shargs[] = { NULL, NULL };
	int nfd[ARRAY_SIZE(nstypes)];
	const char *optstring = "";
	const struct ns_type *nst;
	int ctlfd, flags, target, opt, rc;
	const char *const *xargs;
	struct holder_info info;
	bool keepcaps = false;
	struct ucred peercred;
	const char *sockpath;
	const char *exe;
	pid_t xpid;

	do {
		opt = getopt_long(argc, argv, optstring, options, NULL);

		switch (opt) {
		case OPT_EXEC_KEEPCAPS:
			keepcaps = true;
			break;
		case -1:
			break;
		default:
			usage();
		}
	} while (opt != -1);

	if (argc < optind + 1)
		usage();

	sockpath = argv[optind];

	ctlfd = connect_ctl(sockpath, false, &info, &peercred);

	flags = detect_namespaces(peercred.pid);

	/* Remember the full set: 'flags' is whittled down below, but once
	 * both passes are done every namespace in 'target' has been joined */
	target = flags;

	/* Open everything up front, while /proc still shows the holder
	 * under the PID we know it by */
	for_each_nst(nst, flags) {
		int *fd = &nfd[nst - nstypes];

		*fd = openns("/proc/%d/ns/%s", peercred.pid, nst->name);
	}

	/* First pass, will get things where we need the privileges of
	 * the initial userns */
	for_each_nst(nst, flags) {
		int fd = nfd[nst - nstypes];

		rc = setns(fd, nst->flag);
		if (rc == 0) {
			flags &= ~nst->flag;
		}
	}

	/* Second pass, will get things where we need the privileges
	 * of the target userns */
	for_each_nst(nst, flags) {
		int fd = nfd[nst - nstypes];

		rc = setns(fd, nst->flag);
		if (rc < 0)
			die("setns() type %s: %s\n",
			    nst->name, strerror(errno));
	}

	/* If we've entered a mount ns, our cwd has changed to /.
	 * Switch to the cwd of the holder, which is probably less
	 * surprising.  Check 'target' rather than 'flags': the mount ns
	 * bit is gone from 'flags' if it was joined in the first pass. */
	if (target & CLONE_NEWNS) {
		rc = chdir(info.cwd);
		if (rc < 0)
			die("chdir(\"%s\"): %s\n", info.cwd, strerror(errno));
	}

	/* Fork to properly enter PID namespace */
	xpid = fork();
	if (xpid < 0)
		die("fork(): %s\n", strerror(errno));

	if (xpid > 0) {
		/* Close the control socket so the waiting parent
		 * doesn't block the holder */
		close(ctlfd);
		/* Does not return */
		wait_for_child(xpid);
	}

	/* CHILD */
	if (argc > optind + 1) {
		exe = argv[optind + 1];
		xargs = (const char * const*)(argv + optind + 1);
	} else {
		exe = getenv("SHELL");
		if (!exe)
			exe = "/bin/sh";

		shargs[0] = exe;
		xargs = shargs;
	}

	if (keepcaps)
		caps_to_ambient();

	rc = execvp(exe, (char *const *)xargs);
	if (rc < 0)
		die("execvp() %s: %s\n", exe, strerror(errno));
	die("Returned from exec()\n");
}
/* cmd_stop() - Implement "nstool stop SOCK"
 * @argc:	Number of entries in @argv, subcommand name included
 * @argv:	Subcommand name followed by the control socket path
 *
 * Connects to the control socket and sends a single byte, which is what makes
 * the holder leave its accept loop.
 */
static void cmd_stop(int argc, char *argv[])
{
	const char *path = argv[1];
	char request = 'Q';
	int ctl;

	if (argc != 2)
		usage();

	ctl = connect_ctl(path, false, NULL, NULL);

	if (write(ctl, &request, sizeof(request)) < 0)
		die("write() to %s: %s\n", path, strerror(errno));

	close(ctl);
}
/* main() - Dispatch to the subcommand named by the first argument
 * @argc:	Argument count
 * @argv:	Argument vector; argv[1] selects hold, info, exec or stop
 *
 * Each subcommand receives the argument vector starting at its own name.
 * Prints usage and exits with status 1 if the subcommand is missing or
 * unknown; otherwise exits with status 0 once the subcommand returns.
 */
int main(int argc, char *argv[])
{
	const char *subcmd;

	/* Check before touching argv[1] */
	if (argc < 2)
		usage();

	subcmd = argv[1];

	/* No socket is opened here: each subcommand creates its own, and a
	 * descriptor without close-on-exec would leak into the command run
	 * by "exec". */
	if (strcmp(subcmd, "hold") == 0)
		cmd_hold(argc - 1, argv + 1);
	else if (strcmp(subcmd, "info") == 0)
		cmd_info(argc - 1, argv + 1);
	else if (strcmp(subcmd, "exec") == 0)
		cmd_exec(argc - 1, argv + 1);
	else if (strcmp(subcmd, "stop") == 0)
		cmd_stop(argc - 1, argv + 1);
	else
		usage();

	exit(0);
}