diff options
-rw-r--r-- | conf.c | 85 | ||||
-rw-r--r-- | conf.h | 1 | ||||
-rw-r--r-- | passt.1 | 15 | ||||
-rw-r--r-- | tcp.c | 292 | ||||
-rw-r--r-- | tcp.h | 6 | ||||
-rw-r--r-- | udp.h | 4 | ||||
-rw-r--r-- | util.c | 8 | ||||
-rw-r--r-- | util.h | 2 |
8 files changed, 316 insertions, 97 deletions
@@ -40,6 +40,42 @@ #include "tcp.h" /** + * get_bound_ports() - Get maps of ports with bound sockets + * @c: Execution context + * @ns: If set, set bitmaps for ports to tap/ns -- to init otherwise + * @proto: Protocol number (IPPROTO_TCP or IPPROTO_UDP) + */ +void get_bound_ports(struct ctx *c, int ns, uint8_t proto) +{ + uint8_t *udp_map, *udp_exclude, *tcp_map, *tcp_exclude; + + if (ns) { + udp_map = c->udp.port_to_tap; + udp_exclude = c->udp.port_to_init; + tcp_map = c->tcp.port_to_tap; + tcp_exclude = c->tcp.port_to_init; + } else { + udp_map = c->udp.port_to_init; + udp_exclude = c->udp.port_to_tap; + tcp_map = c->tcp.port_to_init; + tcp_exclude = c->tcp.port_to_tap; + } + + if (proto == IPPROTO_UDP) { + memset(udp_map, 0, USHRT_MAX / 8); + procfs_scan_listen("udp", udp_map, udp_exclude); + procfs_scan_listen("udp6", udp_map, udp_exclude); + + procfs_scan_listen("tcp", udp_map, udp_exclude); + procfs_scan_listen("tcp6", udp_map, udp_exclude); + } else if (proto == IPPROTO_TCP) { + memset(tcp_map, 0, USHRT_MAX / 8); + procfs_scan_listen("tcp", tcp_map, tcp_exclude); + procfs_scan_listen("tcp6", tcp_map, tcp_exclude); + } +} + +/** * struct get_bound_ports_ns_arg - Arguments for get_bound_ports_ns() * @c: Execution context * @proto: Protocol number (IPPROTO_TCP or IPPROTO_UDP) @@ -50,7 +86,7 @@ struct get_bound_ports_ns_arg { }; /** - * get_bound_ports_ns() - Get maps of ports namespace with bound sockets + * get_bound_ports_ns() - Get maps of ports in namespace with bound sockets * @arg: See struct get_bound_ports_ns_arg * * Return: 0 @@ -63,39 +99,11 @@ static int get_bound_ports_ns(void *arg) if (!c->pasta_pid || ns_enter(c->pasta_pid)) return 0; - if (a->proto == IPPROTO_UDP) { - procfs_scan_listen("udp", c->udp.port_to_tap); - procfs_scan_listen("udp6", c->udp.port_to_tap); - - procfs_scan_listen("tcp", c->udp.port_to_tap); - procfs_scan_listen("tcp6", c->udp.port_to_tap); - } else if (a->proto == IPPROTO_TCP) { - procfs_scan_listen("tcp", c->tcp.port_to_tap); - procfs_scan_listen("tcp6", c->tcp.port_to_tap); - } + get_bound_ports(c, 1, a->proto); return 0; } -/** - * get_bound_ports() - Get maps of ports in init namespace with bound sockets - * @c: Execution context - * @proto: Protocol number (IPPROTO_TCP or IPPROTO_UDP) - */ -static void get_bound_ports(struct ctx *c, uint8_t proto) -{ - if (proto == IPPROTO_UDP) { - procfs_scan_listen("udp", c->udp.port_to_init); - procfs_scan_listen("udp6", c->udp.port_to_init); - - procfs_scan_listen("tcp", c->udp.port_to_init); - procfs_scan_listen("tcp6", c->udp.port_to_init); - } else if (proto == IPPROTO_TCP) { - procfs_scan_listen("tcp", c->tcp.port_to_init); - procfs_scan_listen("tcp6", c->tcp.port_to_init); - } -} - enum conf_port_type { PORT_SPEC = 1, PORT_NONE, @@ -1172,19 +1180,28 @@ void conf(struct ctx *c, int argc, char **argv) } #endif + c->tcp.ns_detect_ports = c->udp.ns_detect_ports = 0; + c->tcp.init_detect_ports = c->udp.init_detect_ports = 0; + if (c->mode == MODE_PASTA) { if (!tcp_tap || tcp_tap == PORT_AUTO) { + c->tcp.ns_detect_ports = 1; ns_ports_arg.proto = IPPROTO_TCP; NS_CALL(get_bound_ports_ns, &ns_ports_arg); } if (!udp_tap || udp_tap == PORT_AUTO) { + c->udp.ns_detect_ports = 1; ns_ports_arg.proto = IPPROTO_UDP; NS_CALL(get_bound_ports_ns, &ns_ports_arg); } - if (!tcp_init || tcp_init == PORT_AUTO) - get_bound_ports(c, IPPROTO_TCP); - if (!udp_init || udp_init == PORT_AUTO) - get_bound_ports(c, IPPROTO_UDP); + if (!tcp_init || tcp_init == PORT_AUTO) { + c->tcp.init_detect_ports = 1; + get_bound_ports(c, 0, IPPROTO_TCP); + } + if (!udp_init || udp_init == PORT_AUTO) { + c->udp.init_detect_ports = 1; + get_bound_ports(c, 0, IPPROTO_UDP); + } } conf_print(c); @@ -1 +1,2 @@ void conf(struct ctx *c, int argc, char **argv); +void get_bound_ports(struct ctx *c, int ns, uint8_t proto); @@ -297,9 +297,9 @@ Don't forward any ports .TP .BR auto -Forward all ports currently bound in the namespace. The list of ports is derived -from listening sockets reported by \fI/proc/net/tcp\fR and \fI/proc/net/tcp6\fR, -see \fBproc\fR(5). +Dynamically forward ports bound in the namespace. The list of ports is +periodically derived (every second) from listening sockets reported by +\fI/proc/net/tcp\fR and \fI/proc/net/tcp6\fR, see \fBproc\fR(5). .TP .BR ports @@ -331,9 +331,10 @@ Default is \fBauto\fR. .TP .BR \-u ", " \-\-udp-ports " " \fIspec -Configure UDP port forwarding to guest. \fIspec\fR is as described for TCP +Configure UDP port forwarding to namespace. \fIspec\fR is as described for TCP above, and the list of ports is derived from listening sockets reported by -\fI/proc/net/udp\fR and \fI/proc/net/udp6\fR, see \fBproc\fR(5). +\fI/proc/net/udp\fR and \fI/proc/net/udp6\fR, see \fBproc\fR(5), +when \fBpasta\fR starts (not periodically). Note: unless overridden, UDP ports with numbers corresponding to forwarded TCP port numbers are forwarded too, without, however, any port translation. @@ -345,14 +346,14 @@ Default is \fBauto\fR. .TP .BR \-T ", " \-\-tcp-ns " " \fIspec Configure TCP port forwarding from target namespace to init namespace. -\fIspec\fR is as described above. +\fIspec\fR is as described above for TCP. Default is \fBauto\fR. .TP .BR \-U ", " \-\-udp-ns " " \fIspec Configure UDP port forwarding from target namespace to init namespace. -\fIspec\fR is as described above. +\fIspec\fR is as described above for UDP. Default is \fBauto\fR. @@ -334,6 +334,7 @@ #include "tap.h" #include "siphash.h" #include "pcap.h" +#include "conf.h" #define MAX_TAP_CONNS (128 * 1024) #define MAX_SPLICE_CONNS (128 * 1024) @@ -363,6 +364,8 @@ #define TCP_SPLICE_PIPE_POOL_SIZE 256 #define REFILL_INTERVAL 1000 +#define PORT_DETECT_INTERVAL 1000 + /* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP */ @@ -525,6 +528,11 @@ struct tcp_splice_conn { static in_port_t tcp_port_delta_to_tap [USHRT_MAX]; static in_port_t tcp_port_delta_to_init [USHRT_MAX]; +/* Listening sockets, used for automatic port forwarding in pasta mode only */ +static int tcp_sock_init_lo [USHRT_MAX][IP_VERSIONS]; +static int tcp_sock_init_ext [USHRT_MAX][IP_VERSIONS]; +static int tcp_sock_ns [USHRT_MAX][IP_VERSIONS]; + /** * tcp_remap_to_tap() - Set delta for port translation toward guest/tap * @port: Original destination port, host order @@ -3002,6 +3010,93 @@ smaller: } /** + * tcp_sock_init_one() - Initialise listening sockets for a given port + * @c: Execution context + * @ns: In pasta mode, if set, bind with loopback address in namespace + * @port: Port, host order + */ +static void tcp_sock_init_one(struct ctx *c, int ns, in_port_t port) +{ + union tcp_epoll_ref tref = { .listen = 1 }; + int s; + + if (ns) + tref.index = (in_port_t)(port + tcp_port_delta_to_init[port]); + else + tref.index = (in_port_t)(port + tcp_port_delta_to_tap[port]); + + if (c->v4) { + tref.v6 = 0; + + tref.splice = 0; + if (!ns) { + s = sock_l4(c, AF_INET, IPPROTO_TCP, port, + c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY, + tref.u32); + if (s > 0) + tcp_sock_set_bufsize(s); + else + s = -1; + + if (c->tcp.init_detect_ports) + tcp_sock_init_ext[port][V4] = s; + } + + if (c->mode == MODE_PASTA) { + tref.splice = 1; + s = sock_l4(c, AF_INET, IPPROTO_TCP, port, + BIND_LOOPBACK, tref.u32); + if (s > 0) + tcp_sock_set_bufsize(s); + else + s = -1; + + if (c->tcp.ns_detect_ports) { + if (ns) + tcp_sock_ns[port][V4] = s; + else + tcp_sock_init_lo[port][V4] = s; + } + } + } + + if (c->v6) { + tref.v6 = 1; + + tref.splice = 0; + if (!ns) { + s = sock_l4(c, AF_INET6, IPPROTO_TCP, port, + c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY, + tref.u32); + if (s > 0) + tcp_sock_set_bufsize(s); + else + s = -1; + + if (c->tcp.init_detect_ports) + tcp_sock_init_ext[port][V6] = s; + } + + if (c->mode == MODE_PASTA) { + tref.splice = 1; + s = sock_l4(c, AF_INET6, IPPROTO_TCP, port, + BIND_LOOPBACK, tref.u32); + if (s > 0) + tcp_sock_set_bufsize(s); + else + s = -1; + + if (c->tcp.ns_detect_ports) { + if (ns) + tcp_sock_ns[port][V6] = s; + else + tcp_sock_init_lo[port][V6] = s; + } + } + } +} + +/** * tcp_sock_init_ns() - Bind sockets in namespace for inbound connections * @arg: Execution context * @@ -3009,10 +3104,8 @@ smaller: */ static int tcp_sock_init_ns(void *arg) { - union tcp_epoll_ref tref = { .listen = 1, .splice = 1 }; struct ctx *c = (struct ctx *)arg; in_port_t port; - int s; ns_enter(c->pasta_pid); @@ -3020,21 +3113,7 @@ static int tcp_sock_init_ns(void *arg) if (!bitmap_isset(c->tcp.port_to_init, port)) continue; - tref.index = (in_port_t)(port + tcp_port_delta_to_init[port]); - - if (c->v4) { - tref.v6 = 0; - s = sock_l4(c, AF_INET, IPPROTO_TCP, port, - BIND_LOOPBACK, tref.u32); - tcp_sock_set_bufsize(s); - } - - if (c->v6) { - tref.v6 = 1; - s = sock_l4(c, AF_INET6, IPPROTO_TCP, port, - BIND_LOOPBACK, tref.u32); - tcp_sock_set_bufsize(s); - } + tcp_sock_init_one(c, 1, port); } return 0; @@ -3128,9 +3207,7 @@ static int tcp_sock_refill(void *arg) int tcp_sock_init(struct ctx *c, struct timespec *now) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; - union tcp_epoll_ref tref = { .listen = 1 }; in_port_t port; - int s; getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM); @@ -3138,40 +3215,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) if (!bitmap_isset(c->tcp.port_to_tap, port)) continue; - tref.index = (in_port_t)(port + tcp_port_delta_to_tap[port]); - if (c->v4) { - tref.v6 = 0; - - tref.splice = 0; - s = sock_l4(c, AF_INET, IPPROTO_TCP, port, - c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY, - tref.u32); - tcp_sock_set_bufsize(s); - - if (c->mode == MODE_PASTA) { - tref.splice = 1; - s = sock_l4(c, AF_INET, IPPROTO_TCP, port, - BIND_LOOPBACK, tref.u32); - tcp_sock_set_bufsize(s); - } - } - - if (c->v6) { - tref.v6 = 1; - - tref.splice = 0; - s = sock_l4(c, AF_INET6, IPPROTO_TCP, port, - c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY, - tref.u32); - tcp_sock_set_bufsize(s); - - if (c->mode == MODE_PASTA) { - tref.splice = 1; - s = sock_l4(c, AF_INET6, IPPROTO_TCP, port, - BIND_LOOPBACK, tref.u32); - tcp_sock_set_bufsize(s); - } - } + tcp_sock_init_one(c, 0, port); } if (c->v4) @@ -3190,6 +3234,8 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) refill_arg.ns = 1; NS_CALL(tcp_sock_refill, &refill_arg); tcp_splice_pipe_refill(c); + + c->tcp.port_detect_ts = *now; } return 0; @@ -3284,6 +3330,122 @@ static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn, } /** + * struct tcp_port_detect_arg - Arguments for tcp_port_detect() + * @c: Execution context + * @detect_in_ns: Detect ports bound in namespace, not in init + */ +struct tcp_port_detect_arg { + struct ctx *c; + int detect_in_ns; +}; + +/** + * tcp_port_detect() - Detect ports bound in namespace or init + * @arg: See struct tcp_port_detect_arg + * + * Return: 0 + */ +static int tcp_port_detect(void *arg) +{ + struct tcp_port_detect_arg *a = (struct tcp_port_detect_arg *)arg; + + if (a->detect_in_ns) { + ns_enter(a->c->pasta_pid); + + get_bound_ports(a->c, 1, IPPROTO_TCP); + } else { + get_bound_ports(a->c, 0, IPPROTO_TCP); + } + + return 0; +} + +/** + * struct tcp_port_rebind_arg - Arguments for tcp_port_rebind() + * @c: Execution context + * @bind_in_ns: Rebind ports in namespace, not in init + */ +struct tcp_port_rebind_arg { + struct ctx *c; + int bind_in_ns; +}; + +/** + * tcp_port_rebind() - Rebind ports in namespace or init + * @arg: See struct tcp_port_rebind_arg + * + * Return: 0 + */ +static int tcp_port_rebind(void *arg) +{ + struct tcp_port_rebind_arg *a = (struct tcp_port_rebind_arg *)arg; + in_port_t port; + + if (a->bind_in_ns) { + ns_enter(a->c->pasta_pid); + + for (port = 0; port < USHRT_MAX; port++) { + if (!bitmap_isset(a->c->tcp.port_to_init, port)) { + if (tcp_sock_ns[port][V4] > 0) { + close(tcp_sock_ns[port][V4]); + tcp_sock_ns[port][V4] = 0; + } + + if (tcp_sock_ns[port][V6] > 0) { + close(tcp_sock_ns[port][V6]); + tcp_sock_ns[port][V6] = 0; + } + + continue; + } + + /* Don't loop back our own ports */ + if (bitmap_isset(a->c->tcp.port_to_tap, port)) + continue; + + if ((a->c->v4 && !tcp_sock_ns[port][V4]) || + (a->c->v6 && !tcp_sock_ns[port][V6])) + tcp_sock_init_one(a->c, 1, port); + } + } else { + for (port = 0; port < USHRT_MAX; port++) { + if (!bitmap_isset(a->c->tcp.port_to_tap, port)) { + if (tcp_sock_init_ext[port][V4] > 0) { + close(tcp_sock_init_ext[port][V4]); + tcp_sock_init_ext[port][V4] = 0; + } + + if (tcp_sock_init_ext[port][V6] > 0) { + close(tcp_sock_init_ext[port][V6]); + tcp_sock_init_ext[port][V6] = 0; + } + + if (tcp_sock_init_lo[port][V4] > 0) { + close(tcp_sock_init_lo[port][V4]); + tcp_sock_init_lo[port][V4] = 0; + } + + if (tcp_sock_init_lo[port][V6] > 0) { + close(tcp_sock_init_lo[port][V6]); + tcp_sock_init_lo[port][V6] = 0; + } + continue; + } + + /* Don't loop back our own ports */ + if (bitmap_isset(a->c->tcp.port_to_init, port)) + continue; + + if ((a->c->v4 && !tcp_sock_init_ext[port][V4]) || + (a->c->v6 && !tcp_sock_init_ext[port][V6])) + tcp_sock_init_one(a->c, 0, port); + } + } + + return 0; +} + +/** * tcp_timer() - Scan activity bitmap for sockets waiting for timed events * @c: Execution context * @ts: Timestamp from caller @@ -3293,6 +3455,30 @@ void tcp_timer(struct ctx *c, struct timespec *now) struct tcp_sock_refill_arg refill_arg = { c, 0 }; int i; + if (c->mode == MODE_PASTA) { + if (timespec_diff_ms(now, &c->tcp.port_detect_ts) > + PORT_DETECT_INTERVAL) { + struct tcp_port_detect_arg detect_arg = { c, 0 }; + struct tcp_port_rebind_arg rebind_arg = { c, 0 }; + + if (c->tcp.init_detect_ports) { + detect_arg.detect_in_ns = 0; + tcp_port_detect(&detect_arg); + rebind_arg.bind_in_ns = 1; + NS_CALL(tcp_port_rebind, &rebind_arg); + } + + if (c->tcp.ns_detect_ports) { + detect_arg.detect_in_ns = 1; + NS_CALL(tcp_port_detect, &detect_arg); + rebind_arg.bind_in_ns = 0; + tcp_port_rebind(&rebind_arg); + } + + c->tcp.port_detect_ts = *now; + } + } + if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) { tcp_sock_refill(&refill_arg); if (c->mode == MODE_PASTA) { @@ -43,22 +43,28 @@ union tcp_epoll_ref { * @tap_conn_count: Count of tap connections in connection table * @splice_conn_count: Count of spliced connections in connection table * @port_to_tap: Ports bound host-side, packets to tap or spliced + * @init_detect_ports: If set, periodically detect ports bound in init * @port_to_init: Ports bound namespace-side, spliced to init + * @ns_detect_ports: If set, periodically detect ports bound in namespace * @timer_run: Timestamp of most recent timer run * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) * @pipe_size: Size of pipes for spliced connections * @refill_ts: Time of last refill operation for pools of sockets/pipes + * @port_detect_ts: Time of last TCP port detection/rebind, if enabled */ struct tcp_ctx { uint64_t hash_secret[2]; int tap_conn_count; int splice_conn_count; uint8_t port_to_tap [USHRT_MAX / 8]; + int init_detect_ports; uint8_t port_to_init [USHRT_MAX / 8]; + int ns_detect_ports; struct timespec timer_run; int kernel_snd_wnd; size_t pipe_size; struct timespec refill_ts; + struct timespec port_detect_ts; }; #endif /* TCP_H */ @@ -40,12 +40,16 @@ union udp_epoll_ref { /** * struct udp_ctx - Execution context for UDP * @port_to_tap: Ports bound host-side, data to tap or ns L4 socket + * @init_detect_ports: If set, periodically detect ports bound in init (TODO) * @port_to_init: Ports bound namespace-side, data to init L4 socket + * @ns_detect_ports: If set, periodically detect ports bound in namespace * @timer_run: Timestamp of most recent timer run */ struct udp_ctx { uint8_t port_to_tap [USHRT_MAX / 8]; + int init_detect_ports; uint8_t port_to_init [USHRT_MAX / 8]; + int ns_detect_ports; struct timespec timer_run; }; @@ -266,8 +266,9 @@ int bitmap_isset(uint8_t *map, int bit) * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs * @name: Corresponding name of file under /proc/net/ * @map: Bitmap where numbers of ports in listening state will be set + * @exclude: Bitmap of ports to exclude from setting (and clear) */ -void procfs_scan_listen(char *name, uint8_t *map) +void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude) { char line[200], path[PATH_MAX]; unsigned long port; @@ -288,7 +289,10 @@ void procfs_scan_listen(char *name, uint8_t *map) (strstr(name, "udp") && state != 0x07)) continue; - bitmap_set(map, port); + if (bitmap_isset(exclude, port)) + bitmap_clear(map, port); + else + bitmap_set(map, port); } fclose(fp); @@ -137,5 +137,5 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b); void bitmap_set(uint8_t *map, int bit); void bitmap_clear(uint8_t *map, int bit); int bitmap_isset(uint8_t *map, int bit); -void procfs_scan_listen(char *name, uint8_t *map); +void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude); int ns_enter(int target_pid); |