aboutgitcodebugslistschat
path: root/fwd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fwd.c')
-rw-r--r--fwd.c387
1 files changed, 375 insertions, 12 deletions
diff --git a/fwd.c b/fwd.c
index a235d13..0b7f8b1 100644
--- a/fwd.c
+++ b/fwd.c
@@ -25,6 +25,81 @@
#include "fwd.h"
#include "passt.h"
#include "lineread.h"
+#include "flow_table.h"
+
+/* Ephemeral port range: values from RFC 6335 */
+static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
+static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
+
+#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+
+/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
+ *
+ * Read the <min> TAB <max> line from PORT_RANGE_SYSCTL and record it for later
+ * use by fwd_port_is_ephemeral(). If we're unable to probe or parse it, warn
+ * and keep the current range (initially the one recommended by RFC 6335).
+ */
+void fwd_probe_ephemeral(void)
+{
+ char *line, *tab, *end;
+ struct lineread lr;
+ long min, max;
+ ssize_t len;
+ int fd;
+
+ fd = open(PORT_RANGE_SYSCTL, O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ warn_perror("Unable to open %s", PORT_RANGE_SYSCTL);
+ return;
+ }
+
+ lineread_init(&lr, fd);
+ len = lineread_get(&lr, &line);
+ close(fd);
+
+ if (len < 0)
+ goto parse_err;
+
+ tab = strchr(line, '\t');
+ if (!tab)
+ goto parse_err;
+ *tab = '\0'; /* Split the line so each bound parses as its own string */
+
+ errno = 0;
+ min = strtol(line, &end, 10);
+ if (*end || errno) /* Trailing characters, or strtol() set errno */
+ goto parse_err;
+
+ errno = 0;
+ max = strtol(tab + 1, &end, 10);
+ if (*end || errno) /* Same checks for the upper bound */
+ goto parse_err;
+
+ if (min < 0 || min >= (long)NUM_PORTS ||
+ max < 0 || max >= (long)NUM_PORTS)
+ goto parse_err;
+
+ fwd_ephemeral_min = min;
+ fwd_ephemeral_max = max;
+
+ return;
+
+parse_err:
+ warn("Unable to parse %s", PORT_RANGE_SYSCTL);
+}
+
+/**
+ * fwd_port_is_ephemeral() - Is port number ephemeral?
+ * @port: Port number
+ *
+ * Return: true if @port is within the recorded ephemeral range (inclusive), so
+ * it may be allocated by the kernel as a local port for outgoing
+ * connections or datagrams, but should not be used for binding services.
+ */
+bool fwd_port_is_ephemeral(in_port_t port)
+{
+ return (port >= fwd_ephemeral_min) && (port <= fwd_ephemeral_max);
+}
/* See enum in kernel's include/net/tcp_states.h */
#define UDP_LISTEN 0x07
@@ -38,7 +113,7 @@
* @exclude: Bitmap of ports to exclude from setting (and clear)
*
* #syscalls:pasta lseek
- * #syscalls:pasta ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek
+ * #syscalls:pasta ppc64le:_llseek ppc64:_llseek arm:_llseek
*/
static void procfs_scan_listen(int fd, unsigned int lstate,
uint8_t *map, const uint8_t *exclude)
@@ -52,7 +127,7 @@ static void procfs_scan_listen(int fd, unsigned int lstate,
return;
if (lseek(fd, 0, SEEK_SET)) {
- warn("lseek() failed on /proc/net file: %s", strerror(errno));
+ warn_perror("lseek() failed on /proc/net file");
return;
}
@@ -128,18 +203,18 @@ void fwd_scan_ports_init(struct ctx *c)
c->tcp.fwd_in.scan4 = c->tcp.fwd_in.scan6 = -1;
c->tcp.fwd_out.scan4 = c->tcp.fwd_out.scan6 = -1;
- c->udp.fwd_in.f.scan4 = c->udp.fwd_in.f.scan6 = -1;
- c->udp.fwd_out.f.scan4 = c->udp.fwd_out.f.scan6 = -1;
+ c->udp.fwd_in.scan4 = c->udp.fwd_in.scan6 = -1;
+ c->udp.fwd_out.scan4 = c->udp.fwd_out.scan6 = -1;
if (c->tcp.fwd_in.mode == FWD_AUTO) {
c->tcp.fwd_in.scan4 = open_in_ns(c, "/proc/net/tcp", flags);
c->tcp.fwd_in.scan6 = open_in_ns(c, "/proc/net/tcp6", flags);
fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
}
- if (c->udp.fwd_in.f.mode == FWD_AUTO) {
- c->udp.fwd_in.f.scan4 = open_in_ns(c, "/proc/net/udp", flags);
- c->udp.fwd_in.f.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
- fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f,
+ if (c->udp.fwd_in.mode == FWD_AUTO) {
+ c->udp.fwd_in.scan4 = open_in_ns(c, "/proc/net/udp", flags);
+ c->udp.fwd_in.scan6 = open_in_ns(c, "/proc/net/udp6", flags);
+ fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out,
&c->tcp.fwd_in, &c->tcp.fwd_out);
}
if (c->tcp.fwd_out.mode == FWD_AUTO) {
@@ -147,10 +222,298 @@ void fwd_scan_ports_init(struct ctx *c)
c->tcp.fwd_out.scan6 = open("/proc/net/tcp6", flags);
fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
}
- if (c->udp.fwd_out.f.mode == FWD_AUTO) {
- c->udp.fwd_out.f.scan4 = open("/proc/net/udp", flags);
- c->udp.fwd_out.f.scan6 = open("/proc/net/udp6", flags);
- fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f,
+ if (c->udp.fwd_out.mode == FWD_AUTO) {
+ c->udp.fwd_out.scan4 = open("/proc/net/udp", flags);
+ c->udp.fwd_out.scan6 = open("/proc/net/udp6", flags);
+ fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in,
&c->tcp.fwd_out, &c->tcp.fwd_in);
}
}
+
+/**
+ * is_dns_flow() - Determine if flow appears to be a DNS request
+ * @proto: Protocol (IP L4 protocol number)
+ * @ini: Flow address information of the initiating side
+ *
+ * Return: true if the flow appears to be directed at a DNS server, that is, a
+ * TCP or UDP flow whose @ini->oport is 53 (domain) or 853 (domain-s)
+ */
+static bool is_dns_flow(uint8_t proto, const struct flowside *ini)
+{
+ return ((proto == IPPROTO_UDP) || (proto == IPPROTO_TCP)) &&
+ ((ini->oport == 53) || (ini->oport == 853));
+}
+
+/**
+ * fwd_guest_accessible4() - Is IPv4 address guest-accessible
+ * @c: Execution context
+ * @addr: Host visible IPv4 address
+ *
+ * Return: true if @addr on the host is accessible to the guest without
+ * translation; false for loopback, 0.0.0.0, c->ip4.addr and c->ip4.addr_seen
+ */
+static bool fwd_guest_accessible4(const struct ctx *c,
+ const struct in_addr *addr)
+{
+ if (IN4_IS_ADDR_LOOPBACK(addr))
+ return false;
+
+ /* In socket interfaces 0.0.0.0 generally means "any" or unspecified,
+ * however on the wire it can mean "this host on this network". Since
+ * that has a different meaning for host and guest, we can't let it
+ * through untranslated.
+ */
+ if (IN4_IS_ADDR_UNSPECIFIED(addr))
+ return false;
+
+ /* For IPv4, addr_seen is initialised to addr, so is always a valid
+ * address
+ */
+ if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) ||
+ IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen))
+ return false;
+
+ return true;
+}
+
+/**
+ * fwd_guest_accessible6() - Is IPv6 address guest-accessible
+ * @c: Execution context
+ * @addr: Host visible IPv6 address
+ *
+ * Return: true if @addr on the host is accessible to the guest without
+ * translation; false for loopback, c->ip6.addr and a set c->ip6.addr_seen
+ */
+static bool fwd_guest_accessible6(const struct ctx *c,
+ const struct in6_addr *addr)
+{
+ if (IN6_IS_ADDR_LOOPBACK(addr))
+ return false;
+
+ if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr))
+ return false;
+
+ /* For IPv6, addr_seen starts unspecified, because we don't know what LL
+ * address the guest will take until we see it. Only check against it
+ * if it has been set to a real address.
+ */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_seen) &&
+ IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr_seen))
+ return false;
+
+ return true; /* NOTE(review): :: is not rejected here, unlike IPv4 - confirm */
+}
+
+/**
+ * fwd_guest_accessible() - Is IPv[46] address guest-accessible
+ * @c: Execution context
+ * @addr: Host visible IPv[46] address
+ *
+ * Return: true if @addr on the host is accessible to the guest without
+ * translation, false otherwise
+ */
+static bool fwd_guest_accessible(const struct ctx *c,
+ const union inany_addr *addr)
+{
+ const struct in_addr *a4 = inany_v4(addr);
+
+ if (a4) /* inany_v4() gave an IPv4 address: apply the IPv4 rules */
+ return fwd_guest_accessible4(c, a4);
+
+ return fwd_guest_accessible6(c, &addr->a6);
+}
+
+/**
+ * fwd_nat_from_tap() - Determine how to forward a flow from the tap interface
+ * @c: Execution context
+ * @proto: Protocol (IP L4 protocol number)
+ * @ini: Flow address information of the initiating side
+ * @tgt: Flow address information on the target side (updated)
+ *
+ * Return: pif of the target interface to forward the flow to, PIF_NONE if the
+ * flow cannot or should not be forwarded at all (here: always PIF_HOST).
+ */
+uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
+ const struct flowside *ini, struct flowside *tgt)
+{
+ if (is_dns_flow(proto, ini) &&
+ inany_equals4(&ini->oaddr, &c->ip4.dns_match))
+ tgt->eaddr = inany_from_v4(c->ip4.dns_host);
+ else if (is_dns_flow(proto, ini) &&
+ inany_equals6(&ini->oaddr, &c->ip6.dns_match))
+ tgt->eaddr.a6 = c->ip6.dns_host;
+ else if (inany_equals4(&ini->oaddr, &c->ip4.map_host_loopback))
+ tgt->eaddr = inany_loopback4;
+ else if (inany_equals6(&ini->oaddr, &c->ip6.map_host_loopback))
+ tgt->eaddr = inany_loopback6;
+ else if (inany_equals4(&ini->oaddr, &c->ip4.map_guest_addr))
+ tgt->eaddr = inany_from_v4(c->ip4.addr);
+ else if (inany_equals6(&ini->oaddr, &c->ip6.map_guest_addr))
+ tgt->eaddr.a6 = c->ip6.addr;
+ else
+ tgt->eaddr = ini->oaddr; /* No special mapping matched: copy as is */
+
+ tgt->eport = ini->oport;
+
+ /* The relevant addr_out controls the host side source address. This
+ * may be unspecified, which allows the kernel to pick an address.
+ */
+ if (inany_v4(&tgt->eaddr))
+ tgt->oaddr = inany_from_v4(c->ip4.addr_out);
+ else
+ tgt->oaddr.a6 = c->ip6.addr_out;
+
+ /* Let the kernel pick a host side source port */
+ tgt->oport = 0;
+ if (proto == IPPROTO_UDP) {
+ /* But for UDP we preserve the source port */
+ tgt->oport = ini->eport;
+ }
+
+ return PIF_HOST;
+}
+
+/**
+ * fwd_nat_from_splice() - Determine how to forward a flow from the splice interface
+ * @c: Execution context
+ * @proto: Protocol (IP L4 protocol number)
+ * @ini: Flow address information of the initiating side
+ * @tgt: Flow address information on the target side (updated)
+ *
+ * Return: pif of the target interface to forward the flow to, PIF_NONE if the
+ * flow cannot or should not be forwarded at all (here: PIF_HOST or PIF_NONE).
+ */
+uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
+ const struct flowside *ini, struct flowside *tgt)
+{
+ if (!inany_is_loopback(&ini->eaddr) ||
+ (!inany_is_loopback(&ini->oaddr) && !inany_is_unspecified(&ini->oaddr))) {
+ char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
+
+ debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu",
+ pif_name(PIF_SPLICE),
+ inany_ntop(&ini->eaddr, estr, sizeof(estr)), ini->eport,
+ inany_ntop(&ini->oaddr, fstr, sizeof(fstr)), ini->oport);
+ return PIF_NONE;
+ }
+
+ if (inany_v4(&ini->eaddr))
+ tgt->eaddr = inany_loopback4;
+ else
+ tgt->eaddr = inany_loopback6;
+
+ /* Preserve the specific loopback address used, but let the kernel pick
+ * a source port on the target side
+ */
+ tgt->oaddr = ini->eaddr;
+ tgt->oport = 0;
+
+ tgt->eport = ini->oport;
+ if (proto == IPPROTO_TCP)
+ tgt->eport += c->tcp.fwd_out.delta[tgt->eport];
+ else if (proto == IPPROTO_UDP)
+ tgt->eport += c->udp.fwd_out.delta[tgt->eport];
+
+ /* Let the kernel pick a host side source port (already 0 from above) */
+ tgt->oport = 0;
+ if (proto == IPPROTO_UDP)
+ /* But for UDP preserve the source port */
+ tgt->oport = ini->eport;
+
+ return PIF_HOST;
+}
+
+/**
+ * fwd_nat_from_host() - Determine how to forward a flow from the host interface
+ * @c: Execution context
+ * @proto: Protocol (IP L4 protocol number)
+ * @ini: Flow address information of the initiating side
+ * @tgt: Flow address information on the target side (updated)
+ *
+ * Return: pif of the target interface to forward the flow to (here PIF_SPLICE
+ * or PIF_TAP), PIF_NONE if the flow cannot or should not be forwarded.
+ */
+uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
+ const struct flowside *ini, struct flowside *tgt)
+{
+ /* Common for spliced and non-spliced cases */
+ tgt->eport = ini->oport;
+ if (proto == IPPROTO_TCP)
+ tgt->eport += c->tcp.fwd_in.delta[tgt->eport];
+ else if (proto == IPPROTO_UDP)
+ tgt->eport += c->udp.fwd_in.delta[tgt->eport];
+
+ if (c->mode == MODE_PASTA && inany_is_loopback(&ini->eaddr) &&
+ (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
+ /* spliceable */
+
+ /* The traffic will go over the guest's 'lo' interface, but by
+ * default use its external address, so we don't inadvertently
+ * expose services that listen only on the guest's loopback
+ * address. That can be overridden by --host-lo-to-ns-lo which
+ * will instead forward to the loopback address in the guest.
+ *
+ * In either case, let the kernel pick the source address to
+ * match.
+ */
+ if (inany_v4(&ini->eaddr)) {
+ if (c->host_lo_to_ns_lo)
+ tgt->eaddr = inany_loopback4;
+ else
+ tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
+ tgt->oaddr = inany_any4;
+ } else {
+ if (c->host_lo_to_ns_lo)
+ tgt->eaddr = inany_loopback6;
+ else
+ tgt->eaddr.a6 = c->ip6.addr_seen;
+ tgt->oaddr = inany_any6;
+ }
+
+ /* Let the kernel pick source port */
+ tgt->oport = 0;
+ if (proto == IPPROTO_UDP)
+ /* But for UDP preserve the source port */
+ tgt->oport = ini->eport;
+
+ return PIF_SPLICE;
+ }
+
+ if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_host_loopback) &&
+ inany_equals4(&ini->eaddr, &in4addr_loopback)) {
+ /* Specifically 127.0.0.1, not 127.0.0.0/8 */
+ tgt->oaddr = inany_from_v4(c->ip4.map_host_loopback);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
+ inany_equals6(&ini->eaddr, &in6addr_loopback)) {
+ tgt->oaddr.a6 = c->ip6.map_host_loopback;
+ } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+ inany_equals4(&ini->eaddr, &c->ip4.addr)) {
+ tgt->oaddr = inany_from_v4(c->ip4.map_guest_addr);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+ inany_equals6(&ini->eaddr, &c->ip6.addr)) {
+ tgt->oaddr.a6 = c->ip6.map_guest_addr;
+ } else if (!fwd_guest_accessible(c, &ini->eaddr)) {
+ if (inany_v4(&ini->eaddr)) {
+ if (IN4_IS_ADDR_UNSPECIFIED(&c->ip4.our_tap_addr))
+ /* No source address we can use */
+ return PIF_NONE;
+ tgt->oaddr = inany_from_v4(c->ip4.our_tap_addr);
+ } else {
+ tgt->oaddr.a6 = c->ip6.our_tap_ll;
+ }
+ } else {
+ tgt->oaddr = ini->eaddr; /* Usable by the guest as is: keep it */
+ }
+ tgt->oport = ini->eport;
+
+ if (inany_v4(&tgt->oaddr)) {
+ tgt->eaddr = inany_from_v4(c->ip4.addr_seen);
+ } else {
+ if (inany_is_linklocal6(&tgt->oaddr))
+ tgt->eaddr.a6 = c->ip6.addr_ll_seen;
+ else
+ tgt->eaddr.a6 = c->ip6.addr_seen;
+ }
+
+ return PIF_TAP;
+}