diff options
Diffstat (limited to 'udp.c')
| -rw-r--r-- | udp.c | 1758 |
1 files changed, 916 insertions, 842 deletions
@@ -15,79 +15,66 @@ /** * DOC: Theory of Operation * + * UDP Flows + * ========= * - * For UDP, a reduced version of port-based connection tracking is implemented - * with two purposes: - * - binding ephemeral ports when they're used as source port by the guest, so - * that replies on those ports can be forwarded back to the guest, with a - * fixed timeout for this binding - * - packets received from the local host get their source changed to a local - * address (gateway address) so that they can be forwarded to the guest, and - * packets sent as replies by the guest need their destination address to - * be changed back to the address of the local host. This is dynamic to allow - * connections from the gateway as well, and uses the same fixed 180s timeout - * - * Sockets for bound ports are created at initialisation time, one set for IPv4 - * and one for IPv6. + * UDP doesn't have true connections, but many protocols use a connection-like + * format. The flow is initiated by a client sending a datagram from a port of + * its choosing (usually ephemeral) to a specific port (usually well known) on a + * server. Both client and server address must be unicast. The server sends + * replies using the same addresses & ports with src/dest swapped. * - * Packets are forwarded back and forth, by prepending and stripping UDP headers - * in the obvious way, with no port translation. + * We track pseudo-connections of this type as flow table entries of type + * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts, + * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. * - * In PASTA mode, the L2-L4 translation is skipped for connections to ports - * bound between namespaces using the loopback interface, messages are directly - * transferred between L4 sockets instead. These are called spliced connections - * for consistency with the TCP implementation, but the splice() syscall isn't - * actually used as it wouldn't make sense for datagram-based connections: a - * pair of recvmmsg() and sendmmsg() deals with this case. + * NOTE: This won't handle multicast protocols, or some protocols with different + * port usage. We'll need specific logic if we want to handle those. + * + * "Listening" sockets + * =================== * - * The connection tracking for PASTA mode is slightly complicated by the absence - * of actual connections, see struct udp_splice_port, and these examples: + * UDP doesn't use listen(), but we consider long term sockets which are allowed + * to create new flows "listening" by analogy with TCP. This listening socket + * could receive packets from multiple flows, so we use a hash table match to + * find the specific flow for a datagram. * - * - from init to namespace: + * Flow sockets + * ============ * - * - forward direction: 127.0.0.1:5000 -> 127.0.0.1:80 in init from socket s, - * with epoll reference: index = 80, splice = 1, orig = 1, ns = 0 - * - if udp_splice_ns[V4][5000].sock: - * - send packet to udp_splice_ns[V4][5000].sock, with destination port - * 80 - * - otherwise: - * - create new socket udp_splice_ns[V4][5000].sock - * - bind in namespace to 127.0.0.1:5000 - * - add to epoll with reference: index = 5000, splice = 1, orig = 0, - * ns = 1 - * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with - * current time + * When a UDP flow targets a socket, we create a "flow" socket in + * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive + * replies on the target side. This socket is both bound and connected and has + * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams + * associated with this flow, so the epoll reference directly points to the flow + * and we don't need a hash lookup. * - * - reverse direction: 127.0.0.1:80 -> 127.0.0.1:5000 in namespace socket s, - * having epoll reference: index = 5000, splice = 1, orig = 0, ns = 1 - * - if udp_splice_init[V4][80].sock: - * - send to udp_splice_init[V4][80].sock, with destination port 5000 - * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with - * current time - * - otherwise, discard + * When a flow is initiated from a listening socket, we create a "flow" socket + * with the same bound address as the listening socket, but also connect()ed to + * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the + * lifetime of the flow, even if the original listening socket is closed due to + * port auto-probing. The duplicate is used to deliver replies back to the + * originating side. * - * - from namespace to init: + * NOTE: A flow socket can have a bound address overlapping with a listening + * socket. That will happen naturally for flows initiated from a socket, but is + * also possible (though unlikely) for tap initiated flows, depending on the + * source port. We assume datagrams for the flow will come to a connect()ed + * socket in preference to a listening socket. The sample program + * doc/platform-requirements/reuseaddr-priority.c documents and tests that + * assumption. * - * - forward direction: 127.0.0.1:2000 -> 127.0.0.1:22 in namespace from - * socket s, with epoll reference: index = 22, splice = 1, orig = 1, ns = 1 - * - if udp4_splice_init[V4][2000].sock: - * - send packet to udp_splice_init[V4][2000].sock, with destination - * port 22 - * - otherwise: - * - create new socket udp_splice_init[V4][2000].sock - * - bind in init to 127.0.0.1:2000 - * - add to epoll with reference: index = 2000, splice = 1, orig = 0, - * ns = 0 - * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with - * current time + * "Spliced" flows + * =============== * - * - reverse direction: 127.0.0.1:22 -> 127.0.0.1:2000 in init from socket s, - * having epoll reference: index = 2000, splice = 1, orig = 0, ns = 0 - * - if udp_splice_ns[V4][22].sock: - * - send to udp_splice_ns[V4][22].sock, with destination port 2000 - * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with - * current time - * - otherwise, discard + * In PASTA mode, L2-L4 translation is skipped for connections to ports bound + * between namespaces using the loopback interface, messages are directly + * transferred between L4 sockets instead. These are called spliced connections + * in analogy with the TCP implementation. The the splice() syscall isn't + * actually used; it doesn't make sense for datagrams and instead a pair of + * recvmmsg() and sendmmsg() is used to forward the datagrams. + * + * Note that a spliced flow will have two flow sockets (see above). */ #include <sched.h> @@ -102,17 +89,21 @@ #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include <netinet/icmp6.h> #include <stdint.h> #include <stddef.h> #include <string.h> -#include <sys/epoll.h> #include <sys/types.h> #include <sys/socket.h> #include <sys/uio.h> #include <time.h> +#include <arpa/inet.h> +#include <linux/errqueue.h> #include "checksum.h" #include "util.h" +#include "iov.h" #include "ip.h" #include "siphash.h" #include "inany.h" @@ -120,128 +111,87 @@ #include "tap.h" #include "pcap.h" #include "log.h" +#include "flow_table.h" +#include "udp_internal.h" +#include "udp_vu.h" +#include "epoll_ctl.h" -#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ -/** - * struct udp_tap_port - Port tracking based on tap-facing source port - * @sock: Socket bound to source port used as index - * @flags: Flags for recent activity type seen from/to port - * @ts: Activity timestamp from tap, used for socket aging - */ -struct udp_tap_port { - int sock; - uint8_t flags; -#define PORT_LOCAL BIT(0) /* Port was contacted from local address */ -#define PORT_LOOPBACK BIT(1) /* Port was contacted from loopback address */ -#define PORT_GUA BIT(2) /* Port was contacted from global unicast */ -#define PORT_DNS_FWD BIT(3) /* Port used as source for DNS remapped query */ - - time_t ts; -}; - -/** - * struct udp_splice_port - Bound socket for spliced communication - * @sock: Socket bound to index port - * @ts: Activity timestamp - */ -struct udp_splice_port { - int sock; - time_t ts; -}; - -/* Port tracking, arrays indexed by packet source port (host order) */ -static struct udp_tap_port udp_tap_map [IP_VERSIONS][NUM_PORTS]; +/* Maximum UDP data to be returned in ICMP messages */ +#define ICMP4_MAX_DLEN 8 +#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ + - sizeof(struct udphdr) \ + - sizeof(struct ipv6hdr)) /* "Spliced" sockets indexed by bound port (host order) */ -static struct udp_splice_port udp_splice_ns [IP_VERSIONS][NUM_PORTS]; -static struct udp_splice_port udp_splice_init[IP_VERSIONS][NUM_PORTS]; - -enum udp_act_type { - UDP_ACT_TAP, - UDP_ACT_SPLICE_NS, - UDP_ACT_SPLICE_INIT, - UDP_ACT_TYPE_MAX, -}; - -/* Activity-based aging for bindings */ -static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)]; +static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; +static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; /* Static buffers */ -/** - * udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @s_in: Source socket address, filled in by recvmmsg() - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for UDP header - * @data: Storage for UDP payload - */ -static struct udp4_l2_buf_t { - struct sockaddr_in s_in; +/* UDP header and data for inbound messages */ +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; - struct tap_hdr taph; - struct iphdr iph; - struct udphdr uh; - uint8_t data[USHRT_MAX - - (sizeof(struct iphdr) + sizeof(struct udphdr))]; -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -udp4_l2_buf[UDP_MAX_FRAMES]; +/* Ethernet headers for IPv4 and IPv6 frames */ +static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES]; /** - * udp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @s_in6: Source socket address, filled in by recvmmsg() - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @uh: Headroom for UDP header - * @data: Storage for UDP payload + * struct udp_meta_t - Pre-cooked headers for UDP packets + * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @taph: Tap backend specific header */ -struct udp6_l2_buf_t { - struct sockaddr_in6 s_in6; -#ifdef __AVX2__ - /* Align ip6h to 32-byte boundary. */ - uint8_t pad[64 - (sizeof(struct sockaddr_in6) + sizeof(struct ethhdr) + - sizeof(uint32_t))]; -#endif - - struct tap_hdr taph; +static struct udp_meta_t { struct ipv6hdr ip6h; - struct udphdr uh; - uint8_t data[USHRT_MAX - - (sizeof(struct ipv6hdr) + sizeof(struct udphdr))]; + struct iphdr ip4h; + struct tap_hdr taph; +} #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +__attribute__ ((aligned(32))) #endif -udp6_l2_buf[UDP_MAX_FRAMES]; +udp_meta[UDP_MAX_FRAMES]; -/* recvmmsg()/sendmmsg() data for tap */ -static struct iovec udp4_l2_iov_sock [UDP_MAX_FRAMES]; -static struct iovec udp6_l2_iov_sock [UDP_MAX_FRAMES]; +#define PKTINFO_SPACE \ + MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \ + CMSG_SPACE(sizeof(struct in6_pktinfo))) -static struct iovec udp4_l2_iov_tap [UDP_MAX_FRAMES]; -static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES]; +#define RECVERR_SPACE \ + MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in)), \ + CMSG_SPACE(sizeof(struct sock_extended_err) + \ + sizeof(struct sockaddr_in6))) -static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; +/** + * enum udp_iov_idx - Indices for the buffers making up a single UDP frame + * @UDP_IOV_TAP tap specific header + * @UDP_IOV_ETH Ethernet header + * @UDP_IOV_IP IP (v4/v6) header + * @UDP_IOV_PAYLOAD IP payload (UDP header + data) + * @UDP_IOV_ETH_PAD Ethernet (802.3) padding to 60 bytes + * @UDP_NUM_IOVS the number of entries in the iovec array + */ +enum udp_iov_idx { + UDP_IOV_TAP, + UDP_IOV_ETH, + UDP_IOV_IP, + UDP_IOV_PAYLOAD, + UDP_IOV_ETH_PAD, + UDP_NUM_IOVS, +}; -/* recvmmsg()/sendmmsg() data for "spliced" connections */ -static struct iovec udp4_iov_splice [UDP_MAX_FRAMES]; -static struct iovec udp6_iov_splice [UDP_MAX_FRAMES]; +/* IOVs and msghdr arrays for receiving datagrams from sockets */ +static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES]; -static struct sockaddr_in udp4_localname = { - .sin_family = AF_INET, - .sin_addr = IN4ADDR_LOOPBACK_INIT, -}; -static struct sockaddr_in6 udp6_localname = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, -}; +/* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ +static union sockaddr_inany udp_splice_to; + +static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +/* IOVs for L2 frames */ +static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; /** * udp_portmap_clear() - Clear UDP port map before configuration @@ -251,541 +201,795 @@ void udp_portmap_clear(void) unsigned i; for (i = 0; i < NUM_PORTS; i++) { - udp_tap_map[V4][i].sock = udp_tap_map[V6][i].sock = -1; - udp_splice_ns[V4][i].sock = udp_splice_ns[V6][i].sock = -1; - udp_splice_init[V4][i].sock = udp_splice_init[V6][i].sock = -1; - } -} - -/** - * udp_invert_portmap() - Compute reverse port translations for return packets - * @fwd: Port forwarding configuration to compute reverse map for - */ -static void udp_invert_portmap(struct udp_fwd_ports *fwd) -{ - unsigned int i; - - static_assert(ARRAY_SIZE(fwd->f.delta) == ARRAY_SIZE(fwd->rdelta), - "Forward and reverse delta arrays must have same size"); - for (i = 0; i < ARRAY_SIZE(fwd->f.delta); i++) { - in_port_t delta = fwd->f.delta[i]; - in_port_t rport = i + delta; - - if (delta) - fwd->rdelta[rport] = NUM_PORTS - delta; + udp_splice_ns[V4][i] = udp_splice_ns[V6][i] = -1; + udp_splice_init[V4][i] = udp_splice_init[V6][i] = -1; } } /** * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet destination address, NULL if unchanged - * @eth_s: Ethernet source address, NULL if unchanged */ -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) +void udp_update_l2_buf(const unsigned char *eth_d) { int i; - for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct udp4_l2_buf_t *b4 = &udp4_l2_buf[i]; - struct udp6_l2_buf_t *b6 = &udp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - } + for (i = 0; i < UDP_MAX_FRAMES; i++) + eth_update_mac(&udp_eth_hdr[i], eth_d, NULL); } /** - * udp_sock4_iov_init_one() - Initialise a scatter-gather L2 buffer for IPv4 + * udp_iov_init_one() - Initialise scatter-gather lists for one buffer * @c: Execution context * @i: Index of buffer to initialize */ -static void udp_sock4_iov_init_one(const struct ctx *c, size_t i) +static void udp_iov_init_one(const struct ctx *c, size_t i) { - struct msghdr *mh = &udp4_l2_mh_sock[i].msg_hdr; - struct udp4_l2_buf_t *buf = &udp4_l2_buf[i]; - struct iovec *siov = &udp4_l2_iov_sock[i]; - struct iovec *tiov = &udp4_l2_iov_tap[i]; - - *buf = (struct udp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_UDP) + struct udp_payload_t *payload = &udp_payload[i]; + struct msghdr *mh = &udp_mh_recv[i].msg_hdr; + struct udp_meta_t *meta = &udp_meta[i]; + struct iovec *siov = &udp_iov_recv[i]; + struct iovec *tiov = udp_l2_iov[i]; + + *meta = (struct udp_meta_t) { + .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP), + .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP), }; - siov->iov_base = buf->data; - siov->iov_len = sizeof(buf->data); + *siov = IOV_OF_LVALUE(payload->data); + + tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]); + tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); + tiov[UDP_IOV_PAYLOAD].iov_base = payload; + tiov[UDP_IOV_ETH_PAD].iov_base = eth_pad; - mh->msg_name = &buf->s_in; - mh->msg_namelen = sizeof(buf->s_in); mh->msg_iov = siov; mh->msg_iovlen = 1; - - tiov->iov_base = tap_frame_base(c, &buf->taph); } /** - * udp_sock6_iov_init_one() - Initialise a scatter-gather L2 buffer for IPv6 + * udp_iov_init() - Initialise scatter-gather L2 buffers * @c: Execution context - * @i: Index of buffer to initialize */ -static void udp_sock6_iov_init_one(const struct ctx *c, size_t i) +static void udp_iov_init(const struct ctx *c) { - struct msghdr *mh = &udp6_l2_mh_sock[i].msg_hdr; - struct udp6_l2_buf_t *buf = &udp6_l2_buf[i]; - struct iovec *siov = &udp6_l2_iov_sock[i]; - struct iovec *tiov = &udp6_l2_iov_tap[i]; - - *buf = (struct udp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP) - }; - - siov->iov_base = buf->data; - siov->iov_len = sizeof(buf->data); - - mh->msg_name = &buf->s_in6; - mh->msg_namelen = sizeof(buf->s_in6); - mh->msg_iov = siov; - mh->msg_iovlen = 1; + size_t i; - tiov->iov_base = tap_frame_base(c, &buf->taph); + for (i = 0; i < UDP_MAX_FRAMES; i++) + udp_iov_init_one(c, i); } /** - * udp_sock_iov_init() - Initialise scatter-gather L2 buffers - * @c: Execution context + * udp_update_hdr4() - Update headers for one IPv4 datagram + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @bp: Pointer to udp_payload_t to update + * @toside: Flowside for destination side + * @dlen: Length of UDP payload + * @no_udp_csum: Do not set UDP checksum + * + * Return: size of IPv4 payload (UDP header + data) */ -static void udp_sock_iov_init(const struct ctx *c) +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum) { - size_t i; - - for (i = 0; i < UDP_MAX_FRAMES; i++) { - if (c->ifi4) - udp_sock4_iov_init_one(c, i); - if (c->ifi6) - udp_sock6_iov_init_one(c, i); + const struct in_addr *src = inany_v4(&toside->oaddr); + const struct in_addr *dst = inany_v4(&toside->eaddr); + size_t l4len = dlen + sizeof(bp->uh); + size_t l3len = l4len + sizeof(*ip4h); + + ASSERT(src && dst); + + ip4h->tot_len = htons(l3len); + ip4h->daddr = dst->s_addr; + ip4h->saddr = src->s_addr; + ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst); + + bp->uh.source = htons(toside->oport); + bp->uh.dest = htons(toside->eport); + bp->uh.len = htons(l4len); + if (no_udp_csum) { + bp->uh.check = 0; + } else { + const struct iovec iov = { + .iov_base = bp->data, + .iov_len = dlen + }; + struct iov_tail data = IOV_TAIL(&iov, 1, 0); + csum_udp4(&bp->uh, *src, *dst, &data); } + + return l4len; } /** - * udp_splice_new() - Create and prepare socket for "spliced" binding - * @c: Execution context - * @v6: Set for IPv6 sockets - * @src: Source port of original connection, host order - * @ns: Does the splice originate in the ns or not - * - * Return: prepared socket, negative error code on failure + * udp_update_hdr6() - Update headers for one IPv6 datagram + * @ip6h: Pre-filled IPv6 header (except for payload_len and + * addresses) + * @bp: Pointer to udp_payload_t to update + * @toside: Flowside for destination side + * @dlen: Length of UDP payload + * @no_udp_csum: Do not set UDP checksum * - * #syscalls:pasta getsockname + * Return: size of IPv6 payload (UDP header + data) */ -int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns) +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, + const struct flowside *toside, size_t dlen, + bool no_udp_csum) { - struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP }; - union epoll_ref ref = { .type = EPOLL_TYPE_UDP, - .udp = { .splice = true, .v6 = v6, .port = src } - }; - struct udp_splice_port *sp; - int act, s; - - if (ns) { - ref.udp.pif = PIF_SPLICE; - sp = &udp_splice_ns[v6 ? V6 : V4][src]; - act = UDP_ACT_SPLICE_NS; + uint16_t l4len = dlen + sizeof(bp->uh); + + ip6h->payload_len = htons(l4len); + ip6h->daddr = toside->eaddr.a6; + ip6h->saddr = toside->oaddr.a6; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 255; + + bp->uh.source = htons(toside->oport); + bp->uh.dest = htons(toside->eport); + bp->uh.len = ip6h->payload_len; + if (no_udp_csum) { + /* 0 is an invalid checksum for UDP IPv6 and dropped by + * the kernel stack, even if the checksum is disabled by virtio + * flags. We need to put any non-zero value here. + */ + bp->uh.check = 0xffff; } else { - ref.udp.pif = PIF_HOST; - sp = &udp_splice_init[v6 ? V6 : V4][src]; - act = UDP_ACT_SPLICE_INIT; + const struct iovec iov = { + .iov_base = bp->data, + .iov_len = dlen + }; + struct iov_tail data = IOV_TAIL(&iov, 1, 0); + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data); } - s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, - IPPROTO_UDP); - - if (s > FD_REF_MAX) { - close(s); - return -EIO; - } + return l4len; +} - if (s < 0) - return s; +/** + * udp_tap_pad() - Calculate padding to send out of padding (zero) buffer + * @iov: Pointer to iovec of frame parts we're about to send + */ +static void udp_tap_pad(struct iovec *iov) +{ + size_t l2len = iov[UDP_IOV_ETH].iov_len + + iov[UDP_IOV_IP].iov_len + + iov[UDP_IOV_PAYLOAD].iov_len; - ref.fd = s; + if (l2len < ETH_ZLEN) + iov[UDP_IOV_ETH_PAD].iov_len = ETH_ZLEN - l2len; + else + iov[UDP_IOV_ETH_PAD].iov_len = 0; +} - if (v6) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(src), - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; - if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6))) - goto fail; +/** + * udp_tap_prepare() - Convert one datagram into a tap frame + * @mmh: Receiving mmsghdr array + * @idx: Index of the datagram to prepare + * @tap_omac: MAC address of remote endpoint as seen from the guest + * @toside: Flowside for destination side + * @no_udp_csum: Do not set UDP checksum + */ +static void udp_tap_prepare(const struct mmsghdr *mmh, + unsigned int idx, + const uint8_t *tap_omac, + const struct flowside *toside, + bool no_udp_csum) +{ + struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; + struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base; + struct udp_payload_t *bp = &udp_payload[idx]; + struct udp_meta_t *bm = &udp_meta[idx]; + size_t l4len, l2len; + + eth_update_mac(eh, NULL, tap_omac); + if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { + l4len = udp_update_hdr6(&bm->ip6h, bp, toside, + mmh[idx].msg_len, no_udp_csum); + + l2len = MAX(l4len + sizeof(bm->ip6h) + ETH_HLEN, ETH_ZLEN); + tap_hdr_update(&bm->taph, l2len); + + eh->h_proto = htons_constant(ETH_P_IPV6); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(src), - .sin_addr = IN4ADDR_LOOPBACK_INIT, - }; - if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4))) - goto fail; - } + l4len = udp_update_hdr4(&bm->ip4h, bp, toside, + mmh[idx].msg_len, no_udp_csum); - sp->sock = s; - bitmap_set(udp_act[v6 ? V6 : V4][act], src); + l2len = MAX(l4len + sizeof(bm->ip4h) + ETH_HLEN, ETH_ZLEN); + tap_hdr_update(&bm->taph, l2len); - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); - return s; + eh->h_proto = htons_constant(ETH_P_IP); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); + } + (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; -fail: - close(s); - return -1; + udp_tap_pad(*tap_iov); } /** - * struct udp_splice_new_ns_arg - Arguments for udp_splice_new_ns() + * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer * @c: Execution context - * @v6: Set for IPv6 - * @src: Source port of originating datagram, host order - * @dst: Destination port of originating datagram, host order - * @s: Newly created socket or negative error code + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 8) of original UDP message body + * @dlen: Length of the read part of original UDP message body */ -struct udp_splice_new_ns_arg { - const struct ctx *c; - int v6; - in_port_t src; - int s; -}; +static void udp_send_tap_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + struct in_addr saddr, + const void *in, size_t dlen) +{ + struct in_addr oaddr = toside->oaddr.v4mapped.a4; + struct in_addr eaddr = toside->eaddr.v4mapped.a4; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + union inany_addr saddr_any; + uint8_t tap_omac[ETH_ALEN]; + struct { + struct icmphdr icmp4h; + struct iphdr ip4h; + struct udphdr uh; + char data[ICMP4_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP4_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp4h.type = ee->ee_type; + msg.icmp4h.code = ee->ee_code; + if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED) + msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP); + tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + /* Try to obtain the MAC address of the generating node */ + saddr_any = inany_from_v4(saddr); + fwd_neigh_mac_get(c, &saddr_any, tap_omac); + tap_icmp4_send(c, saddr, eaddr, &msg, tap_omac, msglen); +} + /** - * udp_splice_new_ns() - Enter namespace and call udp_splice_new() - * @arg: See struct udp_splice_new_ns_arg - * - * Return: 0 + * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @toside: Destination side of flow + * @saddr: Address of ICMP generating node + * @in: First bytes (max 1232) of original UDP message body + * @dlen: Length of the read part of original UDP message body + * @flow: IPv6 flow identifier */ -static int udp_splice_new_ns(void *arg) +static void udp_send_tap_icmp6(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + const struct in6_addr *saddr, + void *in, size_t dlen, uint32_t flow) { - struct udp_splice_new_ns_arg *a; - - a = (struct udp_splice_new_ns_arg *)arg; - - ns_enter(a->c); - - a->s = udp_splice_new(a->c, a->v6, a->src, true); - - return 0; + const struct in6_addr *oaddr = &toside->oaddr.a6; + const struct in6_addr *eaddr = &toside->eaddr.a6; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + uint8_t tap_omac[ETH_ALEN]; + struct { + struct icmp6_hdr icmp6h; + struct ipv6hdr ip6h; + struct udphdr uh; + char data[ICMP6_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + size_t l4len = dlen + sizeof(struct udphdr); + + ASSERT(dlen <= ICMP6_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp6h.icmp6_type = ee->ee_type; + msg.icmp6h.icmp6_code = ee->ee_code; + if (ee->ee_type == ICMP6_PACKET_TOO_BIG) + msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow); + tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + /* Try to obtain the MAC address of the generating node */ + fwd_neigh_mac_get(c, (union inany_addr *) saddr, tap_omac); + tap_icmp6_send(c, saddr, eaddr, &msg, tap_omac, msglen); } /** - * udp_mmh_splice_port() - Is source address of message suitable for splicing? - * @v6: Is @sa a sockaddr_in6 (otherwise sockaddr_in)? - * @mmh: mmsghdr of incoming message + * udp_pktinfo() - Retrieve packet destination address from cmsg + * @msg: msghdr into which message has been received + * @dst: (Local) destination address of message in @msg (output) * - * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from - * @sa in host order, otherwise -1. + * Return: 0 on success, -1 if the information was missing (@dst is set to + * inany_any6). */ -static int udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh) +static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) { - const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name; - const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name; + struct cmsghdr *hdr; - if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) - return ntohs(sa6->sin6_port); + for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) { + if (hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_PKTINFO) { + const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr); - if (!v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) - return ntohs(sa4->sin_port); + *dst = inany_from_v4(i4->ipi_addr); + return 0; + } + if (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_PKTINFO) { + const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr); + + dst->a6 = i6->ipi6_addr; + return 0; + } + } + + debug("Missing PKTINFO cmsg on datagram"); + *dst = inany_any6; return -1; } /** - * udp_splice_sendfrom() - Send datagrams from given port to given port + * udp_sock_recverr() - Receive and clear an error from a socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf - * @n: Number of datagrams to send - * @src: Datagrams will be sent from this port (on origin side) - * @dst: Datagrams will be send to this port (on destination side) - * @from_pif: pif from which the packet originated - * @v6: Send as IPv6? - * @allow_new: If true create sending socket if needed, if false discard - * if no sending socket is available - * @now: Timestamp + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) + * + * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 + * if there was an error reading the queue + * + * #syscalls recvmsg */ -static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, - in_port_t src, in_port_t dst, uint8_t from_pif, - bool v6, bool allow_new, - const struct timespec *now) +static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { - struct mmsghdr *mmh_recv, *mmh_send; - unsigned int i; - int s; + char buf[PKTINFO_SPACE + RECVERR_SPACE]; + const struct sock_extended_err *ee; + char data[ICMP6_MAX_DLEN]; + struct cmsghdr *hdr; + struct iovec iov = { + .iov_base = data, + .iov_len = sizeof(data) + }; + union sockaddr_inany src; + struct msghdr mh = { + .msg_name = &src, + .msg_namelen = sizeof(src), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + }; + const struct flowside *fromside, *toside; + union inany_addr offender, otap; + char astr[INANY_ADDRSTRLEN]; + char sastr[SOCKADDR_STRLEN]; + const struct in_addr *o4; + in_port_t offender_port; + struct udp_flow *uflow; + uint8_t topif; + size_t dlen; + ssize_t rc; + + rc = recvmsg(s, &mh, MSG_ERRQUEUE); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + err_perror("UDP: Failed to read error queue"); + return -1; + } - if (v6) { - mmh_recv = udp6_l2_mh_sock; - mmh_send = udp6_mh_splice; - } else { - mmh_recv = udp4_l2_mh_sock; - mmh_send = udp4_mh_splice; + if (!(mh.msg_flags & MSG_ERRQUEUE)) { + err("Missing MSG_ERRQUEUE flag reading error queue"); + return -1; } - if (from_pif == PIF_SPLICE) { - src += c->udp.fwd_in.rdelta[src]; - s = udp_splice_init[v6][src].sock; - if (s < 0 && allow_new) - s = udp_splice_new(c, v6, src, false); + for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) { + if ((hdr->cmsg_level == IPPROTO_IP && + hdr->cmsg_type == IP_RECVERR) || + (hdr->cmsg_level == IPPROTO_IPV6 && + hdr->cmsg_type == IPV6_RECVERR)) + break; + } - if (s < 0) - return; + if (!hdr) { + err("Missing RECVERR cmsg in error queue"); + return -1; + } - udp_splice_ns[v6][dst].ts = now->tv_sec; - udp_splice_init[v6][src].ts = now->tv_sec; - } else { - ASSERT(from_pif == PIF_HOST); - src += c->udp.fwd_out.rdelta[src]; - s = udp_splice_ns[v6][src].sock; - if (s < 0 && allow_new) { - struct udp_splice_new_ns_arg arg = { - c, v6, src, -1, - }; - - NS_CALL(udp_splice_new_ns, &arg); - s = arg.s; + ee = (const struct sock_extended_err *)CMSG_DATA(hdr); + + debug("%s error on UDP socket %i: %s", + str_ee_origin(ee), s, strerror_(ee->ee_errno)); + + if (!flow_sidx_valid(sidx)) { + /* No hint from the socket, determine flow from addresses */ + union inany_addr dst; + + if (udp_pktinfo(&mh, &dst) < 0) { + debug("Missing PKTINFO on UDP error"); + return 1; } - if (s < 0) - return; - udp_splice_init[v6][dst].ts = now->tv_sec; - udp_splice_ns[v6][src].ts = now->tv_sec; + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port); + if (!flow_sidx_valid(sidx)) { + debug("Ignoring UDP error without flow"); + return 1; + } + } else { + pif = pif_at_sidx(sidx); + } + + uflow = udp_at_sidx(sidx); + ASSERT(uflow); + fromside = &uflow->f.side[sidx.sidei]; + toside = &uflow->f.side[!sidx.sidei]; + topif = uflow->f.pif[!sidx.sidei]; + dlen = rc; + + if (inany_from_sockaddr(&offender, &offender_port, + SO_EE_OFFENDER(ee)) < 0) + goto fail; + + if (pif != PIF_HOST || topif != PIF_TAP) + /* XXX Can we support any other cases? */ + goto fail; + + /* If the offender *is* the endpoint, make sure our translation is + * consistent with the flow's translation. This matters if the flow + * endpoint has a port specific translation (like --dns-match). + */ + if (inany_equals(&offender, &fromside->eaddr)) + otap = toside->oaddr; + else if (!nat_inbound(c, &offender, &otap)) + goto fail; + + if (hdr->cmsg_level == IPPROTO_IP && + (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen); + return 1; } - for (i = start; i < start + n; i++) - mmh_send[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len; + if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) { + udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen, + FLOW_IDX(uflow)); + return 1; + } - sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL); +fail: + flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s", + str_ee_origin(ee), + pif_name(pif), + sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)), + pif_name(topif), + inany_ntop(&toside->eaddr, astr, sizeof(astr))); + return 1; } /** - * udp_update_hdr4() - Update headers for one IPv4 datagram + * udp_sock_errs() - Process errors on a socket * @c: Execution context - * @b: Pointer to udp4_l2_buf to update - * @dstport: Destination port number - * @datalen: Length of UDP payload - * @now: Current timestamp + * @s: Socket to receive errors from + * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown + * @pif: Interface on which the error occurred + * (only used if @sidx == FLOW_SIDX_NONE) + * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) * - * Return: size of tap frame with headers + * Return: number of errors handled, or < 0 if we have an unrecoverable error */ -static size_t udp_update_hdr4(const struct ctx *c, struct udp4_l2_buf_t *b, - in_port_t dstport, size_t datalen, - const struct timespec *now) +static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx, + uint8_t pif, in_port_t port) { - size_t ip_len = datalen + sizeof(b->iph) + sizeof(b->uh); - in_port_t srcport = ntohs(b->s_in.sin_port); - struct in_addr src = b->s_in.sin_addr; - - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && - IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && - (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { - src = c->ip4.dns_match; - } else if (IN4_IS_ADDR_LOOPBACK(&src) || - IN4_ARE_ADDR_EQUAL(&src, &c->ip4.addr_seen)) { - udp_tap_map[V4][srcport].ts = now->tv_sec; - udp_tap_map[V4][srcport].flags |= PORT_LOCAL; - - if (IN4_IS_ADDR_LOOPBACK(&src)) - udp_tap_map[V4][srcport].flags |= PORT_LOOPBACK; - else - udp_tap_map[V4][srcport].flags &= ~PORT_LOOPBACK; - - bitmap_set(udp_act[V4][UDP_ACT_TAP], srcport); - - src = c->ip4.gw; + unsigned n_err = 0; + socklen_t errlen; + int rc, err; + + ASSERT(!c->no_udp); + + /* Empty the error queue */ + while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0) + n_err += rc; + + if (rc < 0) + return -1; /* error reading error, unrecoverable */ + + errlen = sizeof(err); + if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 || + errlen != sizeof(err)) { + err_perror("Error reading SO_ERROR"); + return -1; /* error reading error, unrecoverable */ } - b->iph.tot_len = htons(ip_len); - b->iph.daddr = c->ip4.addr_seen.s_addr; - b->iph.saddr = src.s_addr; - b->iph.check = csum_ip4_header(b->iph.tot_len, IPPROTO_UDP, - src, c->ip4.addr_seen); + if (err) { + debug("Unqueued error on UDP socket %i: %s", s, strerror_(err)); + n_err++; + } - b->uh.source = b->s_in.sin_port; - b->uh.dest = htons(dstport); - b->uh.len = htons(datalen + sizeof(b->uh)); + if (!n_err) { + /* EPOLLERR, but no errors to clear !? */ + err("EPOLLERR event without reported errors on socket %i", s); + return -1; /* no way to clear, unrecoverable */ + } - return tap_frame_len(c, &b->taph, ip_len); + return n_err; } /** - * udp_update_hdr6() - Update headers for one IPv6 datagram - * @c: Execution context - * @b: Pointer to udp6_l2_buf to update - * @dstport: Destination port number - * @datalen: Length of UDP payload - * @now: Current timestamp + * udp_peek_addr() - Get source address for next packet + * @s: Socket to get information from + * @src: Socket address (output) + * @dst: (Local) destination address (output) * - * Return: size of tap frame with headers + * Return: 0 if no more packets, 1 on success, -ve error code on error */ -static size_t udp_update_hdr6(const struct ctx *c, struct udp6_l2_buf_t *b, - in_port_t dstport, size_t datalen, - const struct timespec *now) +static int udp_peek_addr(int s, union sockaddr_inany *src, + union inany_addr *dst) { - const struct in6_addr *src = &b->s_in6.sin6_addr; - const struct in6_addr *dst = &c->ip6.addr_seen; - uint16_t payload_len = datalen + sizeof(b->uh); - in_port_t srcport = ntohs(b->s_in6.sin6_port); - - if (IN6_IS_ADDR_LINKLOCAL(src)) { - dst = &c->ip6.addr_ll_seen; - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) && - IN6_ARE_ADDR_EQUAL(src, &c->ip6.dns_host) && - srcport == 53 && - (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { - src = &c->ip6.dns_match; - } else if (IN6_IS_ADDR_LOOPBACK(src) || - IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) { - udp_tap_map[V6][srcport].ts = now->tv_sec; - udp_tap_map[V6][srcport].flags |= PORT_LOCAL; - - if (IN6_IS_ADDR_LOOPBACK(src)) - udp_tap_map[V6][srcport].flags |= PORT_LOOPBACK; - else - udp_tap_map[V6][srcport].flags &= ~PORT_LOOPBACK; - - if (IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) - udp_tap_map[V6][srcport].flags |= PORT_GUA; - else - udp_tap_map[V6][srcport].flags &= ~PORT_GUA; - - bitmap_set(udp_act[V6][UDP_ACT_TAP], srcport); - - dst = &c->ip6.addr_ll_seen; - - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - src = &c->ip6.gw; - else - src = &c->ip6.addr_ll; + char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN]; + char cmsg[PKTINFO_SPACE]; + struct msghdr msg = { + .msg_name = src, + .msg_namelen = sizeof(*src), + .msg_control = cmsg, + .msg_controllen = sizeof(cmsg), + }; + int rc; + rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + return -errno; } - b->ip6h.payload_len = htons(payload_len); - b->ip6h.daddr = *dst; - b->ip6h.saddr = *src; - b->ip6h.version = 6; - b->ip6h.nexthdr = IPPROTO_UDP; - b->ip6h.hop_limit = 255; + udp_pktinfo(&msg, dst); - b->uh.source = b->s_in6.sin6_port; - b->uh.dest = htons(dstport); - b->uh.len = b->ip6h.payload_len; - csum_udp6(&b->uh, src, dst, b->data, datalen); + trace("Peeked UDP datagram: %s -> %s", + sockaddr_ntop(src, sastr, sizeof(sastr)), + inany_ntop(dst, dstr, sizeof(dstr))); - return tap_frame_len(c, &b->taph, payload_len + sizeof(b->ip6h)); + return 1; } /** - * udp_tap_send() - Prepare UDP datagrams and send to tap interface + * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf pool - * @n: Number of datagrams to send - * @dstport: Destination port number - * @v6: True if using IPv6 - * @now: Current timestamp + * @s: Socket to receive from + * @mmh: mmsghdr array to receive into + * @n: Maximum number of datagrams to receive * - * Return: size of tap frame with headers + * Return: number of datagrams received + * + * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64 */ -static void udp_tap_send(const struct ctx *c, - unsigned int start, unsigned int n, - in_port_t dstport, bool v6, const struct timespec *now) +static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n) { - struct iovec *tap_iov; - unsigned int i; + ASSERT(!c->no_udp); - if (v6) - tap_iov = udp6_l2_iov_tap; - else - tap_iov = udp4_l2_iov_tap; + n = recvmmsg(s, mmh, n, 0, NULL); + if (n < 0) { + trace("Error receiving datagrams: %s", strerror_(errno)); + /* Bail out and let the EPOLLERR handler deal with it */ + return 0; + } - for (i = start; i < start + n; i++) { - size_t buf_len; + return n; +} - if (v6) - buf_len = udp_update_hdr6(c, &udp6_l2_buf[i], dstport, - udp6_l2_mh_sock[i].msg_len, now); - else - buf_len = udp_update_hdr4(c, &udp4_l2_buf[i], dstport, - udp4_l2_mh_sock[i].msg_len, now); +/** + * udp_sock_to_sock() - Forward datagrams from socket to socket + * @c: Execution context + * @from_s: Socket to receive datagrams from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward datagrams to + * + * #syscalls sendmmsg + */ +static void udp_sock_to_sock(const struct ctx *c, int from_s, int n, + flow_sidx_t tosidx) +{ + const struct flowside *toside = flowside_at_sidx(tosidx); + const struct udp_flow *uflow = udp_at_sidx(tosidx); + uint8_t topif = pif_at_sidx(tosidx); + int to_s = uflow->s[tosidx.sidei]; + int i; + + if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0) + return; - tap_iov[i].iov_len = buf_len; + for (i = 0; i < n; i++) { + udp_mh_splice[i].msg_hdr.msg_iov->iov_len + = udp_mh_recv[i].msg_len; } - tap_send_frames(c, tap_iov + start, 1, n); + pif_sockaddr(c, &udp_splice_to, topif, + &toside->eaddr, toside->eport); + + sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL); } /** - * udp_sock_handler() - Handle new data from socket + * udp_buf_sock_to_tap() - Forward datagrams from socket to tap * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * @now: Current timestamp - * - * #syscalls recvmmsg + * @s: Socket to read data from + * @n: Maximum number of datagrams to forward + * @tosidx: Flow & side to forward data from @s to */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n, + flow_sidx_t tosidx) { - /* For not entirely clear reasons (data locality?) pasta gets - * better throughput if we receive tap datagrams one at a - * atime. For small splice datagrams throughput is slightly - * better if we do batch, but it's slightly worse for large - * splice datagrams. Since we don't know before we receive - * whether we'll use tap or splice, always go one at a time - * for pasta mode. - */ - ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1); - in_port_t dstport = ref.udp.port; - bool v6 = ref.udp.v6; - struct mmsghdr *mmh_recv; - int i, m; + const struct flowside *toside = flowside_at_sidx(tosidx); + struct udp_flow *uflow = udp_at_sidx(tosidx); + uint8_t *omac = uflow->f.tap_omac; + int i; - if (c->no_udp || !(events & EPOLLIN)) + if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0) return; - if (ref.udp.pif == PIF_SPLICE) - dstport += c->udp.fwd_out.f.delta[dstport]; - else if (ref.udp.pif == PIF_HOST) - dstport += c->udp.fwd_in.f.delta[dstport]; + /* Find if neighbour table has a recorded MAC address */ + if (MAC_IS_UNDEF(omac)) + fwd_neigh_mac_get(c, &toside->oaddr, omac); - if (v6) { - mmh_recv = udp6_l2_mh_sock; - udp6_localname.sin6_port = htons(dstport); - } else { - mmh_recv = udp4_l2_mh_sock; - udp4_localname.sin_port = htons(dstport); + for (i = 0; i < n; i++) + udp_tap_prepare(udp_mh_recv, i, omac, toside, false); + + tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); +} + +/** + * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket + * @c: Execution context + * @s: Socket to forward from + * @frompif: Interface to which @s belongs + * @port: Our (local) port number of @s + * @now: Current timestamp + */ +void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif, + in_port_t port, const struct timespec *now) +{ + union sockaddr_inany src; + union inany_addr dst; + int rc; + + while ((rc = udp_peek_addr(s, &src, &dst)) != 0) { + bool discard = false; + flow_sidx_t tosidx; + uint8_t topif; + + if (rc < 0) { + trace("Error peeking at socket address: %s", + strerror_(-rc)); + /* Clear errors & carry on */ + if (udp_sock_errs(c, s, FLOW_SIDX_NONE, + frompif, port) < 0) { + err( +"UDP: Unrecoverable error on listening socket: (%s port %hu)", + pif_name(frompif), port); + /* FIXME: what now? close/re-open socket? */ + } + continue; + } + + tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now); + topif = pif_at_sidx(tosidx); + + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, s, 1, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) + udp_vu_sock_to_tap(c, s, 1, tosidx); + else + udp_buf_sock_to_tap(c, s, 1, tosidx); + } else if (flow_sidx_valid(tosidx)) { + struct udp_flow *uflow = udp_at_sidx(tosidx); + + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + discard = true; + } else { + debug("Discarding datagram without flow"); + discard = true; + } + + if (discard) { + struct msghdr msg = { 0 }; + + if (recvmsg(s, &msg, MSG_DONTWAIT) < 0) + debug_perror("Failed to discard datagram"); + } } +} - n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL); - if (n <= 0) - return; +/** + * udp_listen_sock_handler() - Handle new data from socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp + */ +void udp_listen_sock_handler(const struct ctx *c, + union epoll_ref ref, uint32_t events, + const struct timespec *now) +{ + if (events & (EPOLLERR | EPOLLIN)) + udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now); +} - for (i = 0; i < n; i += m) { - int splicefrom = -1; - m = n; +/** + * udp_sock_handler() - Handle new data from flow specific socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp + */ +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + struct udp_flow *uflow = udp_at_sidx(ref.flowside); - if (ref.udp.splice) { - splicefrom = udp_mmh_splice_port(v6, mmh_recv + i); + ASSERT(!c->no_udp && uflow); - for (m = 1; i + m < n; m++) { - int p; + if (events & EPOLLERR) { + if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) { + flow_err(uflow, "Unrecoverable error on flow socket"); + goto fail; + } + } - p = udp_mmh_splice_port(v6, mmh_recv + i + m); - if (p != splicefrom) - break; + if (events & EPOLLIN) { + /* For not entirely clear reasons (data locality?) pasta gets + * better throughput if we receive tap datagrams one at a + * time. For small splice datagrams throughput is slightly + * better if we do batch, but it's slightly worse for large + * splice datagrams. Since we don't know the size before we + * receive, always go one at a time for pasta mode. + */ + size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); + uint8_t topif = pif_at_sidx(tosidx); + int s = ref.fd; + + flow_trace(uflow, "Received data on reply socket"); + uflow->ts = now->tv_sec; + + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, ref.fd, n, tosidx); + } else if (topif == PIF_TAP) { + if (c->mode == MODE_VU) { + udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES, + tosidx); + } else { + udp_buf_sock_to_tap(c, s, n, tosidx); } + } else { + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(ref.flowside)), + pif_name(topif)); + goto fail; } - - if (splicefrom >= 0) - udp_splice_sendfrom(c, i, m, splicefrom, dstport, - ref.udp.pif, v6, ref.udp.orig, now); - else - udp_tap_send(c, i, m, dstport, v6, now); } + return; + +fail: + flow_err_details(uflow); + udp_flow_close(c, uflow); } /** @@ -795,6 +999,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address + * @ttl: TTL or hop limit for packets to be sent in this call * @p: Pool of UDP packets, with UDP headers * @idx: Index of first packet to process * @now: Current timestamp @@ -803,25 +1008,30 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, * * #syscalls sendmmsg */ -int udp_tap_handler(struct ctx *c, uint8_t pif, +int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, - const struct pool *p, int idx, const struct timespec *now) + uint8_t ttl, const struct pool *p, int idx, + const struct timespec *now) { + const struct flowside *toside; struct mmsghdr mm[UIO_MAXIOV]; + union sockaddr_inany to_sa; struct iovec m[UIO_MAXIOV]; - struct sockaddr_in6 s_in6; - struct sockaddr_in s_in; + struct udphdr uh_storage; const struct udphdr *uh; - struct sockaddr *sa; - int i, s, count = 0; + struct udp_flow *uflow; + int i, j, s, count = 0; + struct iov_tail data; + flow_sidx_t tosidx; in_port_t src, dst; - socklen_t sl; + uint8_t topif; - (void)c; - (void)saddr; - (void)pif; + ASSERT(!c->no_udp); + + if (!packet_get(p, idx, &data)) + return 1; - uh = packet_get(p, idx, 0, sizeof(*uh), NULL); + uh = IOV_PEEK_HEADER(&data, uh_storage); if (!uh) return 1; @@ -831,131 +1041,56 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, src = ntohs(uh->source); dst = ntohs(uh->dest); - if (af == AF_INET) { - s_in = (struct sockaddr_in) { - .sin_family = AF_INET, - .sin_port = uh->dest, - .sin_addr = *(struct in_addr *)daddr, - }; + tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now); + if (!(uflow = udp_at_sidx(tosidx))) { + char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; - sa = (struct sockaddr *)&s_in; - sl = sizeof(s_in); - - if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.dns_match) && - ntohs(s_in.sin_port) == 53) { - s_in.sin_addr = c->ip4.dns_host; - udp_tap_map[V4][src].ts = now->tv_sec; - udp_tap_map[V4][src].flags |= PORT_DNS_FWD; - bitmap_set(udp_act[V4][UDP_ACT_TAP], src); - } else if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.gw) && - !c->no_map_gw) { - if (!(udp_tap_map[V4][dst].flags & PORT_LOCAL) || - (udp_tap_map[V4][dst].flags & PORT_LOOPBACK)) - s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - else - s_in.sin_addr = c->ip4.addr_seen; - } - - debug("UDP from tap src=%hu dst=%hu, s=%d", - src, dst, udp_tap_map[V4][src].sock); - if ((s = udp_tap_map[V4][src].sock) < 0) { - struct in_addr bind_addr = IN4ADDR_ANY_INIT; - union udp_epoll_ref uref = { - .port = src, - .pif = PIF_HOST, - }; - const char *bind_if = NULL; - - if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr)) - bind_if = c->ip4.ifname_out; - - if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr)) - bind_addr = c->ip4.addr_out; - - s = sock_l4(c, AF_INET, IPPROTO_UDP, &bind_addr, - bind_if, src, uref.u32); - if (s < 0) - return p->count - idx; - - udp_tap_map[V4][src].sock = s; - bitmap_set(udp_act[V4][UDP_ACT_TAP], src); - } - - udp_tap_map[V4][src].ts = now->tv_sec; - } else { - s_in6 = (struct sockaddr_in6) { - .sin6_family = AF_INET6, - .sin6_port = uh->dest, - .sin6_addr = *(struct in6_addr *)daddr, - }; - const struct in6_addr *bind_addr = &in6addr_any; - - sa = (struct sockaddr *)&s_in6; - sl = sizeof(s_in6); - - if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) && - ntohs(s_in6.sin6_port) == 53) { - s_in6.sin6_addr = c->ip6.dns_host; - udp_tap_map[V6][src].ts = now->tv_sec; - udp_tap_map[V6][src].flags |= PORT_DNS_FWD; - bitmap_set(udp_act[V6][UDP_ACT_TAP], src); - } else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) && - !c->no_map_gw) { - if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) || - (udp_tap_map[V6][dst].flags & PORT_LOOPBACK)) - s_in6.sin6_addr = in6addr_loopback; - else if (udp_tap_map[V6][dst].flags & PORT_GUA) - s_in6.sin6_addr = c->ip6.addr; - else - s_in6.sin6_addr = c->ip6.addr_seen; - } else if (IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) { - bind_addr = &c->ip6.addr_ll; - } - - if ((s = udp_tap_map[V6][src].sock) < 0) { - union udp_epoll_ref uref = { - .v6 = 1, - .port = src, - .pif = PIF_HOST, - }; - const char *bind_if = NULL; + debug("Dropping datagram with no flow %s %s:%hu -> %s:%hu", + pif_name(pif), + inet_ntop(af, saddr, sstr, sizeof(sstr)), src, + inet_ntop(af, daddr, dstr, sizeof(dstr)), dst); + return 1; + } - if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr)) - bind_if = c->ip6.ifname_out; + topif = pif_at_sidx(tosidx); + if (topif != PIF_HOST) { + flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); + uint8_t frompif = pif_at_sidx(fromsidx); - if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr) && - !IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) - bind_addr = &c->ip6.addr_out; + flow_err(uflow, "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + return 1; + } + toside = flowside_at_sidx(tosidx); - s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, - bind_if, src, uref.u32); - if (s < 0) - return p->count - idx; + s = uflow->s[tosidx.sidei]; + ASSERT(s >= 0); - udp_tap_map[V6][src].sock = s; - bitmap_set(udp_act[V6][UDP_ACT_TAP], src); - } + pif_sockaddr(c, &to_sa, topif, &toside->eaddr, toside->eport); - udp_tap_map[V6][src].ts = now->tv_sec; - } + for (i = 0, j = 0; i < (int)p->count - idx && j < UIO_MAXIOV; i++) { + const struct udphdr *uh_send; - for (i = 0; i < (int)p->count - idx; i++) { - struct udphdr *uh_send; - size_t len; + if (!packet_get(p, idx + i, &data)) + return p->count - idx; - uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len); + uh_send = IOV_REMOVE_HEADER(&data, uh_storage); if (!uh_send) return p->count - idx; - mm[i].msg_hdr.msg_name = sa; - mm[i].msg_hdr.msg_namelen = sl; + mm[i].msg_hdr.msg_name = &to_sa; + mm[i].msg_hdr.msg_namelen = socklen_inany(&to_sa); - if (len) { - m[i].iov_base = (char *)(uh_send + 1); - m[i].iov_len = len; + if (data.cnt) { + int cnt; - mm[i].msg_hdr.msg_iov = m + i; - mm[i].msg_hdr.msg_iovlen = 1; + cnt = iov_tail_clone(&m[j], UIO_MAXIOV - j, &data); + if (cnt < 0) + return p->count - idx; + + mm[i].msg_hdr.msg_iov = &m[j]; + mm[i].msg_hdr.msg_iovlen = cnt; + j += cnt; } else { mm[i].msg_hdr.msg_iov = NULL; mm[i].msg_hdr.msg_iovlen = 0; @@ -965,6 +1100,24 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, mm[i].msg_hdr.msg_controllen = 0; mm[i].msg_hdr.msg_flags = 0; + if (ttl != uflow->ttl[tosidx.sidei]) { + uflow->ttl[tosidx.sidei] = ttl; + if (af == AF_INET) { + if (setsockopt(s, IPPROTO_IP, IP_TTL, + &ttl, sizeof(ttl)) < 0) + flow_perror(uflow, + "setsockopt IP_TTL"); + } else { + /* IPv6 hop_limit cannot be only 1 byte */ + int hop_limit = ttl; + + if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS, + &hop_limit, sizeof(hop_limit)) < 0) + flow_perror(uflow, + "setsockopt IPV6_UNICAST_HOPS"); + } + } + count++; } @@ -976,66 +1129,60 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, } /** - * udp_sock_init() - Initialise listening sockets for a given port + * udp_listen() - Initialise listening socket for a given port * @c: Execution context - * @ns: In pasta mode, if set, bind with loopback address in namespace - * @af: Address family to select a specific IP version, or AF_UNSPEC + * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE) * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * - * Return: 0 on (partial) success, negative error code on (complete) failure + * Return: 0 on success, negative error code on failure */ -int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, - const void *addr, const char *ifname, in_port_t port) +int udp_listen(const struct ctx *c, uint8_t pif, + const union inany_addr *addr, const char *ifname, in_port_t port) { - union udp_epoll_ref uref = { .splice = (c->mode == MODE_PASTA), - .orig = true, .port = port }; - int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; - - if (ns) - uref.pif = PIF_SPLICE; - else - uref.pif = PIF_HOST; - - if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { - uref.v6 = 0; + union udp_listen_epoll_ref uref = { + .pif = pif, + .port = port, + }; + int (*socks)[NUM_PORTS]; + int s; - if (!ns) { - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, - ifname, port, uref.u32); + ASSERT(!c->no_udp); - udp_tap_map[V4][uref.port].sock = s < 0 ? -1 : s; - udp_splice_init[V4][port].sock = s < 0 ? -1 : s; - } else { - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, - &in4addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V4][port].sock = s < 0 ? -1 : s; - } + if (pif == PIF_HOST) { + socks = udp_splice_init; + } else { + ASSERT(pif == PIF_SPLICE); + socks = udp_splice_ns; } - if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { - uref.v6 = 1; - - if (!ns) { - r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, - ifname, port, uref.u32); - - udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s; - udp_splice_init[V6][port].sock = s < 0 ? -1 : s; - } else { - r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, - &in6addr_loopback, - ifname, port, uref.u32); - udp_splice_ns[V6][port].sock = s < 0 ? -1 : s; - } + if (!c->ifi4) { + if (!addr) + /* Restrict to v6 only */ + addr = &inany_any6; + else if (inany_v4(addr)) + /* Nothing to do */ + return 0; + } + if (!c->ifi6) { + if (!addr) + /* Restrict to v4 only */ + addr = &inany_any4; + else if (!inany_v4(addr)) + /* Nothing to do */ + return 0; } - if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) - return 0; + s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, pif, + addr, ifname, port, uref.u32); + + if (!addr || inany_v4(addr)) + socks[V4][port] = s < 0 ? -1 : s; + if (!addr || !inany_v4(addr)) + socks[V6][port] = s < 0 ? -1 : s; - return r4 < 0 ? r4 : r6; + return s < 0 ? s : 0; } /** @@ -1046,74 +1193,36 @@ static void udp_splice_iov_init(void) int i; for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr; - struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr; + struct msghdr *mh = &udp_mh_splice[i].msg_hdr; - mh4->msg_name = &udp4_localname; - mh4->msg_namelen = sizeof(udp4_localname); + mh->msg_name = &udp_splice_to; + mh->msg_namelen = sizeof(udp_splice_to); - mh6->msg_name = &udp6_localname; - mh6->msg_namelen = sizeof(udp6_localname); + udp_iov_splice[i].iov_base = udp_payload[i].data; - udp4_iov_splice[i].iov_base = udp4_l2_buf[i].data; - udp6_iov_splice[i].iov_base = udp6_l2_buf[i].data; - - mh4->msg_iov = &udp4_iov_splice[i]; - mh6->msg_iov = &udp6_iov_splice[i]; - mh4->msg_iovlen = mh6->msg_iovlen = 1; + mh->msg_iov = &udp_iov_splice[i]; + mh->msg_iovlen = 1; } } /** - * udp_timer_one() - Handler for timed events on one port + * udp_ns_listen() - Init socket to listen for spliced outbound connections * @c: Execution context - * @v6: Set for IPv6 connections - * @type: Socket type - * @port: Port number, host order - * @now: Current timestamp + * @port: Port, host order */ -static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, - in_port_t port, const struct timespec *now) +static void udp_ns_listen(const struct ctx *c, in_port_t port) { - struct udp_splice_port *sp; - struct udp_tap_port *tp; - int *sockp = NULL; - - switch (type) { - case UDP_ACT_TAP: - tp = &udp_tap_map[v6 ? V6 : V4][port]; - - if (now->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { - sockp = &tp->sock; - tp->flags = 0; - } - - break; - case UDP_ACT_SPLICE_INIT: - sp = &udp_splice_init[v6 ? V6 : V4][port]; - - if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - sockp = &sp->sock; - - break; - case UDP_ACT_SPLICE_NS: - sp = &udp_splice_ns[v6 ? V6 : V4][port]; - - if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - sockp = &sp->sock; + ASSERT(!c->no_udp); - break; - default: + if (!c->no_bindtodevice) { + udp_listen(c, PIF_SPLICE, NULL, "lo", port); return; } - if (sockp && *sockp >= 0) { - int s = *sockp; - *sockp = -1; - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); - close(s); - bitmap_clear(udp_act[v6 ? V6 : V4][type], port); - } + if (c->ifi4) + udp_listen(c, PIF_SPLICE, &inany_loopback4, NULL, port); + if (c->ifi6) + udp_listen(c, PIF_SPLICE, &inany_loopback6, NULL, port); } /** @@ -1125,36 +1234,33 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, */ static void udp_port_rebind(struct ctx *c, bool outbound) { + int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init; const uint8_t *fmap - = outbound ? c->udp.fwd_out.f.map : c->udp.fwd_in.f.map; - const uint8_t *rmap - = outbound ? c->udp.fwd_in.f.map : c->udp.fwd_out.f.map; - struct udp_splice_port (*socks)[NUM_PORTS] - = outbound ? udp_splice_ns : udp_splice_init; + = outbound ? c->udp.fwd_out.map : c->udp.fwd_in.map; unsigned port; for (port = 0; port < NUM_PORTS; port++) { if (!bitmap_isset(fmap, port)) { - if (socks[V4][port].sock >= 0) { - close(socks[V4][port].sock); - socks[V4][port].sock = -1; + if (socks[V4][port] >= 0) { + close(socks[V4][port]); + socks[V4][port] = -1; } - if (socks[V6][port].sock >= 0) { - close(socks[V6][port].sock); - socks[V6][port].sock = -1; + if (socks[V6][port] >= 0) { + close(socks[V6][port]); + socks[V6][port] = -1; } continue; } - /* Don't loop back our own ports */ - if (bitmap_isset(rmap, port)) - continue; - - if ((c->ifi4 && socks[V4][port].sock == -1) || - (c->ifi6 && socks[V6][port].sock == -1)) - udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port); + if ((c->ifi4 && socks[V4][port] == -1) || + (c->ifi6 && socks[V6][port] == -1)) { + if (outbound) + udp_ns_listen(c, port); + else + udp_listen(c, PIF_HOST, NULL, NULL, port); + } } } @@ -1177,49 +1283,18 @@ static int udp_port_rebind_outbound(void *arg) } /** - * udp_timer() - Scan activity bitmaps for ports with associated timed events + * udp_port_rebind_all() - Rebind ports to match forward maps (in host & ns) * @c: Execution context - * @now: Current timestamp */ -void udp_timer(struct ctx *c, const struct timespec *now) +void udp_port_rebind_all(struct ctx *c) { - int n, t, v6 = 0; - unsigned int i; - long *word, tmp; - - if (c->mode == MODE_PASTA) { - if (c->udp.fwd_out.f.mode == FWD_AUTO) { - fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f, - &c->tcp.fwd_out, &c->tcp.fwd_in); - NS_CALL(udp_port_rebind_outbound, c); - } + ASSERT(c->mode == MODE_PASTA && !c->no_udp); - if (c->udp.fwd_in.f.mode == FWD_AUTO) { - fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f, - &c->tcp.fwd_in, &c->tcp.fwd_out); - udp_port_rebind(c, false); - } - } - - if (!c->ifi4) - v6 = 1; -v6: - for (t = 0; t < UDP_ACT_TYPE_MAX; t++) { - word = (long *)udp_act[v6 ? V6 : V4][t]; - for (i = 0; i < ARRAY_SIZE(udp_act[0][0]); - i += sizeof(long), word++) { - tmp = *word; - while ((n = ffsl(tmp))) { - tmp &= ~(1UL << (n - 1)); - udp_timer_one(c, v6, t, i * 8 + n - 1, now); - } - } - } + if (c->udp.fwd_out.mode == FWD_AUTO) + NS_CALL(udp_port_rebind_outbound, c); - if (!v6 && c->ifi6) { - v6 = 1; - goto v6; - } + if (c->udp.fwd_in.mode == FWD_AUTO) + udp_port_rebind(c, false); } /** @@ -1230,10 +1305,9 @@ v6: */ int udp_init(struct ctx *c) { - udp_sock_iov_init(c); + ASSERT(!c->no_udp); - udp_invert_portmap(&c->udp.fwd_in); - udp_invert_portmap(&c->udp.fwd_out); + udp_iov_init(c); if (c->mode == MODE_PASTA) { udp_splice_iov_init(); |
