// SPDX-License-Identifier: GPL-2.0-or-later /* PASST - Plug A Simple Socket Transport * for qemu/UNIX domain socket mode * * PASTA - Pack A Subtle Tap Abstraction * for network namespace/tap device mode * * udp.c - UDP L2-L4 translation routines * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio */ /** * DOC: Theory of Operation * * UDP Flows * ========= * * UDP doesn't have true connections, but many protocols use a connection-like * format. The flow is initiated by a client sending a datagram from a port of * its choosing (usually ephemeral) to a specific port (usually well known) on a * server. Both client and server address must be unicast. The server sends * replies using the same addresses & ports with src/dest swapped. * * We track pseudo-connections of this type as flow table entries of type * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts, * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. * * NOTE: This won't handle multicast protocols, or some protocols with different * port usage. We'll need specific logic if we want to handle those. * * "Listening" sockets * =================== * * UDP doesn't use listen(), but we consider long term sockets which are allowed * to create new flows "listening" by analogy with TCP. This listening socket * could receive packets from multiple flows, so we use a hash table match to * find the specific flow for a datagram. * * When a UDP flow is initiated from a listening socket we take a duplicate of * the socket and store it in uflow->s[INISIDE]. This will last for the * lifetime of the flow, even if the original listening socket is closed due to * port auto-probing. The duplicate is used to deliver replies back to the * originating side. * * Reply sockets * ============= * * When a UDP flow targets a socket, we create a "reply" socket in * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive * replies on the target side. This socket is both bound and connected and has * EPOLL_TYPE_UDP_REPLY. The connect() means it will only receive datagrams * associated with this flow, so the epoll reference directly points to the flow * and we don't need a hash lookup. * * NOTE: it's possible that the reply socket could have a bound address * overlapping with an unrelated listening socket. We assume datagrams for the * flow will come to the reply socket in preference to a listening socket. The * sample program doc/platform-requirements/reuseaddr-priority.c documents and * tests that assumption. * * "Spliced" flows * =============== * * In PASTA mode, L2-L4 translation is skipped for connections to ports bound * between namespaces using the loopback interface, messages are directly * transferred between L4 sockets instead. These are called spliced connections * in analogy with the TCP implementation. The the splice() syscall isn't * actually used; it doesn't make sense for datagrams and instead a pair of * recvmmsg() and sendmmsg() is used to forward the datagrams. * * Note that a spliced flow will have *both* a duplicated listening socket and a * reply socket (see above). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "checksum.h" #include "util.h" #include "iov.h" #include "ip.h" #include "siphash.h" #include "inany.h" #include "passt.h" #include "tap.h" #include "pcap.h" #include "log.h" #include "flow_table.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ /* "Spliced" sockets indexed by bound port (host order) */ static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; /* Static buffers */ /** * struct udp_payload_t - UDP header and data for inbound messages * @uh: UDP header * @data: UDP data */ static struct udp_payload_t { struct udphdr uh; char data[USHRT_MAX - sizeof(struct udphdr)]; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif udp_payload[UDP_MAX_FRAMES]; /* Ethernet header for IPv4 frames */ static struct ethhdr udp4_eth_hdr; /* Ethernet header for IPv6 frames */ static struct ethhdr udp6_eth_hdr; /** * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @taph: Tap backend specific header * @s_in: Source socket address, filled in by recvmmsg() * @tosidx: sidx for the destination side of this datagram's flow */ static struct udp_meta_t { struct ipv6hdr ip6h; struct iphdr ip4h; struct tap_hdr taph; union sockaddr_inany s_in; flow_sidx_t tosidx; } #ifdef __AVX2__ __attribute__ ((aligned(32))) #endif udp_meta[UDP_MAX_FRAMES]; /** * enum udp_iov_idx - Indices for the buffers making up a single UDP frame * @UDP_IOV_TAP tap specific header * @UDP_IOV_ETH Ethernet header * @UDP_IOV_IP IP (v4/v6) header * @UDP_IOV_PAYLOAD IP payload (UDP header + data) * @UDP_NUM_IOVS the number of entries in the iovec array */ enum udp_iov_idx { UDP_IOV_TAP = 0, UDP_IOV_ETH = 1, UDP_IOV_IP = 2, UDP_IOV_PAYLOAD = 3, UDP_NUM_IOVS }; /* IOVs and msghdr arrays for receiving datagrams from sockets */ static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; static struct mmsghdr udp4_mh_recv [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_recv [UDP_MAX_FRAMES]; /* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ static union sockaddr_inany udp_splice_to; static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; /* IOVs for L2 frames */ static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; /** * udp_portmap_clear() - Clear UDP port map before configuration */ void udp_portmap_clear(void) { unsigned i; for (i = 0; i < NUM_PORTS; i++) { udp_splice_ns[V4][i] = udp_splice_ns[V6][i] = -1; udp_splice_init[V4][i] = udp_splice_init[V6][i] = -1; } } /** * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged */ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { eth_update_mac(&udp4_eth_hdr, eth_d, eth_s); eth_update_mac(&udp6_eth_hdr, eth_d, eth_s); } /** * udp_iov_init_one() - Initialise scatter-gather lists for one buffer * @c: Execution context * @i: Index of buffer to initialize */ static void udp_iov_init_one(const struct ctx *c, size_t i) { struct udp_payload_t *payload = &udp_payload[i]; struct udp_meta_t *meta = &udp_meta[i]; struct iovec *siov = &udp_iov_recv[i]; struct iovec *tiov = udp_l2_iov[i]; *meta = (struct udp_meta_t) { .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP), .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP), }; *siov = IOV_OF_LVALUE(payload->data); tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; /* It's useful to have separate msghdr arrays for receiving. Otherwise, * an IPv4 recv() will alter msg_namelen, so we'd have to reset it every * time or risk truncating the address on future IPv6 recv()s. */ if (c->ifi4) { struct msghdr *mh = &udp4_mh_recv[i].msg_hdr; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in); mh->msg_iov = siov; mh->msg_iovlen = 1; } if (c->ifi6) { struct msghdr *mh = &udp6_mh_recv[i].msg_hdr; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in6); mh->msg_iov = siov; mh->msg_iovlen = 1; } } /** * udp_iov_init() - Initialise scatter-gather L2 buffers * @c: Execution context */ static void udp_iov_init(const struct ctx *c) { size_t i; udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP); udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); for (i = 0; i < UDP_MAX_FRAMES; i++) udp_iov_init_one(c, i); } /** * udp_at_sidx() - Get UDP specific flow at given sidx * @sidx: Flow and side to retrieve * * Return: UDP specific flow at @sidx, or NULL of @sidx is invalid. Asserts if * the flow at @sidx is not FLOW_UDP. */ struct udp_flow *udp_at_sidx(flow_sidx_t sidx) { union flow *flow = flow_at_sidx(sidx); if (!flow) return NULL; ASSERT(flow->f.type == FLOW_UDP); return &flow->udp; } /* * udp_flow_close() - Close and clean up UDP flow * @c: Execution context * @uflow: UDP flow */ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) { if (uflow->s[INISIDE] >= 0) { /* The listening socket needs to stay in epoll */ close(uflow->s[INISIDE]); uflow->s[INISIDE] = -1; } if (uflow->s[TGTSIDE] >= 0) { /* But the flow specific one needs to be removed */ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, uflow->s[TGTSIDE], NULL); close(uflow->s[TGTSIDE]); uflow->s[TGTSIDE] = -1; } flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); if (!pif_is_socket(uflow->f.pif[TGTSIDE])) flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); } /** * udp_flow_new() - Common setup for a new UDP flow * @c: Execution context * @flow: Initiated flow * @s_ini: Initiating socket (or -1) * @now: Timestamp * * Return: UDP specific flow, if successful, NULL on failure */ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, int s_ini, const struct timespec *now) { const struct flowside *ini = &flow->f.side[INISIDE]; struct udp_flow *uflow = NULL; const struct flowside *tgt; uint8_t tgtpif; if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { flow_trace(flow, "Invalid endpoint to initiate UDP flow"); goto cancel; } if (!(tgt = flow_target(c, flow, IPPROTO_UDP))) goto cancel; tgtpif = flow->f.pif[TGTSIDE]; uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); uflow->ts = now->tv_sec; uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; if (s_ini >= 0) { /* When using auto port-scanning the listening port could go * away, so we need to duplicate the socket */ uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0); if (uflow->s[INISIDE] < 0) { flow_err(uflow, "Couldn't duplicate listening socket: %s", strerror(errno)); goto cancel; } } if (pif_is_socket(tgtpif)) { struct mmsghdr discard[UIO_MAXIOV] = { 0 }; union { flow_sidx_t sidx; uint32_t data; } fref = { .sidx = FLOW_SIDX(flow, TGTSIDE), }; int rc; uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY, tgtpif, tgt, fref.data); if (uflow->s[TGTSIDE] < 0) { flow_dbg(uflow, "Couldn't open socket for spliced flow: %s", strerror(errno)); goto cancel; } if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) { flow_dbg(uflow, "Couldn't connect flow socket: %s", strerror(errno)); goto cancel; } /* It's possible, if unlikely, that we could receive some * unrelated packets in between the bind() and connect() of this * socket. For now we just discard these. We could consider * trying to redirect these to an appropriate handler, if we * need to. */ rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard), MSG_DONTWAIT, NULL); if (rc >= ARRAY_SIZE(discard)) { flow_dbg(uflow, "Too many (%d) spurious reply datagrams", rc); goto cancel; } else if (rc > 0) { flow_trace(uflow, "Discarded %d spurious reply datagrams", rc); } else if (errno != EAGAIN) { flow_err(uflow, "Unexpected error discarding datagrams: %s", strerror(errno)); } } flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); /* If the target side is a socket, it will be a reply socket that knows * its own flowside. But if it's tap, then we need to look it up by * hash. */ if (!pif_is_socket(tgtpif)) flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE)); FLOW_ACTIVATE(uflow); return FLOW_SIDX(uflow, TGTSIDE); cancel: if (uflow) udp_flow_close(c, uflow); flow_alloc_cancel(flow); return FLOW_SIDX_NONE; } /** * udp_flow_from_sock() - Find or create UDP flow for "listening" socket * @c: Execution context * @ref: epoll reference of the receiving socket * @meta: Metadata buffer for the datagram * @now: Timestamp * * #syscalls fcntl * * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. */ static flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, struct udp_meta_t *meta, const struct timespec *now) { struct udp_flow *uflow; union flow *flow; flow_sidx_t sidx; ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN); sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, &meta->s_in, ref.udp.port); if ((uflow = udp_at_sidx(sidx))) { uflow->ts = now->tv_sec; return flow_sidx_opposite(sidx); } if (!(flow = flow_alloc())) { char sastr[SOCKADDR_STRLEN]; debug("Couldn't allocate flow for UDP datagram from %s %s", pif_name(ref.udp.pif), sockaddr_ntop(&meta->s_in, sastr, sizeof(sastr))); return FLOW_SIDX_NONE; } flow_initiate_sa(flow, ref.udp.pif, &meta->s_in, ref.udp.port); return udp_flow_new(c, flow, ref.fd, now); } /** * udp_splice_prepare() - Prepare one datagram for splicing * @mmh: Receiving mmsghdr array * @idx: Index of the datagram to prepare */ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) { udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len; } /** * udp_splice_send() - Send a batch of datagrams from socket to socket * @c: Execution context * @start: Index of batch's first datagram in udp[46]_l2_buf * @n: Number of datagrams in batch * @src: Source port for datagram (target side) * @dst: Destination port for datagrams (target side) * @ref: epoll reference for origin socket * @now: Timestamp */ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, flow_sidx_t tosidx) { const struct flowside *toside = flowside_at_sidx(tosidx); const struct udp_flow *uflow = udp_at_sidx(tosidx); uint8_t topif = pif_at_sidx(tosidx); int s = uflow->s[tosidx.sidei]; socklen_t sl; pif_sockaddr(c, &udp_splice_to, &sl, topif, &toside->eaddr, toside->eport); sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL); } /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @bp: Pointer to udp_payload_t to update * @toside: Flowside for destination side * @dlen: Length of UDP payload * * Return: size of IPv4 payload (UDP header + data) */ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen) { const struct in_addr *src = inany_v4(&toside->faddr); const struct in_addr *dst = inany_v4(&toside->eaddr); size_t l4len = dlen + sizeof(bp->uh); size_t l3len = l4len + sizeof(*ip4h); ASSERT(src && dst); ip4h->tot_len = htons(l3len); ip4h->daddr = dst->s_addr; ip4h->saddr = src->s_addr; ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst); bp->uh.source = htons(toside->fport); bp->uh.dest = htons(toside->eport); bp->uh.len = htons(l4len); csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) * @bp: Pointer to udp_payload_t to update * @toside: Flowside for destination side * @dlen: Length of UDP payload * * Return: size of IPv6 payload (UDP header + data) */ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, const struct flowside *toside, size_t dlen) { uint16_t l4len = dlen + sizeof(bp->uh); ip6h->payload_len = htons(l4len); ip6h->daddr = toside->eaddr.a6; ip6h->saddr = toside->faddr.a6; ip6h->version = 6; ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 255; bp->uh.source = htons(toside->fport); bp->uh.dest = htons(toside->eport); bp->uh.len = ip6h->payload_len; csum_udp6(&bp->uh, &toside->faddr.a6, &toside->eaddr.a6, bp->data, dlen); return l4len; } /** * udp_tap_prepare() - Convert one datagram into a tap frame * @mmh: Receiving mmsghdr array * @idx: Index of the datagram to prepare * @toside: Flowside for destination side */ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, const struct flowside *toside) { struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; struct udp_payload_t *bp = &udp_payload[idx]; struct udp_meta_t *bm = &udp_meta[idx]; size_t l4len; if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->faddr)) { l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); } (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; } /** * udp_sock_recverr() - Receive and clear an error from a socket * @s: Socket to receive from * * Return: true if errors received and processed, false if no more errors * * #syscalls recvmsg */ static bool udp_sock_recverr(int s) { const struct sock_extended_err *ee; const struct cmsghdr *hdr; char buf[CMSG_SPACE(sizeof(*ee))]; struct msghdr mh = { .msg_name = NULL, .msg_namelen = 0, .msg_iov = NULL, .msg_iovlen = 0, .msg_control = buf, .msg_controllen = sizeof(buf), }; ssize_t rc; rc = recvmsg(s, &mh, MSG_ERRQUEUE); if (rc < 0) { if (errno != EAGAIN && errno != EWOULDBLOCK) err_perror("Failed to read error queue"); return false; } if (!(mh.msg_flags & MSG_ERRQUEUE)) { err("Missing MSG_ERRQUEUE flag reading error queue"); return false; } hdr = CMSG_FIRSTHDR(&mh); if (!((hdr->cmsg_level == IPPROTO_IP && hdr->cmsg_type == IP_RECVERR) || (hdr->cmsg_level == IPPROTO_IPV6 && hdr->cmsg_type == IPV6_RECVERR))) { err("Unexpected cmsg reading error queue"); return false; } ee = (const struct sock_extended_err *)CMSG_DATA(hdr); /* TODO: When possible propagate and otherwise handle errors */ debug("%s error on UDP socket %i: %s", str_ee_origin(ee), s, strerror(ee->ee_errno)); return true; } /** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context * @s: Socket to receive from * @events: epoll events bitmap * @mmh mmsghdr array to receive into * * #syscalls recvmmsg */ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, struct mmsghdr *mmh) { /* For not entirely clear reasons (data locality?) pasta gets better * throughput if we receive tap datagrams one at a atime. For small * splice datagrams throughput is slightly better if we do batch, but * it's slightly worse for large splice datagrams. Since we don't know * before we receive whether we'll use tap or splice, always go one at a * time for pasta mode. */ int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); ASSERT(!c->no_udp); /* Clear any errors first */ if (events & EPOLLERR) { while (udp_sock_recverr(s)) ; } if (!(events & EPOLLIN)) return 0; n = recvmmsg(s, mmh, n, 0, NULL); if (n < 0) { err_perror("Error receiving datagrams"); return 0; } return n; } /** * udp_listen_sock_handler() - Handle new data from socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp * * #syscalls recvmmsg */ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv; int n, i; if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) return; /* We divide datagrams into batches based on how we need to send them, * determined by udp_meta[i].tosidx. To avoid either two passes through * the array, or recalculating tosidx for a single entry, we have to * populate it one entry *ahead* of the loop counter. */ udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0], now); for (i = 0; i < n; ) { flow_sidx_t batchsidx = udp_meta[i].tosidx; uint8_t batchpif = pif_at_sidx(batchsidx); int batchstart = i; do { if (pif_is_socket(batchpif)) { udp_splice_prepare(mmh_recv, i); } else if (batchpif == PIF_TAP) { udp_tap_prepare(mmh_recv, i, flowside_at_sidx(batchsidx)); } if (++i >= n) break; udp_meta[i].tosidx = udp_flow_from_sock(c, ref, &udp_meta[i], now); } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx)); if (pif_is_socket(batchpif)) { udp_splice_send(c, batchstart, i - batchstart, batchsidx); } else if (batchpif == PIF_TAP) { tap_send_frames(c, &udp_l2_iov[batchstart][0], UDP_NUM_IOVS, i - batchstart); } else if (flow_sidx_valid(batchsidx)) { flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx); struct udp_flow *uflow = udp_at_sidx(batchsidx); flow_err(uflow, "No support for forwarding UDP from %s to %s", pif_name(pif_at_sidx(fromsidx)), pif_name(batchpif)); } else { debug("Discarding %d datagrams without flow", i - batchstart); } } } /** * udp_reply_sock_handler() - Handle new data from flow specific socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp * * #syscalls recvmmsg */ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { const struct flowside *fromside = flowside_at_sidx(ref.flowside); flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(tosidx); struct udp_flow *uflow = udp_at_sidx(ref.flowside); int from_s = uflow->s[ref.flowside.sidei]; bool v6 = !inany_v4(&fromside->eaddr); struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv; uint8_t topif = pif_at_sidx(tosidx); int n, i; ASSERT(!c->no_udp && uflow); if ((n = udp_sock_recv(c, from_s, events, mmh_recv)) <= 0) return; flow_trace(uflow, "Received %d datagrams on reply socket", n); uflow->ts = now->tv_sec; for (i = 0; i < n; i++) { if (pif_is_socket(topif)) udp_splice_prepare(mmh_recv, i); else if (topif == PIF_TAP) udp_tap_prepare(mmh_recv, i, toside); } if (pif_is_socket(topif)) { udp_splice_send(c, 0, n, tosidx); } else if (topif == PIF_TAP) { tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); } else { uint8_t frompif = pif_at_sidx(ref.flowside); flow_err(uflow, "No support for forwarding UDP from %s to %s", pif_name(frompif), pif_name(topif)); } } /** * udp_flow_from_tap() - Find or create UDP flow for tap packets * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address on guest side * @daddr: Destination address guest side * @srcport: Source port on guest side * @dstport: Destination port on guest side * * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. */ static flow_sidx_t udp_flow_from_tap(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, in_port_t srcport, in_port_t dstport, const struct timespec *now) { struct udp_flow *uflow; union flow *flow; flow_sidx_t sidx; ASSERT(pif == PIF_TAP); sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, srcport, dstport); if ((uflow = udp_at_sidx(sidx))) { uflow->ts = now->tv_sec; return flow_sidx_opposite(sidx); } if (!(flow = flow_alloc())) { char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; debug("Couldn't allocate flow for UDP datagram from %s %s:%hu -> %s:%hu", pif_name(pif), inet_ntop(af, saddr, sstr, sizeof(sstr)), srcport, inet_ntop(af, daddr, dstr, sizeof(dstr)), dstport); return FLOW_SIDX_NONE; } flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); return udp_flow_new(c, flow, -1, now); } /** * udp_tap_handler() - Handle packets from tap * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address * @p: Pool of UDP packets, with UDP headers * @idx: Index of first packet to process * @now: Current timestamp * * Return: count of consumed packets * * #syscalls sendmmsg */ int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { const struct flowside *toside; struct mmsghdr mm[UIO_MAXIOV]; union sockaddr_inany to_sa; struct iovec m[UIO_MAXIOV]; const struct udphdr *uh; struct udp_flow *uflow; int i, s, count = 0; flow_sidx_t tosidx; in_port_t src, dst; uint8_t topif; socklen_t sl; ASSERT(!c->no_udp); uh = packet_get(p, idx, 0, sizeof(*uh), NULL); if (!uh) return 1; /* The caller already checks that all the messages have the same source * and destination, so we can just take those from the first message. */ src = ntohs(uh->source); dst = ntohs(uh->dest); tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now); if (!(uflow = udp_at_sidx(tosidx))) { char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; debug("Dropping datagram with no flow %s %s:%hu -> %s:%hu", pif_name(pif), inet_ntop(af, saddr, sstr, sizeof(sstr)), src, inet_ntop(af, daddr, dstr, sizeof(dstr)), dst); return 1; } topif = pif_at_sidx(tosidx); if (topif != PIF_HOST) { flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); uint8_t frompif = pif_at_sidx(fromsidx); flow_err(uflow, "No support for forwarding UDP from %s to %s", pif_name(frompif), pif_name(topif)); return 1; } toside = flowside_at_sidx(tosidx); s = udp_at_sidx(tosidx)->s[tosidx.sidei]; ASSERT(s >= 0); pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport); for (i = 0; i < (int)p->count - idx; i++) { struct udphdr *uh_send; size_t len; uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len); if (!uh_send) return p->count - idx; mm[i].msg_hdr.msg_name = &to_sa; mm[i].msg_hdr.msg_namelen = sl; if (len) { m[i].iov_base = (char *)(uh_send + 1); m[i].iov_len = len; mm[i].msg_hdr.msg_iov = m + i; mm[i].msg_hdr.msg_iovlen = 1; } else { mm[i].msg_hdr.msg_iov = NULL; mm[i].msg_hdr.msg_iovlen = 0; } mm[i].msg_hdr.msg_control = NULL; mm[i].msg_hdr.msg_controllen = 0; mm[i].msg_hdr.msg_flags = 0; count++; } count = sendmmsg(s, mm, count, MSG_NOSIGNAL); if (count < 0) return 1; return count; } /** * udp_sock_init() - Initialise listening sockets for a given port * @c: Execution context * @ns: In pasta mode, if set, bind with loopback address in namespace * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { union udp_listen_epoll_ref uref = { .port = port }; int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; ASSERT(!c->no_udp); if (ns) uref.pif = PIF_SPLICE; else uref.pif = PIF_HOST; if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { uref.v6 = 0; if (!ns) { r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, addr, ifname, port, uref.u32); udp_splice_init[V4][port] = s < 0 ? -1 : s; } else { r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP_LISTEN, &in4addr_loopback, ifname, port, uref.u32); udp_splice_ns[V4][port] = s < 0 ? -1 : s; } } if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { uref.v6 = 1; if (!ns) { r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, addr, ifname, port, uref.u32); udp_splice_init[V6][port] = s < 0 ? -1 : s; } else { r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP_LISTEN, &in6addr_loopback, ifname, port, uref.u32); udp_splice_ns[V6][port] = s < 0 ? -1 : s; } } if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; return r4 < 0 ? r4 : r6; } /** * udp_splice_iov_init() - Set up buffers and descriptors for recvmmsg/sendmmsg */ static void udp_splice_iov_init(void) { int i; for (i = 0; i < UDP_MAX_FRAMES; i++) { struct msghdr *mh = &udp_mh_splice[i].msg_hdr; mh->msg_name = &udp_splice_to; mh->msg_namelen = sizeof(udp_splice_to); udp_iov_splice[i].iov_base = udp_payload[i].data; mh->msg_iov = &udp_iov_splice[i]; mh->msg_iovlen = 1; } } /** * udp_port_rebind() - Rebind ports to match forward maps * @c: Execution context * @outbound: True to remap outbound forwards, otherwise inbound * * Must be called in namespace context if @outbound is true. */ static void udp_port_rebind(struct ctx *c, bool outbound) { int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init; const uint8_t *fmap = outbound ? c->udp.fwd_out.map : c->udp.fwd_in.map; const uint8_t *rmap = outbound ? c->udp.fwd_in.map : c->udp.fwd_out.map; unsigned port; for (port = 0; port < NUM_PORTS; port++) { if (!bitmap_isset(fmap, port)) { if (socks[V4][port] >= 0) { close(socks[V4][port]); socks[V4][port] = -1; } if (socks[V6][port] >= 0) { close(socks[V6][port]); socks[V6][port] = -1; } continue; } /* Don't loop back our own ports */ if (bitmap_isset(rmap, port)) continue; if ((c->ifi4 && socks[V4][port] == -1) || (c->ifi6 && socks[V6][port] == -1)) udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port); } } /** * udp_port_rebind_outbound() - Rebind ports in namespace * @arg: Execution context * * Called with NS_CALL() * * Return: 0 */ static int udp_port_rebind_outbound(void *arg) { struct ctx *c = (struct ctx *)arg; ns_enter(c); udp_port_rebind(c, true); return 0; } /** * udp_flow_timer() - Handler for timed events related to a given flow * @c: Execution context * @uflow: UDP flow * @now: Current timestamp * * Return: true if the flow is ready to free, false otherwise */ bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, const struct timespec *now) { if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) return false; udp_flow_close(c, uflow); return true; } /** * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context * @now: Current timestamp */ void udp_timer(struct ctx *c, const struct timespec *now) { (void)now; ASSERT(!c->no_udp); if (c->mode == MODE_PASTA) { if (c->udp.fwd_out.mode == FWD_AUTO) { fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in, &c->tcp.fwd_out, &c->tcp.fwd_in); NS_CALL(udp_port_rebind_outbound, c); } if (c->udp.fwd_in.mode == FWD_AUTO) { fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out, &c->tcp.fwd_in, &c->tcp.fwd_out); udp_port_rebind(c, false); } } } /** * udp_init() - Initialise per-socket data, and sockets in namespace * @c: Execution context * * Return: 0 */ int udp_init(struct ctx *c) { ASSERT(!c->no_udp); udp_iov_init(c); if (c->mode == MODE_PASTA) { udp_splice_iov_init(); NS_CALL(udp_port_rebind_outbound, c); } return 0; }