aboutgitcodebugslistschat
path: root/udp.c
diff options
context:
space:
mode:
Diffstat (limited to 'udp.c')
-rw-r--r--udp.c1650
1 files changed, 840 insertions, 810 deletions
diff --git a/udp.c b/udp.c
index 694424a..65a52e0 100644
--- a/udp.c
+++ b/udp.c
@@ -15,79 +15,66 @@
/**
* DOC: Theory of Operation
*
+ * UDP Flows
+ * =========
*
- * For UDP, a reduced version of port-based connection tracking is implemented
- * with two purposes:
- * - binding ephemeral ports when they're used as source port by the guest, so
- * that replies on those ports can be forwarded back to the guest, with a
- * fixed timeout for this binding
- * - packets received from the local host get their source changed to a local
- * address (gateway address) so that they can be forwarded to the guest, and
- * packets sent as replies by the guest need their destination address to
- * be changed back to the address of the local host. This is dynamic to allow
- * connections from the gateway as well, and uses the same fixed 180s timeout
- *
- * Sockets for bound ports are created at initialisation time, one set for IPv4
- * and one for IPv6.
+ * UDP doesn't have true connections, but many protocols use a connection-like
+ * format. The flow is initiated by a client sending a datagram from a port of
+ * its choosing (usually ephemeral) to a specific port (usually well known) on a
+ * server. Both client and server addresses must be unicast. The server sends
+ * replies using the same addresses & ports with src/dest swapped.
*
- * Packets are forwarded back and forth, by prepending and stripping UDP headers
- * in the obvious way, with no port translation.
+ * We track pseudo-connections of this type as flow table entries of type
+ * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts,
+ * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds.
*
- * In PASTA mode, the L2-L4 translation is skipped for connections to ports
- * bound between namespaces using the loopback interface, messages are directly
- * transferred between L4 sockets instead. These are called spliced connections
- * for consistency with the TCP implementation, but the splice() syscall isn't
- * actually used as it wouldn't make sense for datagram-based connections: a
- * pair of recvmmsg() and sendmmsg() deals with this case.
+ * NOTE: This won't handle multicast protocols, or some protocols with different
+ * port usage. We'll need specific logic if we want to handle those.
+ *
+ * "Listening" sockets
+ * ===================
*
- * The connection tracking for PASTA mode is slightly complicated by the absence
- * of actual connections, see struct udp_splice_port, and these examples:
+ * UDP doesn't use listen(), but we consider long term sockets which are allowed
+ * to create new flows "listening" by analogy with TCP. This listening socket
+ * could receive packets from multiple flows, so we use a hash table match to
+ * find the specific flow for a datagram.
*
- * - from init to namespace:
+ * Flow sockets
+ * ============
*
- * - forward direction: 127.0.0.1:5000 -> 127.0.0.1:80 in init from socket s,
- * with epoll reference: index = 80, splice = 1, orig = 1, ns = 0
- * - if udp_splice_ns[V4][5000].sock:
- * - send packet to udp_splice_ns[V4][5000].sock, with destination port
- * 80
- * - otherwise:
- * - create new socket udp_splice_ns[V4][5000].sock
- * - bind in namespace to 127.0.0.1:5000
- * - add to epoll with reference: index = 5000, splice = 1, orig = 0,
- * ns = 1
- * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with
- * current time
+ * When a UDP flow targets a socket, we create a "flow" socket in
+ * uflow->s[TGTSIDE] both to deliver datagrams to the target side and receive
+ * replies on the target side. This socket is both bound and connected and has
+ * EPOLL_TYPE_UDP. The connect() means it will only receive datagrams
+ * associated with this flow, so the epoll reference directly points to the flow
+ * and we don't need a hash lookup.
*
- * - reverse direction: 127.0.0.1:80 -> 127.0.0.1:5000 in namespace socket s,
- * having epoll reference: index = 5000, splice = 1, orig = 0, ns = 1
- * - if udp_splice_init[V4][80].sock:
- * - send to udp_splice_init[V4][80].sock, with destination port 5000
- * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with
- * current time
- * - otherwise, discard
+ * When a flow is initiated from a listening socket, we create a "flow" socket
+ * with the same bound address as the listening socket, but also connect()ed to
+ * the flow's peer. This is stored in uflow->s[INISIDE] and will last for the
+ * lifetime of the flow, even if the original listening socket is closed due to
+ * port auto-probing. The duplicate is used to deliver replies back to the
+ * originating side.
*
- * - from namespace to init:
+ * NOTE: A flow socket can have a bound address overlapping with a listening
+ * socket. That will happen naturally for flows initiated from a socket, but is
+ * also possible (though unlikely) for tap initiated flows, depending on the
+ * source port. We assume datagrams for the flow will come to a connect()ed
+ * socket in preference to a listening socket. The sample program
+ * doc/platform-requirements/reuseaddr-priority.c documents and tests that
+ * assumption.
*
- * - forward direction: 127.0.0.1:2000 -> 127.0.0.1:22 in namespace from
- * socket s, with epoll reference: index = 22, splice = 1, orig = 1, ns = 1
- * - if udp4_splice_init[V4][2000].sock:
- * - send packet to udp_splice_init[V4][2000].sock, with destination
- * port 22
- * - otherwise:
- * - create new socket udp_splice_init[V4][2000].sock
- * - bind in init to 127.0.0.1:2000
- * - add to epoll with reference: index = 2000, splice = 1, orig = 0,
- * ns = 0
- * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with
- * current time
+ * "Spliced" flows
+ * ===============
+ *
+ * In PASTA mode, L2-L4 translation is skipped for connections to ports bound
+ * between namespaces using the loopback interface: messages are directly
+ * transferred between L4 sockets instead. These are called spliced connections
+ * by analogy with the TCP implementation. The splice() syscall isn't
+ * actually used; it doesn't make sense for datagrams, so a pair of
+ * recvmmsg() and sendmmsg() calls is used to forward the datagrams instead.
*
- * - reverse direction: 127.0.0.1:22 -> 127.0.0.1:2000 in init from socket s,
- * having epoll reference: index = 2000, splice = 1, orig = 0, ns = 0
- * - if udp_splice_ns[V4][22].sock:
- * - send to udp_splice_ns[V4][22].sock, with destination port 2000
- * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with
- * current time
- * - otherwise, discard
+ * Note that a spliced flow will have two flow sockets (see above).
*/
#include <sched.h>
@@ -102,6 +89,8 @@
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
@@ -110,9 +99,12 @@
#include <sys/socket.h>
#include <sys/uio.h>
#include <time.h>
+#include <arpa/inet.h>
+#include <linux/errqueue.h>
#include "checksum.h"
#include "util.h"
+#include "iov.h"
#include "ip.h"
#include "siphash.h"
#include "inany.h"
@@ -120,128 +112,87 @@
#include "tap.h"
#include "pcap.h"
#include "log.h"
+#include "flow_table.h"
+#include "udp_internal.h"
+#include "udp_vu.h"
-#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
-/**
- * struct udp_tap_port - Port tracking based on tap-facing source port
- * @sock: Socket bound to source port used as index
- * @flags: Flags for recent activity type seen from/to port
- * @ts: Activity timestamp from tap, used for socket aging
- */
-struct udp_tap_port {
- int sock;
- uint8_t flags;
-#define PORT_LOCAL BIT(0) /* Port was contacted from local address */
-#define PORT_LOOPBACK BIT(1) /* Port was contacted from loopback address */
-#define PORT_GUA BIT(2) /* Port was contacted from global unicast */
-#define PORT_DNS_FWD BIT(3) /* Port used as source for DNS remapped query */
-
- time_t ts;
-};
-
-/**
- * struct udp_splice_port - Bound socket for spliced communication
- * @sock: Socket bound to index port
- * @ts: Activity timestamp
- */
-struct udp_splice_port {
- int sock;
- time_t ts;
-};
-
-/* Port tracking, arrays indexed by packet source port (host order) */
-static struct udp_tap_port udp_tap_map [IP_VERSIONS][NUM_PORTS];
+/* Maximum UDP data to be returned in ICMP messages */
+#define ICMP4_MAX_DLEN 8
+#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \
+ - sizeof(struct udphdr) \
+ - sizeof(struct ipv6hdr))
/* "Spliced" sockets indexed by bound port (host order) */
-static struct udp_splice_port udp_splice_ns [IP_VERSIONS][NUM_PORTS];
-static struct udp_splice_port udp_splice_init[IP_VERSIONS][NUM_PORTS];
-
-enum udp_act_type {
- UDP_ACT_TAP,
- UDP_ACT_SPLICE_NS,
- UDP_ACT_SPLICE_INIT,
- UDP_ACT_TYPE_MAX,
-};
-
-/* Activity-based aging for bindings */
-static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)];
+static int udp_splice_ns [IP_VERSIONS][NUM_PORTS];
+static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
/* Static buffers */
-/**
- * udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
- * @s_in: Source socket address, filled in by recvmmsg()
- * @taph: Tap-level headers (partially pre-filled)
- * @iph: Pre-filled IP header (except for tot_len and saddr)
- * @uh: Headroom for UDP header
- * @data: Storage for UDP payload
- */
-static struct udp4_l2_buf_t {
- struct sockaddr_in s_in;
+/* UDP header and data for inbound messages */
+static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
- struct tap_hdr taph;
- struct iphdr iph;
- struct udphdr uh;
- uint8_t data[USHRT_MAX -
- (sizeof(struct iphdr) + sizeof(struct udphdr))];
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-udp4_l2_buf[UDP_MAX_FRAMES];
+/* Ethernet header for IPv4 frames */
+static struct ethhdr udp4_eth_hdr;
+
+/* Ethernet header for IPv6 frames */
+static struct ethhdr udp6_eth_hdr;
/**
- * udp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
- * @s_in6: Source socket address, filled in by recvmmsg()
- * @taph: Tap-level headers (partially pre-filled)
- * @ip6h: Pre-filled IP header (except for payload_len and addresses)
- * @uh: Headroom for UDP header
- * @data: Storage for UDP payload
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
+ * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
+ * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
+ * @taph: Tap backend specific header
*/
-struct udp6_l2_buf_t {
- struct sockaddr_in6 s_in6;
-#ifdef __AVX2__
- /* Align ip6h to 32-byte boundary. */
- uint8_t pad[64 - (sizeof(struct sockaddr_in6) + sizeof(struct ethhdr) +
- sizeof(uint32_t))];
-#endif
-
- struct tap_hdr taph;
+static struct udp_meta_t {
struct ipv6hdr ip6h;
- struct udphdr uh;
- uint8_t data[USHRT_MAX -
- (sizeof(struct ipv6hdr) + sizeof(struct udphdr))];
+ struct iphdr ip4h;
+ struct tap_hdr taph;
+}
#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+__attribute__ ((aligned(32)))
#endif
-udp6_l2_buf[UDP_MAX_FRAMES];
+udp_meta[UDP_MAX_FRAMES];
-/* recvmmsg()/sendmmsg() data for tap */
-static struct iovec udp4_l2_iov_sock [UDP_MAX_FRAMES];
-static struct iovec udp6_l2_iov_sock [UDP_MAX_FRAMES];
+#define PKTINFO_SPACE \
+ MAX(CMSG_SPACE(sizeof(struct in_pktinfo)), \
+ CMSG_SPACE(sizeof(struct in6_pktinfo)))
-static struct iovec udp4_l2_iov_tap [UDP_MAX_FRAMES];
-static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES];
+#define RECVERR_SPACE \
+ MAX(CMSG_SPACE(sizeof(struct sock_extended_err) + \
+ sizeof(struct sockaddr_in)), \
+ CMSG_SPACE(sizeof(struct sock_extended_err) + \
+ sizeof(struct sockaddr_in6)))
-static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES];
-static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES];
+/**
+ * enum udp_iov_idx - Indices for the buffers making up a single UDP frame
+ * @UDP_IOV_TAP tap specific header
+ * @UDP_IOV_ETH Ethernet header
+ * @UDP_IOV_IP IP (v4/v6) header
+ * @UDP_IOV_PAYLOAD IP payload (UDP header + data)
+ * @UDP_NUM_IOVS the number of entries in the iovec array
+ */
+enum udp_iov_idx {
+ UDP_IOV_TAP,
+ UDP_IOV_ETH,
+ UDP_IOV_IP,
+ UDP_IOV_PAYLOAD,
+ UDP_NUM_IOVS,
+};
-/* recvmmsg()/sendmmsg() data for "spliced" connections */
-static struct iovec udp4_iov_splice [UDP_MAX_FRAMES];
-static struct iovec udp6_iov_splice [UDP_MAX_FRAMES];
+/* IOVs and msghdr arrays for receiving datagrams from sockets */
+static struct iovec udp_iov_recv [UDP_MAX_FRAMES];
+static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES];
-static struct sockaddr_in udp4_localname = {
- .sin_family = AF_INET,
- .sin_addr = IN4ADDR_LOOPBACK_INIT,
-};
-static struct sockaddr_in6 udp6_localname = {
- .sin6_family = AF_INET6,
- .sin6_addr = IN6ADDR_LOOPBACK_INIT,
-};
+/* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */
+static union sockaddr_inany udp_splice_to;
+
+static struct iovec udp_iov_splice [UDP_MAX_FRAMES];
+static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES];
-static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES];
-static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES];
+/* IOVs for L2 frames */
+static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS];
/**
* udp_portmap_clear() - Clear UDP port map before configuration
@@ -251,28 +202,8 @@ void udp_portmap_clear(void)
unsigned i;
for (i = 0; i < NUM_PORTS; i++) {
- udp_tap_map[V4][i].sock = udp_tap_map[V6][i].sock = -1;
- udp_splice_ns[V4][i].sock = udp_splice_ns[V6][i].sock = -1;
- udp_splice_init[V4][i].sock = udp_splice_init[V6][i].sock = -1;
- }
-}
-
-/**
- * udp_invert_portmap() - Compute reverse port translations for return packets
- * @fwd: Port forwarding configuration to compute reverse map for
- */
-static void udp_invert_portmap(struct udp_fwd_ports *fwd)
-{
- unsigned int i;
-
- static_assert(ARRAY_SIZE(fwd->f.delta) == ARRAY_SIZE(fwd->rdelta),
- "Forward and reverse delta arrays must have same size");
- for (i = 0; i < ARRAY_SIZE(fwd->f.delta); i++) {
- in_port_t delta = fwd->f.delta[i];
- in_port_t rport = i + delta;
-
- if (delta)
- fwd->rdelta[rport] = NUM_PORTS - delta;
+ udp_splice_ns[V4][i] = udp_splice_ns[V6][i] = -1;
+ udp_splice_init[V4][i] = udp_splice_init[V6][i] = -1;
}
}
@@ -283,509 +214,743 @@ static void udp_invert_portmap(struct udp_fwd_ports *fwd)
*/
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
- int i;
-
- for (i = 0; i < UDP_MAX_FRAMES; i++) {
- struct udp4_l2_buf_t *b4 = &udp4_l2_buf[i];
- struct udp6_l2_buf_t *b6 = &udp6_l2_buf[i];
-
- eth_update_mac(&b4->taph.eh, eth_d, eth_s);
- eth_update_mac(&b6->taph.eh, eth_d, eth_s);
- }
+ eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
+ eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
}
/**
- * udp_sock4_iov_init_one() - Initialise a scatter-gather L2 buffer for IPv4
+ * udp_iov_init_one() - Initialise scatter-gather lists for one buffer
* @c: Execution context
* @i: Index of buffer to initialize
*/
-static void udp_sock4_iov_init_one(const struct ctx *c, size_t i)
+static void udp_iov_init_one(const struct ctx *c, size_t i)
{
- struct msghdr *mh = &udp4_l2_mh_sock[i].msg_hdr;
- struct udp4_l2_buf_t *buf = &udp4_l2_buf[i];
- struct iovec *siov = &udp4_l2_iov_sock[i];
- struct iovec *tiov = &udp4_l2_iov_tap[i];
-
- *buf = (struct udp4_l2_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IP),
- .iph = L2_BUF_IP4_INIT(IPPROTO_UDP)
+ struct udp_payload_t *payload = &udp_payload[i];
+ struct msghdr *mh = &udp_mh_recv[i].msg_hdr;
+ struct udp_meta_t *meta = &udp_meta[i];
+ struct iovec *siov = &udp_iov_recv[i];
+ struct iovec *tiov = udp_l2_iov[i];
+
+ *meta = (struct udp_meta_t) {
+ .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP),
+ .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP),
};
- siov->iov_base = buf->data;
- siov->iov_len = sizeof(buf->data);
+ *siov = IOV_OF_LVALUE(payload->data);
+
+ tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
+ tiov[UDP_IOV_PAYLOAD].iov_base = payload;
- mh->msg_name = &buf->s_in;
- mh->msg_namelen = sizeof(buf->s_in);
mh->msg_iov = siov;
mh->msg_iovlen = 1;
-
- tiov->iov_base = tap_frame_base(c, &buf->taph);
}
/**
- * udp_sock6_iov_init_one() - Initialise a scatter-gather L2 buffer for IPv6
+ * udp_iov_init() - Initialise scatter-gather L2 buffers
* @c: Execution context
- * @i: Index of buffer to initialize
*/
-static void udp_sock6_iov_init_one(const struct ctx *c, size_t i)
+static void udp_iov_init(const struct ctx *c)
{
- struct msghdr *mh = &udp6_l2_mh_sock[i].msg_hdr;
- struct udp6_l2_buf_t *buf = &udp6_l2_buf[i];
- struct iovec *siov = &udp6_l2_iov_sock[i];
- struct iovec *tiov = &udp6_l2_iov_tap[i];
-
- *buf = (struct udp6_l2_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IPV6),
- .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP)
- };
+ size_t i;
- siov->iov_base = buf->data;
- siov->iov_len = sizeof(buf->data);
+ udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
+ udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
- mh->msg_name = &buf->s_in6;
- mh->msg_namelen = sizeof(buf->s_in6);
- mh->msg_iov = siov;
- mh->msg_iovlen = 1;
+ for (i = 0; i < UDP_MAX_FRAMES; i++)
+ udp_iov_init_one(c, i);
+}
- tiov->iov_base = tap_frame_base(c, &buf->taph);
+/**
+ * udp_update_hdr4() - Update headers for one IPv4 datagram
+ * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
+ * @bp: Pointer to udp_payload_t to update
+ * @toside: Flowside for destination side
+ * @dlen: Length of UDP payload
+ * @no_udp_csum: Do not set UDP checksum
+ *
+ * Return: size of IPv4 payload (UDP header + data)
+ */
+size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
+ const struct flowside *toside, size_t dlen,
+ bool no_udp_csum)
+{
+ const struct in_addr *src = inany_v4(&toside->oaddr);
+ const struct in_addr *dst = inany_v4(&toside->eaddr);
+ size_t l4len = dlen + sizeof(bp->uh);
+ size_t l3len = l4len + sizeof(*ip4h);
+
+ ASSERT(src && dst);
+
+ ip4h->tot_len = htons(l3len);
+ ip4h->daddr = dst->s_addr;
+ ip4h->saddr = src->s_addr;
+ ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst);
+
+ bp->uh.source = htons(toside->oport);
+ bp->uh.dest = htons(toside->eport);
+ bp->uh.len = htons(l4len);
+ if (no_udp_csum) {
+ bp->uh.check = 0;
+ } else {
+ const struct iovec iov = {
+ .iov_base = bp->data,
+ .iov_len = dlen
+ };
+ struct iov_tail data = IOV_TAIL(&iov, 1, 0);
+ csum_udp4(&bp->uh, *src, *dst, &data);
+ }
+
+ return l4len;
}
/**
- * udp_sock_iov_init() - Initialise scatter-gather L2 buffers
+ * udp_update_hdr6() - Update headers for one IPv6 datagram
+ * @ip6h: Pre-filled IPv6 header (except for payload_len and
+ * addresses)
+ * @bp: Pointer to udp_payload_t to update
+ * @toside: Flowside for destination side
+ * @dlen: Length of UDP payload
+ * @no_udp_csum: Do not set UDP checksum
+ *
+ * Return: size of IPv6 payload (UDP header + data)
+ */
+size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
+ const struct flowside *toside, size_t dlen,
+ bool no_udp_csum)
+{
+ uint16_t l4len = dlen + sizeof(bp->uh);
+
+ ip6h->payload_len = htons(l4len);
+ ip6h->daddr = toside->eaddr.a6;
+ ip6h->saddr = toside->oaddr.a6;
+ ip6h->version = 6;
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 255;
+
+ bp->uh.source = htons(toside->oport);
+ bp->uh.dest = htons(toside->eport);
+ bp->uh.len = ip6h->payload_len;
+ if (no_udp_csum) {
+ /* 0 is an invalid checksum for UDP IPv6 and dropped by
+ * the kernel stack, even if the checksum is disabled by virtio
+ * flags. We need to put any non-zero value here.
+ */
+ bp->uh.check = 0xffff;
+ } else {
+ const struct iovec iov = {
+ .iov_base = bp->data,
+ .iov_len = dlen
+ };
+ struct iov_tail data = IOV_TAIL(&iov, 1, 0);
+ csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, &data);
+ }
+
+ return l4len;
+}
+
+/**
+ * udp_tap_prepare() - Convert one datagram into a tap frame
+ * @mmh: Receiving mmsghdr array
+ * @idx: Index of the datagram to prepare
+ * @toside: Flowside for destination side
+ * @no_udp_csum: Do not set UDP checksum
+ */
+static void udp_tap_prepare(const struct mmsghdr *mmh,
+ unsigned idx, const struct flowside *toside,
+ bool no_udp_csum)
+{
+ struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
+ struct udp_payload_t *bp = &udp_payload[idx];
+ struct udp_meta_t *bm = &udp_meta[idx];
+ size_t l4len;
+
+ if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
+ l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
+ mmh[idx].msg_len, no_udp_csum);
+ tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
+ sizeof(udp6_eth_hdr));
+ (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
+ (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
+ } else {
+ l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
+ mmh[idx].msg_len, no_udp_csum);
+ tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
+ sizeof(udp4_eth_hdr));
+ (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
+ (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
+ }
+ (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
+}
+
+/**
+ * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
* @c: Execution context
+ * @ee: Extended error descriptor
+ * @toside: Destination side of flow
+ * @saddr: Address of ICMP generating node
+ * @in: First bytes (max 8) of original UDP message body
+ * @dlen: Length of the read part of original UDP message body
*/
-static void udp_sock_iov_init(const struct ctx *c)
+static void udp_send_tap_icmp4(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ struct in_addr saddr,
+ const void *in, size_t dlen)
{
- size_t i;
+ struct in_addr oaddr = toside->oaddr.v4mapped.a4;
+ struct in_addr eaddr = toside->eaddr.v4mapped.a4;
+ in_port_t eport = toside->eport;
+ in_port_t oport = toside->oport;
+ struct {
+ struct icmphdr icmp4h;
+ struct iphdr ip4h;
+ struct udphdr uh;
+ char data[ICMP4_MAX_DLEN];
+ } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+ size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+ size_t l4len = dlen + sizeof(struct udphdr);
+
+ ASSERT(dlen <= ICMP4_MAX_DLEN);
+ memset(&msg, 0, sizeof(msg));
+ msg.icmp4h.type = ee->ee_type;
+ msg.icmp4h.code = ee->ee_code;
+ if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED)
+ msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info);
+
+ /* Reconstruct the original headers as returned in the ICMP message */
+ tap_push_ip4h(&msg.ip4h, eaddr, oaddr, l4len, IPPROTO_UDP);
+ tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+ memcpy(&msg.data, in, dlen);
+
+ tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+}
- for (i = 0; i < UDP_MAX_FRAMES; i++) {
- if (c->ifi4)
- udp_sock4_iov_init_one(c, i);
- if (c->ifi6)
- udp_sock6_iov_init_one(c, i);
+
+/**
+ * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
+ * @c: Execution context
+ * @ee: Extended error descriptor
+ * @toside: Destination side of flow
+ * @saddr: Address of ICMP generating node
+ * @in: First bytes (max 1232) of original UDP message body
+ * @dlen: Length of the read part of original UDP message body
+ * @flow: IPv6 flow identifier
+ */
+static void udp_send_tap_icmp6(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ const struct in6_addr *saddr,
+ void *in, size_t dlen, uint32_t flow)
+{
+ const struct in6_addr *oaddr = &toside->oaddr.a6;
+ const struct in6_addr *eaddr = &toside->eaddr.a6;
+ in_port_t eport = toside->eport;
+ in_port_t oport = toside->oport;
+ struct {
+ struct icmp6_hdr icmp6h;
+ struct ipv6hdr ip6h;
+ struct udphdr uh;
+ char data[ICMP6_MAX_DLEN];
+ } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+ size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+ size_t l4len = dlen + sizeof(struct udphdr);
+
+ ASSERT(dlen <= ICMP6_MAX_DLEN);
+ memset(&msg, 0, sizeof(msg));
+ msg.icmp6h.icmp6_type = ee->ee_type;
+ msg.icmp6h.icmp6_code = ee->ee_code;
+ if (ee->ee_type == ICMP6_PACKET_TOO_BIG)
+ msg.icmp6h.icmp6_dataun.icmp6_un_data32[0] = htonl(ee->ee_info);
+
+ /* Reconstruct the original headers as returned in the ICMP message */
+ tap_push_ip6h(&msg.ip6h, eaddr, oaddr, l4len, IPPROTO_UDP, flow);
+ tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+ memcpy(&msg.data, in, dlen);
+
+ tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+}
+
+/**
+ * udp_pktinfo() - Retrieve packet destination address from cmsg
+ * @msg: msghdr into which message has been received
+ * @dst: (Local) destination address of message in @msg (output)
+ *
+ * Return: 0 on success, -1 if the information was missing (@dst is set to
+ * inany_any6).
+ */
+static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst)
+{
+ struct cmsghdr *hdr;
+
+ for (hdr = CMSG_FIRSTHDR(msg); hdr; hdr = CMSG_NXTHDR(msg, hdr)) {
+ if (hdr->cmsg_level == IPPROTO_IP &&
+ hdr->cmsg_type == IP_PKTINFO) {
+ const struct in_pktinfo *i4 = (void *)CMSG_DATA(hdr);
+
+ *dst = inany_from_v4(i4->ipi_addr);
+ return 0;
+ }
+
+ if (hdr->cmsg_level == IPPROTO_IPV6 &&
+ hdr->cmsg_type == IPV6_PKTINFO) {
+ const struct in6_pktinfo *i6 = (void *)CMSG_DATA(hdr);
+
+ dst->a6 = i6->ipi6_addr;
+ return 0;
+ }
}
+
+ debug("Missing PKTINFO cmsg on datagram");
+ *dst = inany_any6;
+ return -1;
}
/**
- * udp_splice_new() - Create and prepare socket for "spliced" binding
+ * udp_sock_recverr() - Receive and clear an error from a socket
* @c: Execution context
- * @v6: Set for IPv6 sockets
- * @src: Source port of original connection, host order
- * @ns: Does the splice originate in the ns or not
+ * @s: Socket to receive errors from
+ * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif: Interface on which the error occurred
+ * (only used if @sidx == FLOW_SIDX_NONE)
+ * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
*
- * Return: prepared socket, negative error code on failure
+ * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
+ * if there was an error reading the queue
*
- * #syscalls:pasta getsockname
+ * #syscalls recvmsg
*/
-int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns)
+static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
+ uint8_t pif, in_port_t port)
{
- struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP };
- union epoll_ref ref = { .type = EPOLL_TYPE_UDP,
- .udp = { .splice = true, .v6 = v6, .port = src }
- };
- struct udp_splice_port *sp;
- int act, s;
-
- if (ns) {
- ref.udp.pif = PIF_SPLICE;
- sp = &udp_splice_ns[v6 ? V6 : V4][src];
- act = UDP_ACT_SPLICE_NS;
- } else {
- ref.udp.pif = PIF_HOST;
- sp = &udp_splice_init[v6 ? V6 : V4][src];
- act = UDP_ACT_SPLICE_INIT;
+ char buf[PKTINFO_SPACE + RECVERR_SPACE];
+ const struct sock_extended_err *ee;
+ char data[ICMP6_MAX_DLEN];
+ struct cmsghdr *hdr;
+ struct iovec iov = {
+ .iov_base = data,
+ .iov_len = sizeof(data)
+ };
+ union sockaddr_inany src;
+ struct msghdr mh = {
+ .msg_name = &src,
+ .msg_namelen = sizeof(src),
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = buf,
+ .msg_controllen = sizeof(buf),
+ };
+ const struct flowside *fromside, *toside;
+ union inany_addr offender, otap;
+ char astr[INANY_ADDRSTRLEN];
+ char sastr[SOCKADDR_STRLEN];
+ const struct in_addr *o4;
+ in_port_t offender_port;
+ struct udp_flow *uflow;
+ uint8_t topif;
+ size_t dlen;
+ ssize_t rc;
+
+ rc = recvmsg(s, &mh, MSG_ERRQUEUE);
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+
+ err_perror("UDP: Failed to read error queue");
+ return -1;
}
- s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK,
- IPPROTO_UDP);
+ if (!(mh.msg_flags & MSG_ERRQUEUE)) {
+ err("Missing MSG_ERRQUEUE flag reading error queue");
+ return -1;
+ }
- if (s > FD_REF_MAX) {
- close(s);
- return -EIO;
+ for (hdr = CMSG_FIRSTHDR(&mh); hdr; hdr = CMSG_NXTHDR(&mh, hdr)) {
+ if ((hdr->cmsg_level == IPPROTO_IP &&
+ hdr->cmsg_type == IP_RECVERR) ||
+ (hdr->cmsg_level == IPPROTO_IPV6 &&
+ hdr->cmsg_type == IPV6_RECVERR))
+ break;
}
- if (s < 0)
- return s;
+ if (!hdr) {
+ err("Missing RECVERR cmsg in error queue");
+ return -1;
+ }
- ref.fd = s;
+ ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
- if (v6) {
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(src),
- .sin6_addr = IN6ADDR_LOOPBACK_INIT,
- };
- if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6)))
- goto fail;
+ debug("%s error on UDP socket %i: %s",
+ str_ee_origin(ee), s, strerror_(ee->ee_errno));
+
+ if (!flow_sidx_valid(sidx)) {
+ /* No hint from the socket, determine flow from addresses */
+ union inany_addr dst;
+
+ if (udp_pktinfo(&mh, &dst) < 0) {
+ debug("Missing PKTINFO on UDP error");
+ return 1;
+ }
+
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, &src, &dst, port);
+ if (!flow_sidx_valid(sidx)) {
+ debug("Ignoring UDP error without flow");
+ return 1;
+ }
} else {
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = htons(src),
- .sin_addr = IN4ADDR_LOOPBACK_INIT,
- };
- if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4)))
- goto fail;
+ pif = pif_at_sidx(sidx);
}
- sp->sock = s;
- bitmap_set(udp_act[v6 ? V6 : V4][act], src);
+ uflow = udp_at_sidx(sidx);
+ ASSERT(uflow);
+ fromside = &uflow->f.side[sidx.sidei];
+ toside = &uflow->f.side[!sidx.sidei];
+ topif = uflow->f.pif[!sidx.sidei];
+ dlen = rc;
- ev.data.u64 = ref.u64;
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
- return s;
+ if (inany_from_sockaddr(&offender, &offender_port,
+ SO_EE_OFFENDER(ee)) < 0)
+ goto fail;
+
+ if (pif != PIF_HOST || topif != PIF_TAP)
+ /* XXX Can we support any other cases? */
+ goto fail;
+
+ /* If the offender *is* the endpoint, make sure our translation is
+ * consistent with the flow's translation. This matters if the flow
+ * endpoint has a port specific translation (like --dns-match).
+ */
+ if (inany_equals(&offender, &fromside->eaddr))
+ otap = toside->oaddr;
+ else if (!nat_inbound(c, &offender, &otap))
+ goto fail;
+
+ if (hdr->cmsg_level == IPPROTO_IP &&
+ (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
+ dlen = MIN(dlen, ICMP4_MAX_DLEN);
+ udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+ return 1;
+ }
+
+ if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
+ udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
+ FLOW_IDX(uflow));
+ return 1;
+ }
fail:
- close(s);
- return -1;
+ flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s",
+ str_ee_origin(ee),
+ pif_name(pif),
+ sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)),
+ pif_name(topif),
+ inany_ntop(&toside->eaddr, astr, sizeof(astr)));
+ return 1;
}
/**
- * struct udp_splice_new_ns_arg - Arguments for udp_splice_new_ns()
+ * udp_sock_errs() - Process errors on a socket
* @c: Execution context
- * @v6: Set for IPv6
- * @src: Source port of originating datagram, host order
- * @dst: Destination port of originating datagram, host order
- * @s: Newly created socket or negative error code
- */
-struct udp_splice_new_ns_arg {
- const struct ctx *c;
- int v6;
- in_port_t src;
- int s;
-};
-
-/**
- * udp_splice_new_ns() - Enter namespace and call udp_splice_new()
- * @arg: See struct udp_splice_new_ns_arg
+ * @s: Socket to receive errors from
+ * @sidx: Flow and side of @s, or FLOW_SIDX_NONE if unknown
+ * @pif: Interface on which the error occurred
+ * (only used if @sidx == FLOW_SIDX_NONE)
+ * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE)
*
- * Return: 0
+ * Return: Number of errors handled, or < 0 if we have an unrecoverable error
*/
-static int udp_splice_new_ns(void *arg)
+static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx,
+ uint8_t pif, in_port_t port)
{
- struct udp_splice_new_ns_arg *a;
+ unsigned n_err = 0;
+ socklen_t errlen;
+ int rc, err;
- a = (struct udp_splice_new_ns_arg *)arg;
+ ASSERT(!c->no_udp);
- ns_enter(a->c);
+ /* Empty the error queue */
+ while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0)
+ n_err += rc;
- a->s = udp_splice_new(a->c, a->v6, a->src, true);
+ if (rc < 0)
+ return -1; /* error reading error, unrecoverable */
- return 0;
+ errlen = sizeof(err);
+ if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0 ||
+ errlen != sizeof(err)) {
+ err_perror("Error reading SO_ERROR");
+ return -1; /* error reading error, unrecoverable */
+ }
+
+ if (err) {
+ debug("Unqueued error on UDP socket %i: %s", s, strerror_(err));
+ n_err++;
+ }
+
+ if (!n_err) {
+ /* EPOLLERR, but no errors to clear !? */
+ err("EPOLLERR event without reported errors on socket %i", s);
+ return -1; /* no way to clear, unrecoverable */
+ }
+
+ return n_err;
}
/**
- * udp_mmh_splice_port() - Is source address of message suitable for splicing?
- * @v6: Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
- * @mmh: mmsghdr of incoming message
+ * udp_peek_addr() - Get source address for next packet
+ * @s: Socket to get information from
+ * @src: Socket address (output)
+ * @dst: (Local) destination address (output)
*
- * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
- * @sa in host order, otherwise -1.
+ * Return: 0 if no more packets, 1 on success, -ve error code on error
*/
-static int udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh)
+static int udp_peek_addr(int s, union sockaddr_inany *src,
+ union inany_addr *dst)
{
- const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name;
- const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name;
+ char sastr[SOCKADDR_STRLEN], dstr[INANY_ADDRSTRLEN];
+ char cmsg[PKTINFO_SPACE];
+ struct msghdr msg = {
+ .msg_name = src,
+ .msg_namelen = sizeof(*src),
+ .msg_control = cmsg,
+ .msg_controllen = sizeof(cmsg),
+ };
+ int rc;
+
+ rc = recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT);
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+ return -errno;
+ }
- if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
- return ntohs(sa6->sin6_port);
+ udp_pktinfo(&msg, dst);
- if (!v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr))
- return ntohs(sa4->sin_port);
+ trace("Peeked UDP datagram: %s -> %s",
+ sockaddr_ntop(src, sastr, sizeof(sastr)),
+ inany_ntop(dst, dstr, sizeof(dstr)));
- return -1;
+ return 1;
}
/**
- * udp_splice_sendfrom() - Send datagrams from given port to given port
+ * udp_sock_recv() - Receive datagrams from a socket
* @c: Execution context
- * @start: Index of first datagram in udp[46]_l2_buf
- * @n: Number of datagrams to send
- * @src: Datagrams will be sent from this port (on origin side)
- * @dst: Datagrams will be send to this port (on destination side)
- * @from_pif: pif from which the packet originated
- * @v6: Send as IPv6?
- * @allow_new: If true create sending socket if needed, if false discard
- * if no sending socket is available
- * @now: Timestamp
+ * @s: Socket to receive from
+ * @mmh: mmsghdr array to receive into
+ * @n: Maximum number of datagrams to receive
+ *
+ * Return: Number of datagrams received
+ *
+ * #syscalls recvmmsg arm:recvmmsg_time64 i686:recvmmsg_time64
*/
-static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
- in_port_t src, in_port_t dst, uint8_t from_pif,
- bool v6, bool allow_new,
- const struct timespec *now)
+static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
{
- struct mmsghdr *mmh_recv, *mmh_send;
- unsigned int i;
- int s;
+ ASSERT(!c->no_udp);
- if (v6) {
- mmh_recv = udp6_l2_mh_sock;
- mmh_send = udp6_mh_splice;
- } else {
- mmh_recv = udp4_l2_mh_sock;
- mmh_send = udp4_mh_splice;
+ n = recvmmsg(s, mmh, n, 0, NULL);
+ if (n < 0) {
+ trace("Error receiving datagrams: %s", strerror_(errno));
+ /* Bail out and let the EPOLLERR handler deal with it */
+ return 0;
}
- if (from_pif == PIF_SPLICE) {
- src += c->udp.fwd_in.rdelta[src];
- s = udp_splice_init[v6][src].sock;
- if (s < 0 && allow_new)
- s = udp_splice_new(c, v6, src, false);
+ return n;
+}
- if (s < 0)
- return;
+/**
+ * udp_sock_to_sock() - Forward datagrams from socket to socket
+ * @c: Execution context
+ * @from_s: Socket to receive datagrams from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward datagrams to
+ *
+ * #syscalls sendmmsg
+ */
+static void udp_sock_to_sock(const struct ctx *c, int from_s, int n,
+ flow_sidx_t tosidx)
+{
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ const struct udp_flow *uflow = udp_at_sidx(tosidx);
+ uint8_t topif = pif_at_sidx(tosidx);
+ int to_s = uflow->s[tosidx.sidei];
+ socklen_t sl;
+ int i;
- udp_splice_ns[v6][dst].ts = now->tv_sec;
- udp_splice_init[v6][src].ts = now->tv_sec;
- } else {
- ASSERT(from_pif == PIF_HOST);
- src += c->udp.fwd_out.rdelta[src];
- s = udp_splice_ns[v6][src].sock;
- if (s < 0 && allow_new) {
- struct udp_splice_new_ns_arg arg = {
- c, v6, src, -1,
- };
-
- NS_CALL(udp_splice_new_ns, &arg);
- s = arg.s;
- }
- if (s < 0)
- return;
+ if ((n = udp_sock_recv(c, from_s, udp_mh_recv, n)) <= 0)
+ return;
- udp_splice_init[v6][dst].ts = now->tv_sec;
- udp_splice_ns[v6][src].ts = now->tv_sec;
+ for (i = 0; i < n; i++) {
+ udp_mh_splice[i].msg_hdr.msg_iov->iov_len
+ = udp_mh_recv[i].msg_len;
}
- for (i = start; i < start + n; i++)
- mmh_send[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len;
+ pif_sockaddr(c, &udp_splice_to, &sl, topif,
+ &toside->eaddr, toside->eport);
- sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
+ sendmmsg(to_s, udp_mh_splice, n, MSG_NOSIGNAL);
}
/**
- * udp_update_hdr4() - Update headers for one IPv4 datagram
+ * udp_buf_sock_to_tap() - Forward datagrams from socket to tap
* @c: Execution context
- * @b: Pointer to udp4_l2_buf to update
- * @dstport: Destination port number
- * @datalen: Length of UDP payload
- * @now: Current timestamp
- *
- * Return: size of tap frame with headers
+ * @s: Socket to read data from
+ * @n: Maximum number of datagrams to forward
+ * @tosidx: Flow & side to forward data from @s to
*/
-static size_t udp_update_hdr4(const struct ctx *c, struct udp4_l2_buf_t *b,
- in_port_t dstport, size_t datalen,
- const struct timespec *now)
+static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
+ flow_sidx_t tosidx)
{
- size_t ip_len = datalen + sizeof(b->iph) + sizeof(b->uh);
- in_port_t srcport = ntohs(b->s_in.sin_port);
- struct in_addr src = b->s_in.sin_addr;
-
- if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) &&
- IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 &&
- (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) {
- src = c->ip4.dns_match;
- } else if (IN4_IS_ADDR_LOOPBACK(&src) ||
- IN4_ARE_ADDR_EQUAL(&src, &c->ip4.addr_seen)) {
- udp_tap_map[V4][srcport].ts = now->tv_sec;
- udp_tap_map[V4][srcport].flags |= PORT_LOCAL;
-
- if (IN4_IS_ADDR_LOOPBACK(&src))
- udp_tap_map[V4][srcport].flags |= PORT_LOOPBACK;
- else
- udp_tap_map[V4][srcport].flags &= ~PORT_LOOPBACK;
-
- bitmap_set(udp_act[V4][UDP_ACT_TAP], srcport);
-
- src = c->ip4.gw;
- }
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ int i;
- b->iph.tot_len = htons(ip_len);
- b->iph.daddr = c->ip4.addr_seen.s_addr;
- b->iph.saddr = src.s_addr;
- b->iph.check = csum_ip4_header(b->iph.tot_len, IPPROTO_UDP,
- src, c->ip4.addr_seen);
+ if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
+ return;
- b->uh.source = b->s_in.sin_port;
- b->uh.dest = htons(dstport);
- b->uh.len = htons(datalen + sizeof(b->uh));
+ for (i = 0; i < n; i++)
+ udp_tap_prepare(udp_mh_recv, i, toside, false);
- return tap_frame_len(c, &b->taph, ip_len);
+ tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
}
/**
- * udp_update_hdr6() - Update headers for one IPv6 datagram
+ * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
* @c: Execution context
- * @b: Pointer to udp6_l2_buf to update
- * @dstport: Destination port number
- * @datalen: Length of UDP payload
+ * @s: Socket to forward from
+ * @frompif: Interface to which @s belongs
+ * @port: Our (local) port number of @s
* @now: Current timestamp
- *
- * Return: size of tap frame with headers
*/
-static size_t udp_update_hdr6(const struct ctx *c, struct udp6_l2_buf_t *b,
- in_port_t dstport, size_t datalen,
- const struct timespec *now)
+void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
+ in_port_t port, const struct timespec *now)
{
- const struct in6_addr *src = &b->s_in6.sin6_addr;
- const struct in6_addr *dst = &c->ip6.addr_seen;
- uint16_t payload_len = datalen + sizeof(b->uh);
- in_port_t srcport = ntohs(b->s_in6.sin6_port);
-
- if (IN6_IS_ADDR_LINKLOCAL(src)) {
- dst = &c->ip6.addr_ll_seen;
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) &&
- IN6_ARE_ADDR_EQUAL(src, &c->ip6.dns_host) &&
- srcport == 53 &&
- (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) {
- src = &c->ip6.dns_match;
- } else if (IN6_IS_ADDR_LOOPBACK(src) ||
- IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr_seen) ||
- IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) {
- udp_tap_map[V6][srcport].ts = now->tv_sec;
- udp_tap_map[V6][srcport].flags |= PORT_LOCAL;
-
- if (IN6_IS_ADDR_LOOPBACK(src))
- udp_tap_map[V6][srcport].flags |= PORT_LOOPBACK;
- else
- udp_tap_map[V6][srcport].flags &= ~PORT_LOOPBACK;
-
- if (IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr))
- udp_tap_map[V6][srcport].flags |= PORT_GUA;
- else
- udp_tap_map[V6][srcport].flags &= ~PORT_GUA;
-
- bitmap_set(udp_act[V6][UDP_ACT_TAP], srcport);
-
- dst = &c->ip6.addr_ll_seen;
-
- if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
- src = &c->ip6.gw;
- else
- src = &c->ip6.addr_ll;
+ union sockaddr_inany src;
+ union inany_addr dst;
+ int rc;
+
+ while ((rc = udp_peek_addr(s, &src, &dst)) != 0) {
+ bool discard = false;
+ flow_sidx_t tosidx;
+ uint8_t topif;
+
+ if (rc < 0) {
+ trace("Error peeking at socket address: %s",
+ strerror_(-rc));
+ /* Clear errors & carry on */
+ if (udp_sock_errs(c, s, FLOW_SIDX_NONE,
+ frompif, port) < 0) {
+ err(
+"UDP: Unrecoverable error on listening socket: (%s port %hu)",
+ pif_name(frompif), port);
+ /* FIXME: what now? close/re-open socket? */
+ }
+ continue;
+ }
- }
+ tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
+ topif = pif_at_sidx(tosidx);
- b->ip6h.payload_len = htons(payload_len);
- b->ip6h.daddr = *dst;
- b->ip6h.saddr = *src;
- b->ip6h.version = 6;
- b->ip6h.nexthdr = IPPROTO_UDP;
- b->ip6h.hop_limit = 255;
+ if (pif_is_socket(topif)) {
+ udp_sock_to_sock(c, s, 1, tosidx);
+ } else if (topif == PIF_TAP) {
+ if (c->mode == MODE_VU)
+ udp_vu_sock_to_tap(c, s, 1, tosidx);
+ else
+ udp_buf_sock_to_tap(c, s, 1, tosidx);
+ } else if (flow_sidx_valid(tosidx)) {
+ struct udp_flow *uflow = udp_at_sidx(tosidx);
+
+ flow_err(uflow,
+ "No support for forwarding UDP from %s to %s",
+ pif_name(frompif), pif_name(topif));
+ discard = true;
+ } else {
+ debug("Discarding datagram without flow");
+ discard = true;
+ }
- b->uh.source = b->s_in6.sin6_port;
- b->uh.dest = htons(dstport);
- b->uh.len = b->ip6h.payload_len;
- csum_udp6(&b->uh, src, dst, b->data, datalen);
+ if (discard) {
+ struct msghdr msg = { 0 };
- return tap_frame_len(c, &b->taph, payload_len + sizeof(b->ip6h));
+ if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
+ debug_perror("Failed to discard datagram");
+ }
+ }
}
/**
- * udp_tap_send() - Prepare UDP datagrams and send to tap interface
+ * udp_listen_sock_handler() - Handle new data from socket
* @c: Execution context
- * @start: Index of first datagram in udp[46]_l2_buf pool
- * @n: Number of datagrams to send
- * @dstport: Destination port number
- * @v6: True if using IPv6
+ * @ref: epoll reference
+ * @events: epoll events bitmap
* @now: Current timestamp
- *
- * Return: size of tap frame with headers
*/
-static void udp_tap_send(const struct ctx *c,
- unsigned int start, unsigned int n,
- in_port_t dstport, bool v6, const struct timespec *now)
+void udp_listen_sock_handler(const struct ctx *c,
+ union epoll_ref ref, uint32_t events,
+ const struct timespec *now)
{
- struct iovec *tap_iov;
- unsigned int i;
-
- if (v6)
- tap_iov = udp6_l2_iov_tap;
- else
- tap_iov = udp4_l2_iov_tap;
-
- for (i = start; i < start + n; i++) {
- size_t buf_len;
-
- if (v6)
- buf_len = udp_update_hdr6(c, &udp6_l2_buf[i], dstport,
- udp6_l2_mh_sock[i].msg_len, now);
- else
- buf_len = udp_update_hdr4(c, &udp4_l2_buf[i], dstport,
- udp4_l2_mh_sock[i].msg_len, now);
-
- tap_iov[i].iov_len = buf_len;
- }
-
- tap_send_frames(c, tap_iov + start, 1, n);
+ if (events & (EPOLLERR | EPOLLIN))
+ udp_sock_fwd(c, ref.fd, ref.udp.pif, ref.udp.port, now);
}
/**
- * udp_sock_handler() - Handle new data from socket
+ * udp_sock_handler() - Handle new data from flow specific socket
* @c: Execution context
* @ref: epoll reference
* @events: epoll events bitmap
* @now: Current timestamp
- *
- * #syscalls recvmmsg
*/
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
- const struct timespec *now)
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events, const struct timespec *now)
{
- /* For not entirely clear reasons (data locality?) pasta gets
- * better throughput if we receive tap datagrams one at a
- * atime. For small splice datagrams throughput is slightly
- * better if we do batch, but it's slightly worse for large
- * splice datagrams. Since we don't know before we receive
- * whether we'll use tap or splice, always go one at a time
- * for pasta mode.
- */
- ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1);
- in_port_t dstport = ref.udp.port;
- bool v6 = ref.udp.v6;
- struct mmsghdr *mmh_recv;
- int i, m;
-
- if (c->no_udp || !(events & EPOLLIN))
- return;
+ struct udp_flow *uflow = udp_at_sidx(ref.flowside);
- if (ref.udp.pif == PIF_SPLICE)
- dstport += c->udp.fwd_out.f.delta[dstport];
- else if (ref.udp.pif == PIF_HOST)
- dstport += c->udp.fwd_in.f.delta[dstport];
+ ASSERT(!c->no_udp && uflow);
- if (v6) {
- mmh_recv = udp6_l2_mh_sock;
- udp6_localname.sin6_port = htons(dstport);
- } else {
- mmh_recv = udp4_l2_mh_sock;
- udp4_localname.sin_port = htons(dstport);
+ if (events & EPOLLERR) {
+ if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) {
+ flow_err(uflow, "Unrecoverable error on flow socket");
+ goto fail;
+ }
}
- n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL);
- if (n <= 0)
- return;
-
- for (i = 0; i < n; i += m) {
- int splicefrom = -1;
- m = n;
-
- if (ref.udp.splice) {
- splicefrom = udp_mmh_splice_port(v6, mmh_recv + i);
-
- for (m = 1; i + m < n; m++) {
- int p;
-
- p = udp_mmh_splice_port(v6, mmh_recv + i + m);
- if (p != splicefrom)
- break;
+ if (events & EPOLLIN) {
+ /* For not entirely clear reasons (data locality?) pasta gets
+ * better throughput if we receive tap datagrams one at a
+ * time. For small splice datagrams throughput is slightly
+ * better if we do batch, but it's slightly worse for large
+ * splice datagrams. Since we don't know the size before we
+ * receive, always go one at a time for pasta mode.
+ */
+ size_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
+ flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+ uint8_t topif = pif_at_sidx(tosidx);
+ int s = ref.fd;
+
+ flow_trace(uflow, "Received data on reply socket");
+ uflow->ts = now->tv_sec;
+
+ if (pif_is_socket(topif)) {
+ udp_sock_to_sock(c, ref.fd, n, tosidx);
+ } else if (topif == PIF_TAP) {
+ if (c->mode == MODE_VU) {
+ udp_vu_sock_to_tap(c, s, UDP_MAX_FRAMES,
+ tosidx);
+ } else {
+ udp_buf_sock_to_tap(c, s, n, tosidx);
}
+ } else {
+ flow_err(uflow,
+ "No support for forwarding UDP from %s to %s",
+ pif_name(pif_at_sidx(ref.flowside)),
+ pif_name(topif));
+ goto fail;
}
-
- if (splicefrom >= 0)
- udp_splice_sendfrom(c, i, m, splicefrom, dstport,
- ref.udp.pif, v6, ref.udp.orig, now);
- else
- udp_tap_send(c, i, m, dstport, v6, now);
}
+ return;
+
+fail:
+ flow_err_details(uflow);
+ udp_flow_close(c, uflow);
}
/**
@@ -795,6 +960,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
+ * @ttl: TTL or hop limit for packets to be sent in this call
* @p: Pool of UDP packets, with UDP headers
* @idx: Index of first packet to process
* @now: Current timestamp
@@ -803,23 +969,24 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
*
* #syscalls sendmmsg
*/
-int udp_tap_handler(struct ctx *c, uint8_t pif,
+int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now)
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now)
{
+ const struct flowside *toside;
struct mmsghdr mm[UIO_MAXIOV];
+ union sockaddr_inany to_sa;
struct iovec m[UIO_MAXIOV];
- struct sockaddr_in6 s_in6;
- struct sockaddr_in s_in;
const struct udphdr *uh;
- struct sockaddr *sa;
+ struct udp_flow *uflow;
int i, s, count = 0;
+ flow_sidx_t tosidx;
in_port_t src, dst;
+ uint8_t topif;
socklen_t sl;
- (void)c;
- (void)saddr;
- (void)pif;
+ ASSERT(!c->no_udp);
uh = packet_get(p, idx, 0, sizeof(*uh), NULL);
if (!uh)
@@ -831,113 +998,32 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
src = ntohs(uh->source);
dst = ntohs(uh->dest);
- if (af == AF_INET) {
- s_in = (struct sockaddr_in) {
- .sin_family = AF_INET,
- .sin_port = uh->dest,
- .sin_addr = *(struct in_addr *)daddr,
- };
-
- sa = (struct sockaddr *)&s_in;
- sl = sizeof(s_in);
-
- if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.dns_match) &&
- ntohs(s_in.sin_port) == 53) {
- s_in.sin_addr = c->ip4.dns_host;
- udp_tap_map[V4][src].ts = now->tv_sec;
- udp_tap_map[V4][src].flags |= PORT_DNS_FWD;
- bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
- } else if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.gw) &&
- !c->no_map_gw) {
- if (!(udp_tap_map[V4][dst].flags & PORT_LOCAL) ||
- (udp_tap_map[V4][dst].flags & PORT_LOOPBACK))
- s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- else
- s_in.sin_addr = c->ip4.addr_seen;
- }
-
- debug("UDP from tap src=%hu dst=%hu, s=%d",
- src, dst, udp_tap_map[V4][src].sock);
- if ((s = udp_tap_map[V4][src].sock) < 0) {
- struct in_addr bind_addr = IN4ADDR_ANY_INIT;
- union udp_epoll_ref uref = {
- .port = src,
- .pif = PIF_HOST,
- };
- const char *bind_if = NULL;
-
- if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr))
- bind_if = c->ip4.ifname_out;
-
- if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr))
- bind_addr = c->ip4.addr_out;
-
- s = sock_l4(c, AF_INET, IPPROTO_UDP, &bind_addr,
- bind_if, src, uref.u32);
- if (s < 0)
- return p->count - idx;
-
- udp_tap_map[V4][src].sock = s;
- bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
- }
+ tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now);
+ if (!(uflow = udp_at_sidx(tosidx))) {
+ char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
- udp_tap_map[V4][src].ts = now->tv_sec;
- } else {
- s_in6 = (struct sockaddr_in6) {
- .sin6_family = AF_INET6,
- .sin6_port = uh->dest,
- .sin6_addr = *(struct in6_addr *)daddr,
- };
- const struct in6_addr *bind_addr = &in6addr_any;
-
- sa = (struct sockaddr *)&s_in6;
- sl = sizeof(s_in6);
-
- if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) &&
- ntohs(s_in6.sin6_port) == 53) {
- s_in6.sin6_addr = c->ip6.dns_host;
- udp_tap_map[V6][src].ts = now->tv_sec;
- udp_tap_map[V6][src].flags |= PORT_DNS_FWD;
- bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
- } else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) &&
- !c->no_map_gw) {
- if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) ||
- (udp_tap_map[V6][dst].flags & PORT_LOOPBACK))
- s_in6.sin6_addr = in6addr_loopback;
- else if (udp_tap_map[V6][dst].flags & PORT_GUA)
- s_in6.sin6_addr = c->ip6.addr;
- else
- s_in6.sin6_addr = c->ip6.addr_seen;
- } else if (IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) {
- bind_addr = &c->ip6.addr_ll;
- }
-
- if ((s = udp_tap_map[V6][src].sock) < 0) {
- union udp_epoll_ref uref = {
- .v6 = 1,
- .port = src,
- .pif = PIF_HOST,
- };
- const char *bind_if = NULL;
-
- if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr))
- bind_if = c->ip6.ifname_out;
+ debug("Dropping datagram with no flow %s %s:%hu -> %s:%hu",
+ pif_name(pif),
+ inet_ntop(af, saddr, sstr, sizeof(sstr)), src,
+ inet_ntop(af, daddr, dstr, sizeof(dstr)), dst);
+ return 1;
+ }
- if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr) &&
- !IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr))
- bind_addr = &c->ip6.addr_out;
+ topif = pif_at_sidx(tosidx);
+ if (topif != PIF_HOST) {
+ flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
+ uint8_t frompif = pif_at_sidx(fromsidx);
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
- bind_if, src, uref.u32);
- if (s < 0)
- return p->count - idx;
+ flow_err(uflow, "No support for forwarding UDP from %s to %s",
+ pif_name(frompif), pif_name(topif));
+ return 1;
+ }
+ toside = flowside_at_sidx(tosidx);
- udp_tap_map[V6][src].sock = s;
- bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
- }
+ s = uflow->s[tosidx.sidei];
+ ASSERT(s >= 0);
- udp_tap_map[V6][src].ts = now->tv_sec;
- }
+ pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
for (i = 0; i < (int)p->count - idx; i++) {
struct udphdr *uh_send;
@@ -947,7 +1033,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
if (!uh_send)
return p->count - idx;
- mm[i].msg_hdr.msg_name = sa;
+ mm[i].msg_hdr.msg_name = &to_sa;
mm[i].msg_hdr.msg_namelen = sl;
if (len) {
@@ -965,6 +1051,24 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
mm[i].msg_hdr.msg_controllen = 0;
mm[i].msg_hdr.msg_flags = 0;
+ if (ttl != uflow->ttl[tosidx.sidei]) {
+ uflow->ttl[tosidx.sidei] = ttl;
+ if (af == AF_INET) {
+ if (setsockopt(s, IPPROTO_IP, IP_TTL,
+ &ttl, sizeof(ttl)) < 0)
+ flow_perror(uflow,
+ "setsockopt IP_TTL");
+ } else {
+ /* IPv6 hop_limit cannot be only 1 byte */
+ int hop_limit = ttl;
+
+ if (setsockopt(s, SOL_IPV6, IPV6_UNICAST_HOPS,
+ &hop_limit, sizeof(hop_limit)) < 0)
+ flow_perror(uflow,
+ "setsockopt IPV6_UNICAST_HOPS");
+ }
+ }
+
count++;
}
@@ -979,56 +1083,62 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
* udp_sock_init() - Initialise listening sockets for a given port
* @c: Execution context
* @ns: In pasta mode, if set, bind with loopback address in namespace
- * @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
* Return: 0 on (partial) success, negative error code on (complete) failure
*/
-int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
- const void *addr, const char *ifname, in_port_t port)
+int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
+ const char *ifname, in_port_t port)
{
- union udp_epoll_ref uref = { .splice = (c->mode == MODE_PASTA),
- .orig = true, .port = port };
- int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+ union udp_listen_epoll_ref uref = {
+ .pif = ns ? PIF_SPLICE : PIF_HOST,
+ .port = port,
+ };
+ int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
+
+ ASSERT(!c->no_udp);
- if (ns)
- uref.pif = PIF_SPLICE;
- else
- uref.pif = PIF_HOST;
+ if (!addr && c->ifi4 && c->ifi6 && !ns) {
+ int s;
- if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
- uref.v6 = 0;
+ /* Attempt to get a dual stack socket */
+ s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+ NULL, ifname, port, uref.u32);
+ udp_splice_init[V4][port] = s < 0 ? -1 : s;
+ udp_splice_init[V6][port] = s < 0 ? -1 : s;
+ if (IN_INTERVAL(0, FD_REF_MAX, s))
+ return 0;
+ }
+ if ((!addr || inany_v4(addr)) && c->ifi4) {
if (!ns) {
- r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr,
- ifname, port, uref.u32);
+ r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+ addr ? addr : &inany_any4, ifname,
+ port, uref.u32);
- udp_tap_map[V4][uref.port].sock = s < 0 ? -1 : s;
- udp_splice_init[V4][port].sock = s < 0 ? -1 : s;
+ udp_splice_init[V4][port] = r4 < 0 ? -1 : r4;
} else {
- r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP,
- &in4addr_loopback,
- ifname, port, uref.u32);
- udp_splice_ns[V4][port].sock = s < 0 ? -1 : s;
+ r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
+ &inany_loopback4, ifname,
+ port, uref.u32);
+ udp_splice_ns[V4][port] = r4 < 0 ? -1 : r4;
}
}
- if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
- uref.v6 = 1;
-
+ if ((!addr || !inany_v4(addr)) && c->ifi6) {
if (!ns) {
- r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr,
- ifname, port, uref.u32);
+ r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
+ addr ? addr : &inany_any6, ifname,
+ port, uref.u32);
- udp_tap_map[V6][uref.port].sock = s < 0 ? -1 : s;
- udp_splice_init[V6][port].sock = s < 0 ? -1 : s;
+ udp_splice_init[V6][port] = r6 < 0 ? -1 : r6;
} else {
- r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP,
- &in6addr_loopback,
- ifname, port, uref.u32);
- udp_splice_ns[V6][port].sock = s < 0 ? -1 : s;
+ r6 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_SPLICE,
+ &inany_loopback6, ifname,
+ port, uref.u32);
+ udp_splice_ns[V6][port] = r6 < 0 ? -1 : r6;
}
}
@@ -1046,73 +1156,15 @@ static void udp_splice_iov_init(void)
int i;
for (i = 0; i < UDP_MAX_FRAMES; i++) {
- struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr;
- struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr;
+ struct msghdr *mh = &udp_mh_splice[i].msg_hdr;
- mh4->msg_name = &udp4_localname;
- mh4->msg_namelen = sizeof(udp4_localname);
+ mh->msg_name = &udp_splice_to;
+ mh->msg_namelen = sizeof(udp_splice_to);
- mh6->msg_name = &udp6_localname;
- mh6->msg_namelen = sizeof(udp6_localname);
+ udp_iov_splice[i].iov_base = udp_payload[i].data;
- udp4_iov_splice[i].iov_base = udp4_l2_buf[i].data;
- udp6_iov_splice[i].iov_base = udp6_l2_buf[i].data;
-
- mh4->msg_iov = &udp4_iov_splice[i];
- mh6->msg_iov = &udp6_iov_splice[i];
- mh4->msg_iovlen = mh6->msg_iovlen = 1;
- }
-}
-
-/**
- * udp_timer_one() - Handler for timed events on one port
- * @c: Execution context
- * @v6: Set for IPv6 connections
- * @type: Socket type
- * @port: Port number, host order
- * @now: Current timestamp
- */
-static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type,
- in_port_t port, const struct timespec *now)
-{
- struct udp_splice_port *sp;
- struct udp_tap_port *tp;
- int *sockp = NULL;
-
- switch (type) {
- case UDP_ACT_TAP:
- tp = &udp_tap_map[v6 ? V6 : V4][port];
-
- if (now->tv_sec - tp->ts > UDP_CONN_TIMEOUT) {
- sockp = &tp->sock;
- tp->flags = 0;
- }
-
- break;
- case UDP_ACT_SPLICE_INIT:
- sp = &udp_splice_init[v6 ? V6 : V4][port];
-
- if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT)
- sockp = &sp->sock;
-
- break;
- case UDP_ACT_SPLICE_NS:
- sp = &udp_splice_ns[v6 ? V6 : V4][port];
-
- if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT)
- sockp = &sp->sock;
-
- break;
- default:
- return;
- }
-
- if (sockp && *sockp >= 0) {
- int s = *sockp;
- *sockp = -1;
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
- close(s);
- bitmap_clear(udp_act[v6 ? V6 : V4][type], port);
+ mh->msg_iov = &udp_iov_splice[i];
+ mh->msg_iovlen = 1;
}
}
@@ -1125,24 +1177,23 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type,
*/
static void udp_port_rebind(struct ctx *c, bool outbound)
{
+ int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init;
const uint8_t *fmap
- = outbound ? c->udp.fwd_out.f.map : c->udp.fwd_in.f.map;
+ = outbound ? c->udp.fwd_out.map : c->udp.fwd_in.map;
const uint8_t *rmap
- = outbound ? c->udp.fwd_in.f.map : c->udp.fwd_out.f.map;
- struct udp_splice_port (*socks)[NUM_PORTS]
- = outbound ? udp_splice_ns : udp_splice_init;
+ = outbound ? c->udp.fwd_in.map : c->udp.fwd_out.map;
unsigned port;
for (port = 0; port < NUM_PORTS; port++) {
if (!bitmap_isset(fmap, port)) {
- if (socks[V4][port].sock >= 0) {
- close(socks[V4][port].sock);
- socks[V4][port].sock = -1;
+ if (socks[V4][port] >= 0) {
+ close(socks[V4][port]);
+ socks[V4][port] = -1;
}
- if (socks[V6][port].sock >= 0) {
- close(socks[V6][port].sock);
- socks[V6][port].sock = -1;
+ if (socks[V6][port] >= 0) {
+ close(socks[V6][port]);
+ socks[V6][port] = -1;
}
continue;
@@ -1152,9 +1203,9 @@ static void udp_port_rebind(struct ctx *c, bool outbound)
if (bitmap_isset(rmap, port))
continue;
- if ((c->ifi4 && socks[V4][port].sock == -1) ||
- (c->ifi6 && socks[V6][port].sock == -1))
- udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port);
+ if ((c->ifi4 && socks[V4][port] == -1) ||
+ (c->ifi6 && socks[V6][port] == -1))
+ udp_sock_init(c, outbound, NULL, NULL, port);
}
}
@@ -1183,43 +1234,23 @@ static int udp_port_rebind_outbound(void *arg)
*/
void udp_timer(struct ctx *c, const struct timespec *now)
{
- int n, t, v6 = 0;
- unsigned int i;
- long *word, tmp;
+ (void)now;
+
+ ASSERT(!c->no_udp);
if (c->mode == MODE_PASTA) {
- if (c->udp.fwd_out.f.mode == FWD_AUTO) {
- fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f,
+ if (c->udp.fwd_out.mode == FWD_AUTO) {
+ fwd_scan_ports_udp(&c->udp.fwd_out, &c->udp.fwd_in,
&c->tcp.fwd_out, &c->tcp.fwd_in);
NS_CALL(udp_port_rebind_outbound, c);
}
- if (c->udp.fwd_in.f.mode == FWD_AUTO) {
- fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f,
+ if (c->udp.fwd_in.mode == FWD_AUTO) {
+ fwd_scan_ports_udp(&c->udp.fwd_in, &c->udp.fwd_out,
&c->tcp.fwd_in, &c->tcp.fwd_out);
udp_port_rebind(c, false);
}
}
-
- if (!c->ifi4)
- v6 = 1;
-v6:
- for (t = 0; t < UDP_ACT_TYPE_MAX; t++) {
- word = (long *)udp_act[v6 ? V6 : V4][t];
- for (i = 0; i < ARRAY_SIZE(udp_act[0][0]);
- i += sizeof(long), word++) {
- tmp = *word;
- while ((n = ffsl(tmp))) {
- tmp &= ~(1UL << (n - 1));
- udp_timer_one(c, v6, t, i * 8 + n - 1, now);
- }
- }
- }
-
- if (!v6 && c->ifi6) {
- v6 = 1;
- goto v6;
- }
}
/**
@@ -1230,10 +1261,9 @@ v6:
*/
int udp_init(struct ctx *c)
{
- udp_sock_iov_init(c);
+ ASSERT(!c->no_udp);
- udp_invert_portmap(&c->udp.fwd_in);
- udp_invert_portmap(&c->udp.fwd_out);
+ udp_iov_init(c);
if (c->mode == MODE_PASTA) {
udp_splice_iov_init();