diff options
author | Stefano Brivio <sbrivio@redhat.com> | 2021-04-29 16:59:20 +0200 |
---|---|---|
committer | Stefano Brivio <sbrivio@redhat.com> | 2021-04-29 17:15:26 +0200 |
commit | 605af213c5e0fa047f6d8caef5bcef61a0987c8d (patch) | |
tree | 45615e603964adee64bfecdc40e119bd33d77859 /tcp.c | |
parent | 50bcddabc9e2c0dbd313a61cda1606045a67a8de (diff) | |
download | passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.tar passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.tar.gz passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.tar.bz2 passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.tar.lz passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.tar.xz passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.tar.zst passt-605af213c5e0fa047f6d8caef5bcef61a0987c8d.zip |
udp: Connection tracking for ephemeral, local ports, and related fixes
As we support UDP forwarding for packets that are sent to local
ports, we actually need some kind of connection tracking for UDP.
While at it, this commit introduces a number of vaguely related fixes
for issues observed while trying this out. In detail:
- implement explicit, albeit minimalistic, connection tracking
for UDP, to allow usage of ephemeral ports by the guest and by
the host at the same time, by binding them dynamically as needed,
and to allow mapping address changes for packets with a loopback
address as destination
- set the guest MAC address whenever we receive a packet from tap
instead of waiting for an ARP request, and set it to broadcast on
start; otherwise, DHCPv6 might not work if all DHCPv6 requests time
out before the guest starts talking IPv4
- split context IPv6 address into address we assign, global or site
address seen on tap, and link-local address seen on tap, and make
sure we use the addresses we've seen as destination (link-local
choice depends on source address). Similarly, for IPv4, split into
address we assign and address we observe, and use the address we
observe as destination
- introduce a clock_gettime() syscall right after epoll_wait() wakes
up, so that we can remove all the other ones and pass the current
timestamp to tap and socket handlers -- this is additionally needed
by UDP to time out bindings to ephemeral ports and mappings between
loopback address and a local address
- rename sock_l4_add() to sock_l4(), no semantic changes intended
- include <arpa/inet.h> in passt.c before kernel headers so that we
can use <netinet/in.h> macros to check IPv6 address types, and
remove a duplicate <linux/ip.h> inclusion
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'tcp.c')
-rw-r--r-- | tcp.c | 72 |
1 files changed, 38 insertions, 34 deletions
@@ -846,17 +846,16 @@ static void tcp_clamp_window(int s, struct tcphdr *th, int len, int init) * @addr: Remote address, pointer to sin_addr or sin6_addr * @dstport: Destination port, connection-wise, network order * @srcport: Source port, connection-wise, network order + * @now: Current timestamp * * Return: initial TCP sequence */ static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr, - in_port_t dstport, in_port_t srcport) + in_port_t dstport, in_port_t srcport, + struct timespec *now) { - struct timespec ts = { 0 }; uint32_t ns, seq = 0; - clock_gettime(CLOCK_MONOTONIC, &ts); - if (af == AF_INET) { struct { struct in_addr src; @@ -887,8 +886,8 @@ static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr, seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); } - ns = ts.tv_sec * 1E9; - ns += ts.tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ + ns = now->tv_sec * 1E9; + ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ return seq + ns; } @@ -900,9 +899,11 @@ static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr, * @addr: Remote address, pointer to sin_addr or sin6_addr * @th: TCP header from tap * @len: Packet length at L4 + * @now: Current timestamp */ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, - struct tcphdr *th, size_t len) + struct tcphdr *th, size_t len, + struct timespec *now) { struct sockaddr_in addr4 = { .sin_family = AF_INET, @@ -948,9 +949,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, tc[s].sock_port = th->dest; tc[s].tap_port = th->source; - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock); - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap); - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap); + tc[s].ts_sock = tc[s].ts_tap = tc[s].ts_ack_tap = *now; tcp_act_set(s); @@ -961,7 +960,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1; tc[s].seq_ack_to_tap = tc[s].seq_from_tap; - 
tc[s].seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source); + tc[s].seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now); tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1; tcp_sock_hash_insert(c, s, af, addr, th->source, th->dest); @@ -988,8 +987,9 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, * tcp_conn_from_sock() - Handle new connection request from listening socket * @c: Execution context * @fd: File descriptor number for listening socket + * @now: Current timestamp */ -static void tcp_conn_from_sock(struct ctx *c, int fd) +static void tcp_conn_from_sock(struct ctx *c, int fd, struct timespec *now) { struct sockaddr_storage sa_r, sa_l; socklen_t sa_len = sizeof(sa_l); @@ -1023,7 +1023,8 @@ static void tcp_conn_from_sock(struct ctx *c, int fd) tc[s].seq_to_tap = tcp_seq_init(c, AF_INET, &sa4->sin_addr, tc[s].sock_port, - tc[s].tap_port); + tc[s].tap_port, + now); tcp_sock_hash_insert(c, s, AF_INET, &sa4->sin_addr, tc[s].tap_port, tc[s].sock_port); @@ -1040,7 +1041,8 @@ static void tcp_conn_from_sock(struct ctx *c, int fd) tc[s].seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6->sin6_addr, tc[s].sock_port, - tc[s].tap_port); + tc[s].tap_port, + now); tcp_sock_hash_insert(c, s, AF_INET6, &sa6->sin6_addr, tc[s].tap_port, tc[s].sock_port); @@ -1051,9 +1053,7 @@ static void tcp_conn_from_sock(struct ctx *c, int fd) tc[s].tap_window = WINDOW_DEFAULT; tc[s].ws_allowed = 1; - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock); - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap); - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap); + tc[s].ts_sock = tc[s].ts_tap = tc[s].ts_ack_tap = *now; tcp_act_set(s); @@ -1143,10 +1143,11 @@ static void tcp_sock_consume(int s, uint32_t ack_seq) * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @s: File descriptor number for socket + * @now: Current timestamp * * Return: negative on connection reset, 1 on pending data, 0 otherwise */ -static int 
tcp_data_from_sock(struct ctx *c, int s) +static int tcp_data_from_sock(struct ctx *c, int s, struct timespec *now) { int len, err, offset, left, send; @@ -1188,7 +1189,7 @@ static int tcp_data_from_sock(struct ctx *c, int s) } out: - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock); + tc[s].ts_sock = *now; return !!left; } @@ -1199,11 +1200,12 @@ out: * @af: Address family, AF_INET or AF_INET6 * @msg: Input messages * @count: Message count + * @now: Current timestamp * * Return: count of consumed packets */ int tcp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_msg *msg, int count) + struct tap_msg *msg, int count, struct timespec *now) { /* TODO: Implement message batching for TCP */ struct tcphdr *th = (struct tcphdr *)msg[0].l4h; @@ -1224,7 +1226,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if ((s = tcp_sock_hash_lookup(c, af, addr, th->source, th->dest)) < 0) { if (th->syn) - tcp_conn_from_tap(c, af, addr, th, len); + tcp_conn_from_tap(c, af, addr, th, len, now); return 1; } @@ -1235,7 +1237,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, tcp_clamp_window(s, th, len, th->syn && th->ack); - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap); + tc[s].ts_tap = *now; if (ntohl(th->seq) < tc[s].seq_from_tap) skip = tc[s].seq_from_tap - ntohl(th->seq); @@ -1275,7 +1277,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. 
*/ - tcp_data_from_sock(c, s); + tcp_data_from_sock(c, s, now); ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLHUP; ev.data.fd = s; @@ -1298,7 +1300,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, break; case ESTABLISHED: case ESTABLISHED_SOCK_FIN: - clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap); + tc[s].ts_ack_tap = *now; if (ntohl(th->seq) > tc[s].seq_from_tap) { tc[s].seq_from_tap = tc[s].seq_ack_to_tap; @@ -1318,7 +1320,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, tc[s].seq_to_tap = tc[s].seq_ack_from_tap; if (tc[s].s == ESTABLISHED_SOCK_FIN) { - if (!tcp_data_from_sock(c, s)) + if (!tcp_data_from_sock(c, s, now)) tcp_set_state(s, CLOSE_WAIT); } } @@ -1400,8 +1402,10 @@ static void tcp_connect_finish(struct ctx *c, int s) * @c: Execution context * @s: File descriptor number for socket * @events: epoll events bitmap + * @now: Current timestamp */ -void tcp_sock_handler(struct ctx *c, int s, uint32_t events) +void tcp_sock_handler(struct ctx *c, int s, uint32_t events, + struct timespec *now) { socklen_t sl; int accept; @@ -1434,7 +1438,7 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events) } if (accept) { - tcp_conn_from_sock(c, s); + tcp_conn_from_sock(c, s, now); return; } @@ -1444,18 +1448,18 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events) } if (tc[s].s == ESTABLISHED) - tcp_data_from_sock(c, s); + tcp_data_from_sock(c, s, now); if (events & EPOLLRDHUP || events & EPOLLHUP) { if (tc[s].s == ESTABLISHED) { tcp_set_state(s, ESTABLISHED_SOCK_FIN); shutdown(s, SHUT_RD); - tcp_data_from_sock(c, s); + tcp_data_from_sock(c, s, now); tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); } else if (tc[s].s == FIN_WAIT_1) { tcp_set_state(s, FIN_WAIT_1_SOCK_FIN); shutdown(s, SHUT_RD); - tcp_data_from_sock(c, s); + tcp_data_from_sock(c, s, now); tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); tcp_sock_consume(s, tc[s].seq_ack_from_tap); } @@ -1477,15 +1481,15 @@ int tcp_sock_init(struct ctx *c) c->tcp.fd_max = 
c->tcp.fd_listen_max = c->tcp.fd_conn_max = 0; CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s); - for (port = 0; port < (1 << 15) + (1 << 14); port++) { + for (port = 0; !PORT_IS_EPHEMERAL(port); port++) { if (c->v4) { - if ((s = sock_l4_add(c, 4, IPPROTO_TCP, port)) < 0) + if ((s = sock_l4(c, AF_INET, IPPROTO_TCP, port)) < 0) return -1; CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s); } if (c->v6) { - if ((s = sock_l4_add(c, 6, IPPROTO_TCP, port)) < 0) + if ((s = sock_l4(c, AF_INET6, IPPROTO_TCP, port)) < 0) return -1; CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s); } @@ -1540,7 +1544,7 @@ static void tcp_timer_one(struct ctx *c, int s, struct timespec *ts) if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap) { tc[s].seq_to_tap = tc[s].seq_ack_from_tap; tc[s].ts_ack_tap = *ts; - tcp_data_from_sock(c, s); + tcp_data_from_sock(c, s, ts); } } |