aboutgitcodebugslistschat
diff options
context:
space:
mode:
-rw-r--r--arp.c2
-rw-r--r--arp.h2
-rw-r--r--dhcp.c29
-rw-r--r--dhcp.h2
-rwxr-xr-xdoc/demo.sh13
-rw-r--r--ndp.c2
-rw-r--r--ndp.h2
-rw-r--r--passt.c148
-rw-r--r--tcp.c676
-rw-r--r--tcp.h3
-rw-r--r--udp.c31
-rw-r--r--util.c14
12 files changed, 464 insertions, 460 deletions
diff --git a/arp.c b/arp.c
index 21b6417..e9ccd5e 100644
--- a/arp.c
+++ b/arp.c
@@ -49,7 +49,7 @@ struct arpmsg {
*
* Return: 0 if it's not an ARP message, 1 if handled, -1 on failure
*/
-int arp(struct ctx *c, unsigned len, struct ethhdr *eh)
+int arp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct arphdr *ah = (struct arphdr *)(eh + 1);
struct arpmsg *am = (struct arpmsg *)(ah + 1);
diff --git a/arp.h b/arp.h
index ef3bd19..70188b3 100644
--- a/arp.h
+++ b/arp.h
@@ -1 +1 @@
-int arp(struct ctx *c, unsigned len, struct ethhdr *eh);
+int arp(struct ctx *c, struct ethhdr *eh, size_t len);
diff --git a/dhcp.c b/dhcp.c
index 3af4ace..d4a5261 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -163,22 +163,39 @@ static int fill(struct msg *m)
*
* Return: 0 if it's not a DHCP message, 1 if handled, -1 on failure
*/
-int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh)
+int dhcp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct iphdr *iph = (struct iphdr *)(eh + 1);
- struct udphdr *uh = (struct udphdr *)((char *)iph + iph->ihl * 4);
- struct msg *m = (struct msg *)(uh + 1);
- unsigned int i, mlen = len - sizeof(*eh) - sizeof(*iph);
+ size_t mlen, olen;
+ struct udphdr *uh;
+ unsigned int i;
+ struct msg *m;
+
+ if (len < sizeof(*eh) + sizeof(*iph))
+ return 0;
+
+ if (len < sizeof(*eh) + iph->ihl * 4 + sizeof(*uh))
+ return 0;
+
+ uh = (struct udphdr *)((char *)iph + iph->ihl * 4);
+ m = (struct msg *)(uh + 1);
if (uh->dest != htons(67))
return 0;
- if (mlen != ntohs(uh->len) || mlen < offsetof(struct msg, o) ||
+ mlen = len - sizeof(*eh) - iph->ihl * 4 - sizeof(*uh);
+ if (mlen != ntohs(uh->len) - sizeof(*uh) ||
+ mlen < offsetof(struct msg, o) ||
m->op != BOOTREQUEST)
return -1;
- for (i = 0; i < mlen - offsetof(struct msg, o); i += m->o[i + 1] + 2)
+ olen = mlen - offsetof(struct msg, o);
+ for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) {
+ if (m->o[i + 1] + i + 2 >= olen)
+ return -1;
+
memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]);
+ }
if (opts[53].c[0] == DHCPDISCOVER) {
fprintf(stderr, "DHCP: offer to discover");
diff --git a/dhcp.h b/dhcp.h
index a519ee5..c4fbfe5 100644
--- a/dhcp.h
+++ b/dhcp.h
@@ -1 +1 @@
-int dhcp(struct ctx *c, unsigned len, struct ethhdr *eh);
+int dhcp(struct ctx *c, struct ethhdr *eh, size_t len);
diff --git a/doc/demo.sh b/doc/demo.sh
index 3d20491..3735130 100755
--- a/doc/demo.sh
+++ b/doc/demo.sh
@@ -48,10 +48,17 @@ ip netns add passt
ip link add veth_passt up netns passt type veth peer name veth_passt
ip link set dev veth_passt up
+
ip -n passt addr add 192.0.2.2/24 dev veth_passt
ip addr add 192.0.2.1/24 dev veth_passt
ip -n passt route add default via 192.0.2.1
+sysctl -w net.ipv4.ip_forward=1
+nft delete table passt_nat 2>/dev/null || :
+nft add table passt_nat
+nft 'add chain passt_nat postrouting { type nat hook postrouting priority -100 ; }'
+nft add rule passt_nat postrouting ip saddr 192.0.2.2 masquerade
+
ipv6_addr="$(ipv6_devaddr "$(ipv6_dev)")"
ipv6_passt="$(ipv6_mangle "${ipv6_addr}")"
ndp_setup "${ipv6_passt}"
@@ -59,11 +66,15 @@ ip -n passt addr add "${ipv6_passt}/$(ipv6_mask "${ipv6_addr}")" dev veth_passt
ip addr add "${ipv6_addr}" dev veth_passt
passt_ll="$(ipv6_ll_addr "veth_passt")"
main_ll="$(get_token "link/ether" $(ip -o li sh veth_passt))"
-ip -n passt neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}"
+ip neigh add "${passt_ll%%/*}" dev veth_passt lladdr "${main_ll}"
ip -n passt route add default via "${passt_ll%%/*}" dev veth_passt
+sysctl -w net.ipv6.conf.all.forwarding=1
+
+
ethtool -K veth_passt tx off
ip netns exec passt ethtool -K veth_passt tx off
ulimit -n 300000
+
ip netns exec passt ./passt
diff --git a/ndp.c b/ndp.c
index 9e38cec..952621d 100644
--- a/ndp.c
+++ b/ndp.c
@@ -40,7 +40,7 @@
*
* Return: 0 if not handled here, 1 if handled, -1 on failure
*/
-int ndp(struct ctx *c, unsigned len, struct ethhdr *eh)
+int ndp(struct ctx *c, struct ethhdr *eh, size_t len)
{
struct ethhdr *ehr;
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr;
diff --git a/ndp.h b/ndp.h
index 2c59713..b831c4d 100644
--- a/ndp.h
+++ b/ndp.h
@@ -1 +1 @@
-int ndp(struct ctx *c, unsigned len, struct ethhdr *eh);
+int ndp(struct ctx *c, struct ethhdr *eh, size_t len);
diff --git a/passt.c b/passt.c
index 4ef6e72..622ff38 100644
--- a/passt.c
+++ b/passt.c
@@ -51,9 +51,7 @@
#define EPOLL_EVENTS 10
-#define EPOLL_TIMEOUT 100 /* ms, for protocol periodic handlers */
-#define PERIODIC_HANDLER_FAST 100
-#define PERIODIC_HANDLER_SLOW 1000
+#define TIMER_INTERVAL 20 /* ms, for protocol periodic handlers */
/**
* sock_unix() - Create and bind AF_UNIX socket, add to epoll list
@@ -294,7 +292,7 @@ static void get_dns(struct ctx *c)
}
/**
- * tap4_handler() - IPv4 packet handler for tap file descriptor
+ * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor
* @c: Execution context
* @len: Total L2 packet length
* @in: Packet buffer, L2 headers
@@ -303,12 +301,18 @@ static void tap4_handler(struct ctx *c, char *in, size_t len)
{
struct ethhdr *eh = (struct ethhdr *)in;
struct iphdr *iph = (struct iphdr *)(eh + 1);
- char *l4h = (char *)iph + iph->ihl * 4;
char buf_s[BUFSIZ], buf_d[BUFSIZ];
+ char *l4h;
+
+ if (arp(c, eh, len) || dhcp(c, eh, len))
+ return;
- if (arp(c, len, eh) || dhcp(c, len, eh))
+ if (len < sizeof(*eh) + sizeof(*iph))
return;
+ l4h = (char *)iph + iph->ihl * 4;
+ len -= (intptr_t)l4h - (intptr_t)eh;
+
if (iph->protocol == IPPROTO_ICMP) {
fprintf(stderr, "icmp from tap: %s -> %s\n",
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
@@ -316,6 +320,9 @@ static void tap4_handler(struct ctx *c, char *in, size_t len)
} else {
struct tcphdr *th = (struct tcphdr *)l4h;
+ if (len < sizeof(*th) && len < sizeof(struct udphdr))
+ return;
+
fprintf(stderr, "%s from tap: %s:%i -> %s:%i\n",
getprotobynumber(iph->protocol)->p_name,
inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)),
@@ -324,8 +331,6 @@ static void tap4_handler(struct ctx *c, char *in, size_t len)
ntohs(th->dest));
}
- len -= (intptr_t)l4h - (intptr_t)eh;
-
if (iph->protocol == IPPROTO_TCP)
tcp_tap_handler(c, AF_INET, &iph->daddr, l4h, len);
else if (iph->protocol == IPPROTO_UDP)
@@ -346,33 +351,21 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
uint8_t proto;
char *l4h;
- if (ndp(c, len, eh))
+ if (len < sizeof(*eh) + sizeof(*ip6h))
+ return;
+
+ if (ndp(c, eh, len))
return;
l4h = ipv6_l4hdr(ip6h, &proto);
/* TODO: Assign MAC address to guest so that, together with prefix
- * assigned via NDP, address matches the one on the host. Then drop
- * address change and checksum recomputation.
+ * assigned via NDP, address matches the one from the host.
*/
c->addr6_guest = ip6h->saddr;
ip6h->saddr = c->addr6;
- if (proto == IPPROTO_TCP) {
- struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
-
- th->check = 0;
- th->check = csum_ip4(ip6h, len + sizeof(*ip6h));
- } else if (proto == IPPROTO_UDP) {
- struct udphdr *uh = (struct udphdr *)(ip6h + 1);
-
- uh->check = 0;
- uh->check = csum_ip4(ip6h, len + sizeof(*ip6h));
- } else if (proto == IPPROTO_ICMPV6) {
- struct icmp6hdr *ih = (struct icmp6hdr *)(ip6h + 1);
- ih->icmp6_cksum = 0;
- ih->icmp6_cksum = csum_ip4(ip6h, len + sizeof(*ip6h));
- }
+ len -= (intptr_t)l4h - (intptr_t)eh;
if (proto == IPPROTO_ICMPV6) {
fprintf(stderr, "icmpv6 from tap: %s ->\n\t%s\n",
@@ -382,6 +375,9 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
} else {
struct tcphdr *th = (struct tcphdr *)l4h;
+ if (len < sizeof(*th) && len < sizeof(struct udphdr))
+ return;
+
fprintf(stderr, "%s from tap: [%s]:%i\n"
"\t-> [%s]:%i\n",
getprotobynumber(proto)->p_name,
@@ -391,8 +387,6 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
ntohs(th->dest));
}
- len -= (intptr_t)l4h - (intptr_t)eh;
-
if (proto == IPPROTO_TCP)
tcp_tap_handler(c, AF_INET6, &ip6h->daddr, l4h, len);
else if (proto == IPPROTO_UDP)
@@ -400,19 +394,46 @@ static void tap6_handler(struct ctx *c, char *in, size_t len)
}
/**
- * tap_handler() - IPv4/IPv6/ARP packet handler for tap file descriptor
+ * tap_handler() - Packet handler for tap file descriptor
* @c: Execution context
- * @len: Total L2 packet length
- * @in: Packet buffer, L2 headers
+ *
+ * Return: -ECONNRESET if tap connection was lost, 0 otherwise
*/
-static void tap_handler(struct ctx *c, char *in, size_t len)
+static int tap_handler(struct ctx *c)
{
- struct ethhdr *eh = (struct ethhdr *)in;
+ char buf[ETH_MAX_MTU];
+ struct ethhdr *eh;
+ uint32_t vnet_len;
+ ssize_t n;
+
+ eh = (struct ethhdr *)buf;
- if (eh->h_proto == ntohs(ETH_P_IP) || eh->h_proto == ntohs(ETH_P_ARP))
- tap4_handler(c, in, len);
- else if (eh->h_proto == ntohs(ETH_P_IPV6))
- tap6_handler(c, in, len);
+ while ((n = recv(c->fd_unix, &vnet_len, 4, MSG_DONTWAIT)) == 4) {
+ n = recv(c->fd_unix, buf, ntohl(vnet_len), MSG_DONTWAIT);
+
+ if (n < (ssize_t)sizeof(*eh))
+ break;
+
+ switch (ntohs(eh->h_proto)) {
+ case ETH_P_IP:
+ case ETH_P_ARP:
+ tap4_handler(c, buf, n);
+ break;
+ case ETH_P_IPV6:
+ tap6_handler(c, buf, n);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+ return 0;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL);
+ close(c->fd_unix);
+
+ return -ECONNRESET;
}
/**
@@ -429,29 +450,30 @@ static void sock_handler(struct ctx *c, int fd, uint32_t events)
sl = sizeof(so);
if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &so, &sl) ||
- so == SOCK_STREAM)
+ so == SOCK_STREAM) {
+ fprintf(stderr, "TCP: packet from socket %i\n", fd);
tcp_sock_handler(c, fd, events);
- else if (so == SOCK_DGRAM)
+ }
+ else if (so == SOCK_DGRAM) {
udp_sock_handler(c, fd, events);
+ fprintf(stderr, "UDP: packet from socket %i\n", fd);
+ }
}
/**
- * periodic_handler() - Run periodic tasks for L4 protocol handlers
+ * timer_handler() - Run periodic tasks for L4 protocol handlers
* @c: Execution context
* @last: Timestamp of last run, updated on return
*/
-static void periodic_handler(struct ctx *c, struct timespec *last)
+static void timer_handler(struct ctx *c, struct timespec *last)
{
struct timespec tmp;
- int elapsed_ms;
clock_gettime(CLOCK_MONOTONIC, &tmp);
- elapsed_ms = timespec_diff_ms(&tmp, last);
+ if (timespec_diff_ms(&tmp, last) < TIMER_INTERVAL)
+ return;
- if (elapsed_ms >= PERIODIC_HANDLER_FAST)
- tcp_periodic_fast(c);
- if (elapsed_ms >= PERIODIC_HANDLER_SLOW)
- tcp_periodic_slow(c);
+ tcp_timer(c, &tmp);
*last = tmp;
}
@@ -481,10 +503,8 @@ int main(int argc, char **argv)
struct epoll_event events[EPOLL_EVENTS];
struct epoll_event ev = { 0 };
struct timespec last_time;
- char buf[ETH_MAX_MTU];
struct ctx c = { 0 };
- int nfds, i, len;
- int fd_unix;
+ int nfds, i, fd_unix;
if (argc != 1)
usage(argv[0]);
@@ -537,14 +557,14 @@ listen:
"./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio\n\n");
c.fd_unix = accept(fd_unix, NULL, NULL);
- ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
+ ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = c.fd_unix;
epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev);
clock_gettime(CLOCK_MONOTONIC, &last_time);
loop:
- nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, EPOLL_TIMEOUT);
+ nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL);
if (nfds == -1 && errno != EINTR) {
perror("epoll_wait");
exit(EXIT_FAILURE);
@@ -552,36 +572,16 @@ loop:
for (i = 0; i < nfds; i++) {
if (events[i].data.fd == c.fd_unix) {
- len = recv(events[i].data.fd, buf, sizeof(buf),
- MSG_DONTWAIT);
-
- if (len <= 0) {
- epoll_ctl(c.epollfd, EPOLL_CTL_DEL, c.fd_unix,
- &ev);
- close(c.fd_unix);
+ if (tap_handler(&c))
goto listen;
- }
-
- if (len == 0 || (len < 0 && errno == EINTR))
- continue;
-
- if (len < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- break;
- goto out;
- }
-
- tap_handler(&c, buf + 4, ntohl(*(uint32_t *)buf));
} else {
sock_handler(&c, events[i].data.fd, events[i].events);
}
}
- periodic_handler(&c, &last_time);
- clock_gettime(CLOCK_MONOTONIC, &last_time);
+ timer_handler(&c, &last_time);
goto loop;
-out:
return 0;
}
diff --git a/tcp.c b/tcp.c
index 46b739d..f1de9cf 100644
--- a/tcp.c
+++ b/tcp.c
@@ -130,7 +130,7 @@
*
* These states apply to connected sockets only, listening sockets are always
* open after initialisation, in LISTEN state. A single state is maintained for
- * both sides of the connection, and most states are omitted as they are already
+ * both sides of the connection, and some states are omitted as they are already
* handled by host kernel and guest.
*
* - CLOSED no connection
@@ -144,31 +144,32 @@
*
* - SOCK_SYN_SENT new connected socket, SYN sent to tap
* - SYN,ACK from tap ACK to tap > ESTABLISHED
- * - SYN,ACK timeout RST to tap, close socket > CLOSED
* - socket error RST to tap, close socket > CLOSED
+ * - SYN,ACK timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
* - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap
+ * - FIN from tap write shutdown > FIN_WAIT_1
* - ACK from tap > ESTABLISHED
- * - ACK timeout RST to tap, close socket > CLOSED
* - socket error RST to tap, close socket > CLOSED
+ * - ACK timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
* - ESTABLISHED connection established, ready for data
- * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN
- * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN
+ * - FIN from tap write shutdown > FIN_WAIT_1
+ * - zero-sized socket read read shutdown, FIN to tap > ESTABLISHED_SOCK_FIN
* - socket error RST to tap, close socket > CLOSED
- * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1
+ * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN
* - RST from tap close socket > CLOSED
*
- * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed
+ * - ESTABLISHED_SOCK_FIN socket closing connection, FIN sent to tap
* - ACK from tap > CLOSE_WAIT
* - ACK timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
- * - CLOSE_WAIT socket wants to close connection, seen by tap
+ * - CLOSE_WAIT socket closing connection, ACK from tap
+ * - FIN from tap write shutdown > LAST_ACK
* - socket error RST to tap, close socket > CLOSED
- * - FIN from tap ACK to tap, close socket > LAST_ACK
* - FIN timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
@@ -176,12 +177,19 @@
* - anything from socket close socket > CLOSED
* - socket error RST to tap, close socket > CLOSED
* - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
*
- * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_
- * - ACK from tap close socket > CLOSED
+ * - FIN_WAIT_1 tap closing connection, FIN sent to socket
+ * - zero-sized socket read FIN,ACK to tap, shutdown > FIN_WAIT_1_SOCK_FIN
* - socket error RST to tap, close socket > CLOSED
* - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
*
+ * - FIN_WAIT_1_SOCK_FIN tap closing connection, FIN received from socket
+ * - ACK from tap close socket > CLOSED
+ * - socket error RST to tap, close socket > CLOSED
+ * - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
*
* Connection setup
* ----------------
@@ -198,34 +206,33 @@
* Aging and timeout
* -----------------
*
- * Two bitmaps of TCP_MAX_CONNS bits indicate which connections need scheduled
- * actions:
- * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports
- * an increased number of acknowledged bytes sent on a socket, and examined
- * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked
- * connection, a TCP_INFO query is performed and ACK segments are sent right
- * away as needed
- * - @tcp_act_slow is used for state and retransmission timeouts, and examined
- * every 2s: for each marked connection with an expired @timeout timestamp
- * specific actions are taken depending on the connection state:
- * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
- * from tap expires, connection is reset (RST to tap, socket closed)
- * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
- * tap expires, connection is reset (RST to tap, socket closed)
- * - ESTABLISHED: after a timeout of 1s (TODO: implement requirements from
- * RFC 6298) waiting for an ACK segment from tap expires, data from socket
- * queue is retransmitted starting from the last ACK sequence
- * - ESTABLISHED: after a two hours (current TCP_KEEPALIVE_TIME on Linux)
- * timeout waiting for any activity expires, connection is reset (RST to
- * tap, socket closed)
- * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
- * segment from tap expires, connection is reset (RST to tap, socket closed)
- * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from
- * tap expires, connection is reset (RST to tap, socket closed)
- * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
- * socket expires, connection is reset (RST to tap, socket closed)
- * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
- * tap expires, connection is reset (RST to tap, socket closed)
+ * A bitmap of TCP_MAX_CONNS bits indicates the connections subject to timed
+ * events based on states:
+ * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
+ * from tap expires, connection is reset (RST to tap, socket closed)
+ * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * tap expires, connection is reset (RST to tap, socket closed)
+ * - TAP_SYN_SENT: connect() is pending, timeout is handled implicitly by
+ * connect() timeout, connection will be reset in that case
+ * - ESTABLISHED, ESTABLISHED_SOCK_FIN: if an ACK segment to tap is pending,
+ * bytes acknowledged by socket endpoint are checked every 50ms (one quarter
+ * of current TCP_DELACK_MAX on Linux)
+ * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a timeout of 3s (TODO: implement
+ * requirements from RFC 6298) waiting for an ACK segment from tap expires,
+ * data from socket queue is retransmitted starting from the last ACK sequence
+ * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a two-hour (current
+ * TCP_KEEPALIVE_TIME on Linux) timeout waiting for any activity expires,
+ * connection is reset (RST to tap, socket closed)
+ * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
+ * segment from tap expires, connection is reset (RST to tap, socket closed)
+ * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from tap
+ * expires, connection is reset (RST to tap, socket closed)
+ * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * socket expires, connection is reset (RST to tap, socket closed)
+ * - FIN_WAIT_1_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK segment
+ * from tap expires, connection is reset (RST to tap, socket closed)
+ * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * socket expires, connection is reset (RST to tap, socket closed)
*
*
* Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states)
@@ -253,6 +260,7 @@
* - on read error, send RST to tap, close socket
* - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN
* - on ACK from tap:
+ * - set @ts_ack_tap
* - check if it's the second duplicated ACK
* - consume buffer by difference between new ack_seq and @seq_ack_from_tap
* - update @seq_ack_from_tap from ack_seq in header
@@ -263,11 +271,12 @@
* - periodically:
* - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer
* (TODO: implement requirements from RFC 6298, currently 3s fixed) from
- * @last_ts_to_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
+ * @ts_sock elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
* resend data with the steps listed above
*
* - from tap to socket:
* - on packet from tap:
+ * - set @ts_tap
* - set TCP_WINDOW_CLAMP from TCP header from tap
* - check seq from header against @seq_from_tap, if data is missing, send
* two ACKs with number @seq_ack_to_tap, discard packet
@@ -277,15 +286,11 @@
* set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
* to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
* send ACK to tap
- * - set @last_ts_sock
- * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap
* - periodically:
- * - if socket is marked in bitmap, query socket for TCP_INFO, on
- * tcpi_bytes_acked > @tcpi_acked_last,
+ * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last,
* set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
* to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
* send ACK to tap
- * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap
*/
#define _GNU_SOURCE
@@ -321,22 +326,17 @@
#define SYN_TIMEOUT 240000 /* ms */
#define ACK_TIMEOUT 3000
+#define ACK_INTERVAL 50
#define ACT_TIMEOUT 7200000
#define FIN_TIMEOUT 240000
#define LAST_ACK_TIMEOUT 240000
-#define SOCK_ACK_INTERVAL 20
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
* <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
*/
#define SOL_TCP IPPROTO_TCP
-static char tcp_in_buf[MAX_WINDOW];
-
-static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 };
-static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 };
-
enum tcp_state {
CLOSED = 0,
TAP_SYN_SENT,
@@ -347,6 +347,13 @@ enum tcp_state {
CLOSE_WAIT,
LAST_ACK,
FIN_WAIT_1,
+ FIN_WAIT_1_SOCK_FIN,
+};
+
+static char *tcp_state_str[FIN_WAIT_1_SOCK_FIN + 1] = {
+ "CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD",
+ "ESTABLISHED", "ESTABLISHED_SOCK_FIN", "CLOSE_WAIT", "LAST_ACK",
+ "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN",
};
#define FIN (1 << 0)
@@ -357,7 +364,9 @@ enum tcp_state {
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
+#define OPT_MSS_LEN 4
#define OPT_WS 3
+#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
@@ -381,8 +390,9 @@ enum tcp_state {
* @ws_allowed: Window scaling allowed
* @ws: Window scaling factor
* @tap_window: Last window size received from tap, scaled
- * @last_ts_sock: Last activity timestamp from socket for timeout purposes
- * @last_ts_tap: Last activity timestamp from tap for timeout purposes
+ * @ts_sock: Last activity timestamp from socket for timeout purposes
+ * @ts_tap: Last activity timestamp from tap for timeout purposes
+ * @ts_ack_tap: Last ACK segment timestamp from tap for timeout purposes
* @mss_guest: Maximum segment size advertised by guest
*/
struct tcp_conn {
@@ -410,106 +420,101 @@ struct tcp_conn {
int ws;
int tap_window;
- struct timespec last_ts_sock;
- struct timespec last_ts_tap;
+ struct timespec ts_sock;
+ struct timespec ts_tap;
+ struct timespec ts_ack_tap;
int mss_guest;
};
+static char sock_buf[MAX_WINDOW];
+static uint8_t tcp_act[MAX_CONNS / 8] = { 0 };
static struct tcp_conn tc[MAX_CONNS];
static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len);
/**
- * tcp_act_fast_set() - Set socket in bitmap for "fast" timeout events
+ * tcp_act_set() - Set socket in bitmap for timed events
* @s: Socket file descriptor number
*/
-static void tcp_act_fast_set(int s)
+static void tcp_act_set(int s)
{
- tcp_act_fast[s / 8] |= 1 << (s % 8);
+ tcp_act[s / 8] |= 1 << (s % 8);
}
/**
- * tcp_act_fast_clear() - Clear socket from bitmap for "fast" timeout events
+ * tcp_act_clear() - Clear socket from bitmap for timed events
* @s: Socket file descriptor number
*/
-static void tcp_act_fast_clear(int s)
+static void tcp_act_clear(int s)
{
- tcp_act_fast[s / 8] &= ~(1 << (s % 8));
+ tcp_act[s / 8] &= ~(1 << (s % 8));
}
/**
- * tcp_act_slow_set() - Set socket in bitmap for "slow" timeout events
+ * tcp_set_state() - Set given TCP state for socket, report change to stderr
* @s: Socket file descriptor number
+ * @state: New TCP state to be set
*/
-static void tcp_act_slow_set(int s)
+static void tcp_set_state(int s, enum tcp_state state)
{
- tcp_act_slow[s / 8] |= 1 << (s % 8);
-}
-
-/**
- * tcp_act_slow_clear() - Clear socket from bitmap for "slow" timeout events
- * @s: Socket file descriptor number
- */
-static void tcp_act_slow_clear(int s)
-{
- tcp_act_slow[s / 8] &= ~(1 << (s % 8));
+ fprintf(stderr, "TCP: socket %i: %s -> %s\n", s,
+ tcp_state_str[tc[s].s], tcp_state_str[state]);
+ tc[s].s = state;
}
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
* @th: Pointer to TCP header
* @len: Length of buffer, including TCP header
- * @type: Option type to look for
- * @optlen: Optional, filled with option length if passed
- * @value: Optional, set to start of option value if passed
+ * @__type: Option type to look for
+ * @__optlen: Optional, filled with option length if passed
+ * @__value: Optional, set to start of option value if passed
*
* Return: Option value, meaningful for up to 4 bytes, -1 if not found
*/
-static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type,
- uint8_t *optlen, void *value)
+static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type,
+ uint8_t *__optlen, char **__value)
{
- uint8_t *p, __type, __optlen;
+ uint8_t type, optlen;
+ char *p;
- len -= sizeof(*th);
- p = (uint8_t *)(th + 1);
+ if (len > th->doff * 4)
+ len = th->doff * 4;
- if (len > th->doff * 4 - sizeof(*th))
- len = th->doff * 4 - sizeof(*th);
+ len -= sizeof(*th);
+ p = (char *)(th + 1);
- while (len >= 2) {
+ for (; len >= 2; p += optlen, len -= optlen) {
switch (*p) {
case OPT_EOL:
return -1;
case OPT_NOP:
- p++;
- len--;
+ optlen = 1;
break;
default:
- __type = *(p++);
- __optlen = *(p++);
+ type = *(p++);
+ optlen = *(p++) - 2;
len -= 2;
- if (type == __type) {
- if (optlen)
- *optlen = __optlen;
- if (value)
- value = p;
-
- if (__optlen - 2 == 0)
- return 0;
-
- if (__optlen - 2 == 1)
- return *p;
-
- if (__optlen - 2 == 2)
- return ntohs(*(uint16_t *)p);
-
+ if (type != __type)
+ break;
+
+ if (__optlen)
+ *__optlen = optlen;
+ if (__value)
+ *__value = p;
+
+ switch (optlen) {
+ case 0:
+ return 0;
+ case 1:
+ return *p;
+ case 2:
+ return ntohs(*(uint16_t *)p);
+ default:
return ntohl(*(uint32_t *)p);
}
-
- p += __optlen - 2;
- len -= __optlen - 2;
}
}
@@ -524,9 +529,9 @@ static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type,
static void tcp_close_and_epoll_del(struct ctx *c, int s)
{
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
+ tcp_set_state(s, CLOSED);
close(s);
- tcp_act_fast_clear(s);
- tcp_act_slow_clear(s);
+ tcp_act_clear(s);
}
/**
@@ -541,7 +546,7 @@ static void tcp_rst(struct ctx *c, int s)
tcp_send_to_tap(c, s, RST, NULL, 0);
tcp_close_and_epoll_del(c, s);
- tc[s].s = CLOSED;
+ tcp_set_state(s, CLOSED);
}
/**
@@ -549,76 +554,70 @@ static void tcp_rst(struct ctx *c, int s)
* @c: Execution context
* @s: File descriptor number for socket
* @flags: TCP flags to set
- * @in: Input buffer, L4 header
- * @len: Buffer length, at L4
+ * @in: Payload buffer
+ * @len: Payload length
*
- * Return: -1 on error with connection reset, 0 otherwise
+ * Return: negative error code on connection reset, 0 otherwise
*/
static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
{
char buf[USHRT_MAX] = { 0 }, *data;
struct tcp_info info = { 0 };
socklen_t sl = sizeof(info);
- int ws = 0, have_info = 1;
struct tcphdr *th;
+ int ws = 0, err;
- if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) {
- if (!(flags & RST)) {
- tcp_rst(c, s);
- return -1;
- }
-
- have_info = 0;
+ if ((err = getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) &&
+ !(flags & RST)) {
+ tcp_rst(c, s);
+ return err;
}
th = (struct tcphdr *)buf;
data = (char *)(th + 1);
+ th->doff = sizeof(*th) / 4;
- if (flags & SYN && have_info) {
- if (tc[s].ws_allowed)
- ws = info.tcpi_snd_wscale;
-
+ if ((flags & SYN) && !err) {
/* Options: MSS, NOP and window scale if allowed (4-8 bytes) */
- *data++ = 2;
- *data++ = 4;
+ *data++ = OPT_MSS;
+ *data++ = OPT_MSS_LEN;
*(uint16_t *)data = htons(info.tcpi_snd_mss);
- data += 2;
+ data += OPT_MSS_LEN - 2;
+ th->doff += OPT_MSS_LEN / 4;
- if (ws) {
- *data++ = 1;
+ if (tc[s].ws_allowed && (ws = info.tcpi_snd_wscale)) {
+ *data++ = OPT_NOP;
- *data++ = 3;
- *data++ = 3;
- *data++ = ws;
+ *data++ = OPT_WS;
+ *data++ = OPT_WS_LEN;
+ *data = ws;
+ *data += OPT_WS_LEN - 2;
- th->doff = (20 + 8) / 4;
- } else {
- th->doff = (20 + 4) / 4;
+ th->doff += (1 + OPT_WS_LEN) / 4;
}
+ /* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */
th->seq = htonl(tc[s].seq_to_tap++);
} else {
- th->doff = 20 / 4;
-
th->seq = htonl(tc[s].seq_to_tap);
tc[s].seq_to_tap += len;
}
- if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) ||
- len) &&
- have_info) {
+ if (!err && ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last) ||
+ (flags & ACK) || len)) {
uint64_t ack_seq;
th->ack = 1;
- /* info.tcpi_bytes_acked already includes one byte for SYN, but
- * not for incoming connections.
- */
+
ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap;
- if (!info.tcpi_bytes_acked)
- ack_seq++;
- ack_seq &= (uint32_t)~0U;
- tc[s].seq_ack_to_tap = ack_seq;
+ tc[s].seq_ack_to_tap = ack_seq & (uint32_t)~0U;
+
+ if (tc[s].s == LAST_ACK) {
+ tc[s].seq_ack_to_tap = tc[s].seq_from_tap + 1;
+ th->seq = htonl(ntohl(th->seq) + 1);
+ }
+
th->ack_seq = htonl(tc[s].seq_ack_to_tap);
tc[s].tcpi_acked_last = info.tcpi_bytes_acked;
@@ -636,7 +635,7 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
th->source = tc[s].sock_port;
th->dest = tc[s].tap_port;
- if (have_info)
+ if (!err)
th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale);
else
th->window = WINDOW_DEFAULT;
@@ -656,23 +655,18 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
* @s: File descriptor number for socket
* @th: TCP header, from tap
* @len: Buffer length, at L4
+ * @init: Set if this is the very first segment from tap
*/
-static void tcp_clamp_window(int s, struct tcphdr *th, int len)
+static void tcp_clamp_window(int s, struct tcphdr *th, int len, int init)
{
- int ws;
-
- if (!tc[s].tap_window) {
- ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
- if (ws >= 0 && ws <= MAX_WS) {
- tc[s].ws_allowed = 1;
- tc[s].ws = ws;
- } else {
- tc[s].ws_allowed = 0;
- tc[s].ws = 0;
- }
-
- /* First value is not scaled. Also, don't clamp yet, to avoid
- * getting a zero scale just because we set a small window now.
+ if (init) {
+ tc[s].ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+ tc[s].ws_allowed = tc[s].ws >= 0 && tc[s].ws <= MAX_WS;
+ tc[s].ws *= tc[s].ws_allowed;
+
+ /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
+ * yet, to avoid getting a zero scale just because we set a
+ * small window now.
*/
tc[s].tap_window = ntohs(th->window);
} else {
@@ -718,25 +712,31 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
sl = sizeof(tc[s].mss_guest);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl);
- tcp_clamp_window(s, th, len);
+ tcp_clamp_window(s, th, len, 1);
if (af == AF_INET) {
- sa = (const struct sockaddr *)&addr4;
+ sa = (struct sockaddr *)&addr4;
sl = sizeof(addr4);
- memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
- memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
- memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a));
+ memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
+ memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
+ memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a));
} else {
- sa = (const struct sockaddr *)&addr6;
+ sa = (struct sockaddr *)&addr6;
sl = sizeof(addr6);
- memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6));
+ memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6));
}
tc[s].sock_port = th->dest;
tc[s].tap_port = th->source;
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap);
+
+ tcp_act_set(s);
+
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = s;
@@ -745,7 +745,8 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
/* TODO: RFC 6528 with SipHash, worth it? */
- tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0;
+ tc[s].seq_to_tap = 0;
+ tc[s].seq_ack_from_tap = tc[s].seq_to_tap;
if (connect(s, sa, sl)) {
if (errno != EINPROGRESS) {
@@ -754,17 +755,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
}
ev.events |= EPOLLOUT;
- tc[s].s = TAP_SYN_SENT;
+ tcp_set_state(s, TAP_SYN_SENT);
} else {
if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
return;
- tc[s].s = TAP_SYN_RCVD;
+ tcp_set_state(s, TAP_SYN_RCVD);
}
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
-
- return;
}
/**
@@ -773,7 +772,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
* @tap_port: tap-facing port
* @sock_port: Socket-facing port
*
- * Return: file descriptor number for socket, if found, -1 otherwise
+ * Return: file descriptor number for socket, if found, -ENOENT otherwise
*/
static int tcp_sock_lookup(int af, void *addr,
in_port_t tap_port, in_port_t sock_port)
@@ -797,7 +796,7 @@ static int tcp_sock_lookup(int af, void *addr,
return i;
}
- return -1;
+ return -ENOENT;
}
/**
@@ -808,10 +807,8 @@ static int tcp_sock_lookup(int af, void *addr,
static void tcp_conn_from_sock(struct ctx *c, int fd)
{
struct sockaddr_storage sa_r, sa_l;
- socklen_t sa_len = sizeof(sa_r);
+ socklen_t sa_len = sizeof(sa_l);
struct epoll_event ev = { 0 };
- struct sockaddr_in6 *sa6;
- struct sockaddr_in *sa4;
int s;
if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len))
@@ -822,41 +819,41 @@ static void tcp_conn_from_sock(struct ctx *c, int fd)
return;
if (sa_l.ss_family == AF_INET) {
- sa4 = (struct sockaddr_in *)&sa_r;
+ struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r;
memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a));
tc[s].sock_port = sa4->sin_port;
-
- sa4 = (struct sockaddr_in *)&sa_l;
- tc[s].tap_port = sa4->sin_port;
-
+ tc[s].tap_port = ((struct sockaddr_in *)&sa_l)->sin_port;
} else if (sa_l.ss_family == AF_INET6) {
- sa6 = (struct sockaddr_in6 *)&sa_r;
+ struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa_r;
memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6));
tc[s].sock_port = sa6->sin6_port;
-
- sa6 = (struct sockaddr_in6 *)&sa_l;
- tc[s].tap_port = sa6->sin6_port;
+ tc[s].tap_port = ((struct sockaddr_in6 *)&sa_l)->sin6_port;
}
/* TODO: RFC 6528 with SipHash, worth it? */
tc[s].seq_to_tap = 0;
+ tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1;
+ tc[s].tap_window = WINDOW_DEFAULT;
tc[s].ws_allowed = 1;
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap);
+
+ tcp_act_set(s);
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = s;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
- tc[s].s = SOCK_SYN_SENT;
+ tcp_set_state(s, SOCK_SYN_SENT);
tcp_send_to_tap(c, s, SYN, NULL, 0);
}
@@ -864,14 +861,13 @@ static void tcp_conn_from_sock(struct ctx *c, int fd)
- * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence
+ * tcp_send_to_sock() - Send buffer to socket, update sequence
* @c: Execution context
* @s: File descriptor number for socket
- * @seq: Previous TCP sequence, host order
* @data: Data buffer
* @len: Length at L4
* @extra_flags: Additional flags for send(), if any
*
- * Return: -1 on socket error with connection reset, 0 otherwise
+ * Return: negative on socket error with connection reset, 0 otherwise
*/
-static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len,
+static int tcp_send_to_sock(struct ctx *c, int s, char *data, int len,
int extra_flags)
{
int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags);
@@ -884,28 +880,28 @@ static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len,
return 0;
}
+ err = errno;
tcp_rst(c, s);
- return -1;
+ return -err;
}
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
- tc[s].seq_from_tap = seq + len;
+ tc[s].seq_from_tap += len;
return 0;
}
/**
- * tcp_check_dupack() - Check if given ACK number is duplicated, update counter
+ * tcp_is_dupack() - Check if given ACK number is duplicated, update counter
* @s: File descriptor number for socket
* @ack_seq: ACK sequence, host order
*
- * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise
+ * Return: -EAGAIN on duplicated ACKs observed, with counter reset, 0 otherwise
*/
-static int tcp_check_dupack(int s, uint32_t ack_seq)
+static int tcp_is_dupack(int s, uint32_t ack_seq)
{
if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) {
tc[s].dup_acks = 0;
- return 1;
+ return -EAGAIN;
}
return 0;
@@ -916,7 +912,7 @@ static int tcp_check_dupack(int s, uint32_t ack_seq)
* @s: File descriptor number for socket
* @ack_seq: ACK sequence, host order
*
- * Return: -1 on invalid sequence, 0 otherwise
+ * Return: negative on invalid sequence, 0 otherwise
*/
static int tcp_sock_consume(int s, uint32_t ack_seq)
{
@@ -926,7 +922,7 @@ static int tcp_sock_consume(int s, uint32_t ack_seq)
to_ack = ack_seq - tc[s].seq_ack_from_tap;
if (to_ack < 0)
- return -1;
+ return -EIO;
recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC);
tc[s].seq_ack_from_tap = ack_seq;
@@ -939,27 +935,29 @@ static int tcp_sock_consume(int s, uint32_t ack_seq)
* @c: Execution context
* @s: File descriptor number for socket
*
- * Return: non-zero on socket error or pending data, 0 otherwise
+ * Return: negative on connection reset, 1 on pending data, 0 otherwise
*/
static int tcp_data_from_sock(struct ctx *c, int s)
{
- int len, offset, left, send;
+ int len, err, offset, left, send;
/* Don't dequeue until acknowledged by guest */
- len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), MSG_DONTWAIT | MSG_PEEK);
+ len = recv(s, sock_buf, sizeof(sock_buf), MSG_DONTWAIT | MSG_PEEK);
if (len < 0) {
- if (errno != EAGAIN && errno != EWOULDBLOCK)
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
tcp_rst(c, s);
- return 1;
+ return -errno;
+ }
+ return 0;
}
if (len == 0) {
if (tc[s].s >= ESTABLISHED_SOCK_FIN)
return 0;
- tc[s].s = ESTABLISHED_SOCK_FIN;
- if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0))
- return 0;
+ tcp_set_state(s, ESTABLISHED_SOCK_FIN);
+ if ((err = tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)))
+ return err;
left = 0;
goto out;
@@ -973,16 +971,15 @@ static int tcp_data_from_sock(struct ctx *c, int s)
else
send = tc[s].mss_guest;
- if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, send))
- return 0;
+ if ((err = tcp_send_to_tap(c, s, 0, sock_buf + offset, send)))
+ return err;
offset += send;
left -= send;
}
out:
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
- tcp_act_slow_set(s);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock);
return !!left;
}
@@ -997,7 +994,7 @@ out:
void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
{
struct tcphdr *th = (struct tcphdr *)in;
- size_t off;
+ size_t off, skip = 0;
int s, ws;
if (len < sizeof(*th))
@@ -1007,9 +1004,7 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
if (off < sizeof(*th) || off > len)
return;
- s = tcp_sock_lookup(af, addr, th->source, th->dest);
-
- if (s < 0) {
+ if ((s = tcp_sock_lookup(af, addr, th->source, th->dest)) < 0) {
if (th->syn)
tcp_conn_from_tap(c, af, addr, th, len);
return;
@@ -1020,15 +1015,19 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
return;
}
- tcp_clamp_window(s, th, len);
+ tcp_clamp_window(s, th, len, th->syn && th->ack);
- if (th->ack)
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap);
+
+ if (ntohl(th->seq) < tc[s].seq_from_tap)
+ skip = tc[s].seq_from_tap - ntohl(th->seq);
switch (tc[s].s) {
case SOCK_SYN_SENT:
- if (!th->syn || !th->ack)
+ if (!th->syn || !th->ack) {
+ tcp_rst(c, s);
return;
+ }
tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
if (tc[s].mss_guest < 0)
@@ -1045,19 +1044,20 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
return;
}
- tc[s].seq_from_tap = tc[s].seq_init_from_tap = ntohl(th->seq);
+ /* info.tcpi_bytes_acked already includes one byte for SYN, but
+ * not for incoming connections.
+ */
+ tc[s].seq_init_from_tap = ntohl(th->seq) + 1;
+ tc[s].seq_from_tap = tc[s].seq_init_from_tap;
tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
- tc[s].s = ESTABLISHED;
+ tcp_set_state(s, ESTABLISHED);
tcp_send_to_tap(c, s, ACK, NULL, 0);
break;
- case TAP_SYN_SENT:
- break;
case TAP_SYN_RCVD:
if (th->fin) {
shutdown(s, SHUT_WR);
- tc[s].s = FIN_WAIT_1;
-
+ tcp_set_state(s, FIN_WAIT_1);
break;
}
@@ -1066,83 +1066,81 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
return;
}
- tc[s].seq_ack_from_tap = ntohl(th->ack_seq);
-
- tc[s].s = ESTABLISHED;
+ tcp_set_state(s, ESTABLISHED);
break;
case ESTABLISHED:
+ case ESTABLISHED_SOCK_FIN:
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap);
+
+ if (ntohl(th->seq) > tc[s].seq_from_tap) {
+ tc[s].seq_from_tap = tc[s].seq_ack_to_tap;
+ tcp_send_to_tap(c, s, ACK, NULL, 0);
+ break;
+ }
+
if (th->ack) {
int retrans = 0;
- if (len == th->doff)
- retrans = tcp_check_dupack(s, th->ack_seq);
+ if (len == off)
+ retrans = tcp_is_dupack(s, ntohl(th->ack_seq));
if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
tcp_rst(c, s);
return;
}
- if (retrans) {
+ tc[s].seq_ack_from_tap = ntohl(th->ack_seq);
+
+ if (retrans)
tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
- tcp_data_from_sock(c, s);
+
+ if (tc[s].s == ESTABLISHED_SOCK_FIN) {
+ if (!tcp_data_from_sock(c, s))
+ tcp_set_state(s, CLOSE_WAIT);
}
}
- if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off,
+ if (skip < len - off &&
+ tcp_send_to_sock(c, s, in + off + skip, len - off - skip,
th->psh ? 0 : MSG_MORE))
break;
if (th->fin) {
shutdown(s, SHUT_WR);
- tc[s].s = FIN_WAIT_1;
+ if (tc[s].s == ESTABLISHED)
+ tcp_set_state(s, FIN_WAIT_1);
+ else
+ tcp_set_state(s, LAST_ACK);
}
break;
- case ESTABLISHED_SOCK_FIN:
- if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off,
- th->psh ? 0 : MSG_MORE) < 0)
- break;
-
- if (th->ack) {
- shutdown(s, SHUT_RD);
- if (!tcp_data_from_sock(c, s))
- tc[s].s = CLOSE_WAIT;
-
- if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
- tcp_rst(c, s);
- return;
- }
- }
-
- break;
-
case CLOSE_WAIT:
if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
tcp_rst(c, s);
return;
}
+ if (skip < len - off &&
+ tcp_send_to_sock(c, s, in + off + skip, len - off - skip,
+ th->psh ? 0 : MSG_MORE))
+ break;
+
if (th->fin) {
shutdown(s, SHUT_WR);
- tc[s].s = LAST_ACK;
+ tcp_set_state(s, LAST_ACK);
}
break;
+ case FIN_WAIT_1_SOCK_FIN:
+ if (th->ack)
+ tcp_close_and_epoll_del(c, s);
+ break;
case FIN_WAIT_1:
+ case TAP_SYN_SENT:
case LAST_ACK:
case CLOSED: /* ;) */
break;
}
-
- if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap)
- tcp_act_slow_set(s);
- else
- tcp_act_slow_clear(s);
-
- if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap)
- tcp_act_fast_set(s);
- else
- tcp_act_fast_clear(s);
}
/**
@@ -1162,14 +1160,15 @@ static void tcp_connect_finish(struct ctx *c, int s)
return;
}
- if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) < 0)
+ if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
return;
+ /* Drop EPOLLOUT, only used to wait for connect() to complete */
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = s;
epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev);
- tc[s].s = TAP_SYN_RCVD;
+ tcp_set_state(s, TAP_SYN_RCVD);
}
/**
@@ -1184,6 +1183,7 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events)
int so;
if (tc[s].s == LAST_ACK) {
+ tcp_send_to_tap(c, s, ACK, NULL, 0);
tcp_close_and_epoll_del(c, s);
return;
}
@@ -1210,21 +1210,21 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events)
tcp_data_from_sock(c, s);
if (events & EPOLLRDHUP || events & EPOLLHUP) {
- if (tc[s].s == ESTABLISHED)
- tc[s].s = ESTABLISHED_SOCK_FIN;
-
- tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
-
- if (tc[s].s == FIN_WAIT_1) {
+ if (tc[s].s == ESTABLISHED) {
+ tcp_set_state(s, ESTABLISHED_SOCK_FIN);
+ shutdown(s, SHUT_RD);
+ tcp_data_from_sock(c, s);
+ tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
+ } else if (tc[s].s == FIN_WAIT_1) {
+ tcp_set_state(s, FIN_WAIT_1_SOCK_FIN);
shutdown(s, SHUT_RD);
+ tcp_data_from_sock(c, s);
+ tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
- if (tcp_sock_consume(s, ntohl(tc[s].seq_ack_from_tap))) {
+ if (tcp_sock_consume(s, tc[s].seq_ack_from_tap)) {
tcp_rst(c, s);
return;
}
-
- tcp_close_and_epoll_del(c, s);
- tc[s].s = CLOSED;
}
}
}
@@ -1240,9 +1240,9 @@ int tcp_sock_init(struct ctx *c)
in_port_t port;
for (port = 0; port < (1 << 15) + (1 << 14); port++) {
- if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0)
+ if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0)
return -1;
- if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0)
+ if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, port) < 0)
return -1;
}
@@ -1250,118 +1250,92 @@ int tcp_sock_init(struct ctx *c)
}
/**
- * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket
+ * tcp_timer_one() - Handler for timed events on one socket
* @c: Execution context
* @s: File descriptor number for socket
* @ts: Timestamp from caller
- *
- * Return: 0 if socket needs to be monitored further, non-zero otherwise
- */
-int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts)
-{
- if (timespec_diff_ms(ts, &tc[s].last_ts_sock) < SOCK_ACK_INTERVAL)
- return 0;
-
- tc[s].last_ts_sock = *ts;
-
- tcp_send_to_tap(c, s, 0, NULL, 0);
-
- return tc[s].seq_from_tap == tc[s].seq_ack_to_tap;
-}
-
-/**
- * tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed
- * @c: Execution context
*/
-void tcp_periodic_fast(struct ctx *c)
+static void tcp_timer_one(struct ctx *c, int s, struct timespec *ts)
{
- long *word = (long *)tcp_act_fast, tmp;
- struct timespec now;
- unsigned int i;
- int n, s;
-
- clock_gettime(CLOCK_MONOTONIC, &now);
-
- for (i = 0; i < sizeof(tcp_act_fast) / sizeof(long); i++, word++) {
- tmp = *word;
- while ((n = ffsl(tmp))) {
- tmp &= ~(1UL << (n - 1));
+ int ack_tap_ms = timespec_diff_ms(ts, &tc[s].ts_ack_tap);
+ int sock_ms = timespec_diff_ms(ts, &tc[s].ts_tap);
+ int tap_ms = timespec_diff_ms(ts, &tc[s].ts_tap);
- s = i * sizeof(long) * 8 + n - 1;
-
- if (tcp_periodic_fast_one(c, s, &now))
- *word &= ~(1UL << (n - 1));
- }
- }
-}
-
-/**
- * tcp_periodic_fast_one() - Handler for "slow" timeout events on one socket
- * @c: Execution context
- * @s: File descriptor number for socket
- * @ts: Timestamp from caller
- */
-void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts)
-{
switch (tc[s].s) {
case SOCK_SYN_SENT:
- case TAP_SYN_SENT:
case TAP_SYN_RCVD:
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT)
+ if (ack_tap_ms > SYN_TIMEOUT)
tcp_rst(c, s);
+
break;
case ESTABLISHED_SOCK_FIN:
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) {
+ if (ack_tap_ms > FIN_TIMEOUT) {
tcp_rst(c, s);
break;
}
/* Falls through */
case ESTABLISHED:
- if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap &&
- timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) {
- tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
- tcp_data_from_sock(c, s);
+ if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT)
+ tcp_rst(c, s);
+
+ if (tc[s].seq_to_tap == tc[s].seq_ack_from_tap &&
+ tc[s].seq_from_tap == tc[s].seq_ack_to_tap) {
+ tc[s].ts_sock = *ts;
+ break;
}
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT &&
- timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT)
- tcp_rst(c, s);
+ if (sock_ms > ACK_INTERVAL) {
+ if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap)
+ tcp_send_to_tap(c, s, 0, NULL, 0);
+ }
+
+ if (ack_tap_ms > ACK_TIMEOUT) {
+ if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap) {
+ tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
+ tc[s].ts_ack_tap = *ts;
+ tcp_data_from_sock(c, s);
+ }
+ }
+
+ if (tc[s].seq_from_tap == tc[s].seq_ack_to_tap)
+ tc[s].ts_sock = *ts;
break;
case CLOSE_WAIT:
case FIN_WAIT_1:
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT)
+ if (sock_ms > FIN_TIMEOUT)
+ tcp_rst(c, s);
+ break;
+ case FIN_WAIT_1_SOCK_FIN:
+ if (ack_tap_ms > FIN_TIMEOUT)
tcp_rst(c, s);
break;
case LAST_ACK:
- if (timespec_diff_ms(ts, &tc[s].last_ts_sock) >
- LAST_ACK_TIMEOUT)
+ if (sock_ms > LAST_ACK_TIMEOUT)
tcp_rst(c, s);
break;
+ case TAP_SYN_SENT:
case CLOSED:
break;
}
}
/**
- * tcp_periodic_slow() - Handle sockets in "slow" event bitmap
+ * tcp_timer() - Scan activity bitmap for sockets waiting for timed events
* @c: Execution context
+ * @ts: Timestamp from caller
*/
-void tcp_periodic_slow(struct ctx *c)
+void tcp_timer(struct ctx *c, struct timespec *ts)
{
- long *word = (long *)tcp_act_slow, tmp;
- struct timespec now;
+ long *word = (long *)tcp_act, tmp;
unsigned int i;
int n;
- clock_gettime(CLOCK_MONOTONIC, &now);
-
- for (i = 0; i < sizeof(tcp_act_slow) / sizeof(long); i++, word++) {
+ for (i = 0; i < sizeof(tcp_act) / sizeof(long); i++, word++) {
tmp = *word;
while ((n = ffsl(tmp))) {
tmp &= ~(1UL << (n - 1));
- tcp_periodic_slow_one(c, i * sizeof(long) * 8 + n - 1,
- &now);
+ tcp_timer_one(c, i * sizeof(long) * 8 + n - 1, ts);
}
}
}
diff --git a/tcp.h b/tcp.h
index 1f16790..9fa8244 100644
--- a/tcp.h
+++ b/tcp.h
@@ -1,5 +1,4 @@
void tcp_sock_handler(struct ctx *c, int s, uint32_t events);
void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len);
int tcp_sock_init(struct ctx *c);
-void tcp_periodic_fast(struct ctx *c);
-void tcp_periodic_slow(struct ctx *c);
+void tcp_timer(struct ctx *c, struct timespec *ts);
diff --git a/udp.c b/udp.c
index 74ce843..4acd48e 100644
--- a/udp.c
+++ b/udp.c
@@ -124,8 +124,6 @@ void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
if (!(s = udp4_sock_port[ntohs(uh->source)]))
return;
- fprintf(stderr, "udp from tap: using socket %i\n", s);
-
sa.sin_addr = *(struct in_addr *)addr;
sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT,
@@ -140,15 +138,14 @@ void udp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
if (!(s = udp6_sock_port[ntohs(uh->source)]))
return;
- fprintf(stderr, "udp from tap: using socket %i\n", s);
-
- sendto(s, in + sizeof(*uh), len - sizeof(*uh), MSG_DONTWAIT,
+ sendto(s, in + sizeof(*uh), len - sizeof(*uh),
+ MSG_DONTWAIT | MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa));
}
}
/**
- * udp_sock_init() - Create and bind listening sockets for inbound connections
+ * udp_sock_init() - Create and bind listening sockets for inbound packets
* @c: Execution context
*
* Return: 0 on success, -1 on failure
@@ -159,15 +156,19 @@ int udp_sock_init(struct ctx *c)
int s;
for (port = 0; port < USHRT_MAX; port++) {
- if (c->v4 &&
- (s = sock_l4_add(c, 4, IPPROTO_UDP, htons(port))) < 0)
- return -1;
- udp4_sock_port[port] = s;
-
- if (c->v6 &&
- (s = sock_l4_add(c, 6, IPPROTO_UDP, htons(port))) < 0)
- return -1;
- udp6_sock_port[port] = s;
+ if (c->v4) {
+ if ((s = sock_l4_add(c, 4, IPPROTO_UDP, port)) < 0)
+ return -1;
+
+ udp4_sock_port[port] = s;
+ }
+
+ if (c->v6) {
+ if ((s = sock_l4_add(c, 6, IPPROTO_UDP, port)) < 0)
+ return -1;
+
+ udp6_sock_port[port] = s;
+ }
}
return 0;
diff --git a/util.c b/util.c
index 324f800..e8ee57f 100644
--- a/util.c
+++ b/util.c
@@ -139,7 +139,7 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto)
* sock_l4_add() - Create and bind socket for given L4, add to epoll list
* @c: Execution context
* @v: IP protocol, 4 or 6
- * @proto: Protocol number, network order
+ * @proto: Protocol number, host order
- * @port: Port, network order
+ * @port: Port, host order
*
* Return: newly created socket, -1 on error
@@ -148,17 +148,17 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
{
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
- .sin_port = port,
+ .sin_port = htons(port),
.sin_addr = { .s_addr = INADDR_ANY },
};
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
- .sin6_port = port,
+ .sin6_port = htons(port),
.sin6_addr = IN6ADDR_ANY_INIT,
};
struct epoll_event ev = { 0 };
const struct sockaddr *sa;
- int fd, sl;
+ int fd, sl, one = 1;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
return -1; /* Not implemented. */
@@ -176,6 +176,8 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
} else {
sa = (const struct sockaddr *)&addr6;
sl = sizeof(addr6);
+
+ setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
}
if (bind(fd, sa, sl) < 0) {
@@ -213,10 +215,10 @@ int sock_l4_add(struct ctx *c, int v, uint16_t proto, uint16_t port)
int timespec_diff_ms(struct timespec *a, struct timespec *b)
{
if (a->tv_nsec < b->tv_nsec) {
- return (b->tv_nsec - a->tv_nsec) / 1000 +
+ return (b->tv_nsec - a->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec - 1) * 1000;
}
- return (a->tv_nsec - b->tv_nsec) / 1000 +
+ return (a->tv_nsec - b->tv_nsec) / 1000000 +
(a->tv_sec - b->tv_sec) * 1000;
}