aboutgitcodebugslistschat
path: root/tcp.c
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2021-02-21 11:33:38 +0100
committerStefano Brivio <sbrivio@redhat.com>2021-02-21 11:55:49 +0100
commit8bca388e8a771d069b2a2d4ac47589112f6f0af3 (patch)
tree6e934d87ab33ef3116da032c3f5acb044c0cc6cc /tcp.c
parent105b916361ca6e9e63112444c323cc193303120c (diff)
downloadpasst-8bca388e8a771d069b2a2d4ac47589112f6f0af3.tar
passt-8bca388e8a771d069b2a2d4ac47589112f6f0af3.tar.gz
passt-8bca388e8a771d069b2a2d4ac47589112f6f0af3.tar.bz2
passt-8bca388e8a771d069b2a2d4ac47589112f6f0af3.tar.lz
passt-8bca388e8a771d069b2a2d4ac47589112f6f0af3.tar.xz
passt-8bca388e8a771d069b2a2d4ac47589112f6f0af3.tar.zst
passt-8bca388e8a771d069b2a2d4ac47589112f6f0af3.zip
passt: Assorted fixes from "fresh eyes" review
A bunch of fixes not worth single commits at this stage, notably: - make buffer, length parameter ordering consistent in ARP, DHCP, NDP handlers - strict checking of buffer, message and option length in DHCP handler (a malicious client could have easily crashed it) - set up forwarding for IPv4 and IPv6, and masquerading with nft for IPv4, from demo script - get rid of separate slow and fast timers, we don't save any overhead that way - stricter checking of buffer lengths as passed to tap handlers - proper dequeuing from qemu socket back-end: I accidentally trashed messages that were bundled up together in a single tap read operation -- the length header tells us what's the size of the next frame, but there's no apparent limit to the number of messages we get with one single receive - rework some bits of the TCP state machine, now passive and active connection closes appear to be robust -- introduce a new FIN_WAIT_1_SOCK_FIN state indicating a FIN_WAIT_1 with a FIN flag from socket - streamline TCP option parsing routine - track TCP state changes to stderr (this is temporary, proper debugging and syslogging support pending) - observe that multiplying a number by four might very well change its value, and this happens to be the case for the data offset from the TCP header as we check if it's the same as the total length to find out if it's a duplicated ACK segment - recent estimates suggest that the duration of a millisecond is closer to a million nanoseconds than a thousand of them, this trend is now reflected into the timespec_diff_ms() convenience routine Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'tcp.c')
-rw-r--r--tcp.c676
1 files changed, 325 insertions, 351 deletions
diff --git a/tcp.c b/tcp.c
index 46b739d..f1de9cf 100644
--- a/tcp.c
+++ b/tcp.c
@@ -130,7 +130,7 @@
*
* These states apply to connected sockets only, listening sockets are always
* open after initialisation, in LISTEN state. A single state is maintained for
- * both sides of the connection, and most states are omitted as they are already
+ * both sides of the connection, and some states are omitted as they are already
* handled by host kernel and guest.
*
* - CLOSED no connection
@@ -144,31 +144,32 @@
*
* - SOCK_SYN_SENT new connected socket, SYN sent to tap
* - SYN,ACK from tap ACK to tap > ESTABLISHED
- * - SYN,ACK timeout RST to tap, close socket > CLOSED
* - socket error RST to tap, close socket > CLOSED
+ * - SYN,ACK timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
* - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap
+ * - FIN from tap write shutdown > FIN_WAIT_1
* - ACK from tap > ESTABLISHED
- * - ACK timeout RST to tap, close socket > CLOSED
* - socket error RST to tap, close socket > CLOSED
+ * - ACK timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
* - ESTABLISHED connection established, ready for data
- * - zero-sized socket read FIN to tap > ESTABLISHED_SOCK_FIN
- * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN
+ * - FIN from tap write shutdown > FIN_WAIT_1
+ * - zero-sized socket read read shutdown, FIN to tap > ESTABLISHED_SOCK_FIN
* - socket error RST to tap, close socket > CLOSED
- * - FIN from tap FIN,ACK to tap, close socket > FIN_WAIT_1
+ * - data timeout FIN to tap > ESTABLISHED_SOCK_FIN
* - RST from tap close socket > CLOSED
*
- * - ESTABLISHED_SOCK_FIN socket wants to close connection, data allowed
+ * - ESTABLISHED_SOCK_FIN socket closing connection, FIN sent to tap
* - ACK from tap > CLOSE_WAIT
* - ACK timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
- * - CLOSE_WAIT socket wants to close connection, seen by tap
+ * - CLOSE_WAIT socket closing connection, ACK from tap
+ * - FIN from tap write shutdown > LAST_ACK
* - socket error RST to tap, close socket > CLOSED
- * - FIN from tap ACK to tap, close socket > LAST_ACK
* - FIN timeout RST to tap, close socket > CLOSED
* - RST from tap close socket > CLOSED
*
@@ -176,12 +177,19 @@
* - anything from socket close socket > CLOSED
* - socket error RST to tap, close socket > CLOSED
* - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
*
- * - FIN_WAIT_1 tap wants to close connection, _FIN,ACK sent_
- * - ACK from tap close socket > CLOSED
+ * - FIN_WAIT_1 tap closing connection, FIN sent to socket
+ * - zero-sized socket read FIN,ACK to tap, shutdown > FIN_WAIT_1_SOCK_FIN
* - socket error RST to tap, close socket > CLOSED
* - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
*
+ * - FIN_WAIT_1_SOCK_FIN tap closing connection, FIN received from socket
+ * - ACK from tap close socket > CLOSED
+ * - socket error RST to tap, close socket > CLOSED
+ * - ACK timeout RST to tap, close socket > CLOSED
+ * - RST from tap close socket > CLOSED
*
* Connection setup
* ----------------
@@ -198,34 +206,33 @@
* Aging and timeout
* -----------------
*
- * Two bitmaps of TCP_MAX_CONNS bits indicate which connections need scheduled
- * actions:
- * - @tcp_act_fast is used to send ACK segments to the tap once TCP_INFO reports
- * an increased number of acknowledged bytes sent on a socket, and examined
- * every 20ms (one tenth of current TCP_DELACK_MAX on Linux): for each marked
- * connection, a TCP_INFO query is performed and ACK segments are sent right
- * away as needed
- * - @tcp_act_slow is used for state and retransmission timeouts, and examined
- * every 2s: for each marked connection with an expired @timeout timestamp
- * specific actions are taken depending on the connection state:
- * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
- * from tap expires, connection is reset (RST to tap, socket closed)
- * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
- * tap expires, connection is reset (RST to tap, socket closed)
- * - ESTABLISHED: after a timeout of 1s (TODO: implement requirements from
- * RFC 6298) waiting for an ACK segment from tap expires, data from socket
- * queue is retransmitted starting from the last ACK sequence
- * - ESTABLISHED: after a two hours (current TCP_KEEPALIVE_TIME on Linux)
- * timeout waiting for any activity expires, connection is reset (RST to
- * tap, socket closed)
- * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
- * segment from tap expires, connection is reset (RST to tap, socket closed)
- * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from
- * tap expires, connection is reset (RST to tap, socket closed)
- * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
- * socket expires, connection is reset (RST to tap, socket closed)
- * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
- * tap expires, connection is reset (RST to tap, socket closed)
+ * A bitmap of TCP_MAX_CONNS bits indicates the connections subject to timed
+ * events based on states:
+ * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
+ * from tap expires, connection is reset (RST to tap, socket closed)
+ * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * tap expires, connection is reset (RST to tap, socket closed)
+ * - TAP_SYN_SENT: connect() is pending, timeout is handled implicitly by
+ * connect() timeout, connection will be reset in that case
+ * - ESTABLISHED, ESTABLISHED_SOCK_FIN: if an ACK segment to tap is pending,
+ * bytes acknowledged by socket endpoint are checked every 50ms (one quarter
+ * of current TCP_DELACK_MAX on Linux)
+ * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a timeout of 3s (TODO: implement
+ * requirements from RFC 6298) waiting for an ACK segment from tap expires,
+ * data from socket queue is retransmitted starting from the last ACK sequence
+ * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a two-hour (current
+ * TCP_KEEPALIVE_TIME on Linux) timeout waiting for any activity expires,
+ * connection is reset (RST to tap, socket closed)
+ * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
+ * segment from tap expires, connection is reset (RST to tap, socket closed)
+ * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from tap
+ * expires, connection is reset (RST to tap, socket closed)
+ * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * socket expires, connection is reset (RST to tap, socket closed)
+ * - FIN_WAIT_1_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK segment
+ * from tap expires, connection is reset (RST to tap, socket closed)
+ * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
+ * socket expires, connection is reset (RST to tap, socket closed)
*
*
* Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states)
@@ -253,6 +260,7 @@
* - on read error, send RST to tap, close socket
* - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN
* - on ACK from tap:
+ * - set @ts_ack_tap
* - check if it's the second duplicated ACK
* - consume buffer by difference between new ack_seq and @seq_ack_from_tap
* - update @seq_ack_from_tap from ack_seq in header
@@ -263,11 +271,12 @@
* - periodically:
* - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer
* (TODO: implement requirements from RFC 6298, currently 3s fixed) from
- * @last_ts_to_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
+ * @ts_sock elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
* resend data with the steps listed above
*
* - from tap to socket:
* - on packet from tap:
+ * - set @ts_tap
* - set TCP_WINDOW_CLAMP from TCP header from tap
* - check seq from header against @seq_from_tap, if data is missing, send
* two ACKs with number @seq_ack_to_tap, discard packet
@@ -277,15 +286,11 @@
* set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
* to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
* send ACK to tap
- * - set @last_ts_sock
- * - on @seq_ack_to_tap < @seq_from_tap, mark socket for later ACK in bitmap
* - periodically:
- * - if socket is marked in bitmap, query socket for TCP_INFO, on
- * tcpi_bytes_acked > @tcpi_acked_last,
+ * - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last,
* set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
* to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
* send ACK to tap
- * - on @seq_ack_to_tap == @seq_from_tap, unmark socket from bitmap
*/
#define _GNU_SOURCE
@@ -321,22 +326,17 @@
#define SYN_TIMEOUT 240000 /* ms */
#define ACK_TIMEOUT 3000
+#define ACK_INTERVAL 50
#define ACT_TIMEOUT 7200000
#define FIN_TIMEOUT 240000
#define LAST_ACK_TIMEOUT 240000
-#define SOCK_ACK_INTERVAL 20
/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
* <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
*/
#define SOL_TCP IPPROTO_TCP
-static char tcp_in_buf[MAX_WINDOW];
-
-static uint8_t tcp_act_fast[MAX_CONNS / 8] = { 0 };
-static uint8_t tcp_act_slow[MAX_CONNS / 8] = { 0 };
-
enum tcp_state {
CLOSED = 0,
TAP_SYN_SENT,
@@ -347,6 +347,13 @@ enum tcp_state {
CLOSE_WAIT,
LAST_ACK,
FIN_WAIT_1,
+ FIN_WAIT_1_SOCK_FIN,
+};
+
+static char *tcp_state_str[FIN_WAIT_1_SOCK_FIN + 1] = {
+ "CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD",
+ "ESTABLISHED", "ESTABLISHED_SOCK_FIN", "CLOSE_WAIT", "LAST_ACK",
+ "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN",
};
#define FIN (1 << 0)
@@ -357,7 +364,9 @@ enum tcp_state {
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
+#define OPT_MSS_LEN 4
#define OPT_WS 3
+#define OPT_WS_LEN 3
#define OPT_SACKP 4
#define OPT_SACK 5
#define OPT_TS 8
@@ -381,8 +390,9 @@ enum tcp_state {
* @ws_allowed: Window scaling allowed
* @ws: Window scaling factor
* @tap_window: Last window size received from tap, scaled
- * @last_ts_sock: Last activity timestamp from socket for timeout purposes
- * @last_ts_tap: Last activity timestamp from tap for timeout purposes
+ * @ts_sock: Last activity timestamp from socket for timeout purposes
+ * @ts_tap: Last activity timestamp from tap for timeout purposes
+ * @ts_ack_tap: Last ACK segment timestamp from tap for timeout purposes
* @mss_guest: Maximum segment size advertised by guest
*/
struct tcp_conn {
@@ -410,106 +420,101 @@ struct tcp_conn {
int ws;
int tap_window;
- struct timespec last_ts_sock;
- struct timespec last_ts_tap;
+ struct timespec ts_sock;
+ struct timespec ts_tap;
+ struct timespec ts_ack_tap;
int mss_guest;
};
+static char sock_buf[MAX_WINDOW];
+static uint8_t tcp_act[MAX_CONNS / 8] = { 0 };
static struct tcp_conn tc[MAX_CONNS];
static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len);
/**
- * tcp_act_fast_set() - Set socket in bitmap for "fast" timeout events
+ * tcp_act_set() - Set socket in bitmap for timed events
* @s: Socket file descriptor number
*/
-static void tcp_act_fast_set(int s)
+static void tcp_act_set(int s)
{
- tcp_act_fast[s / 8] |= 1 << (s % 8);
+ tcp_act[s / 8] |= 1 << (s % 8);
}
/**
- * tcp_act_fast_clear() - Clear socket from bitmap for "fast" timeout events
+ * tcp_act_clear() - Clear socket from bitmap for timed events
* @s: Socket file descriptor number
*/
-static void tcp_act_fast_clear(int s)
+static void tcp_act_clear(int s)
{
- tcp_act_fast[s / 8] &= ~(1 << (s % 8));
+ tcp_act[s / 8] &= ~(1 << (s % 8));
}
/**
- * tcp_act_slow_set() - Set socket in bitmap for "slow" timeout events
+ * tcp_set_state() - Set given TCP state for socket, report change to stderr
* @s: Socket file descriptor number
+ * @state: New TCP state to be set
*/
-static void tcp_act_slow_set(int s)
+static void tcp_set_state(int s, enum tcp_state state)
{
- tcp_act_slow[s / 8] |= 1 << (s % 8);
-}
-
-/**
- * tcp_act_slow_clear() - Clear socket from bitmap for "slow" timeout events
- * @s: Socket file descriptor number
- */
-static void tcp_act_slow_clear(int s)
-{
- tcp_act_slow[s / 8] &= ~(1 << (s % 8));
+ fprintf(stderr, "TCP: socket %i: %s -> %s\n", s,
+ tcp_state_str[tc[s].s], tcp_state_str[state]);
+ tc[s].s = state;
}
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
* @th: Pointer to TCP header
* @len: Length of buffer, including TCP header
- * @type: Option type to look for
- * @optlen: Optional, filled with option length if passed
- * @value: Optional, set to start of option value if passed
+ * @__type: Option type to look for
+ * @__optlen: Optional, filled with option length if passed
+ * @__value: Optional, set to start of option value if passed
*
* Return: Option value, meaningful for up to 4 bytes, -1 if not found
*/
-static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type,
- uint8_t *optlen, void *value)
+static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type,
+ uint8_t *__optlen, char **__value)
{
- uint8_t *p, __type, __optlen;
+ uint8_t type, optlen;
+ char *p;
- len -= sizeof(*th);
- p = (uint8_t *)(th + 1);
+ if (len > th->doff * 4)
+ len = th->doff * 4;
- if (len > th->doff * 4 - sizeof(*th))
- len = th->doff * 4 - sizeof(*th);
+ len -= sizeof(*th);
+ p = (char *)(th + 1);
- while (len >= 2) {
+ for (; len >= 2; p += optlen, len -= optlen) {
switch (*p) {
case OPT_EOL:
return -1;
case OPT_NOP:
- p++;
- len--;
+ optlen = 1;
break;
default:
- __type = *(p++);
- __optlen = *(p++);
+ type = *(p++);
+ optlen = *(p++) - 2;
len -= 2;
- if (type == __type) {
- if (optlen)
- *optlen = __optlen;
- if (value)
- value = p;
-
- if (__optlen - 2 == 0)
- return 0;
-
- if (__optlen - 2 == 1)
- return *p;
-
- if (__optlen - 2 == 2)
- return ntohs(*(uint16_t *)p);
-
+ if (type != __type)
+ break;
+
+ if (__optlen)
+ *__optlen = optlen;
+ if (__value)
+ *__value = p;
+
+ switch (optlen) {
+ case 0:
+ return 0;
+ case 1:
+ return *p;
+ case 2:
+ return ntohs(*(uint16_t *)p);
+ default:
return ntohl(*(uint32_t *)p);
}
-
- p += __optlen - 2;
- len -= __optlen - 2;
}
}
@@ -524,9 +529,9 @@ static int tcp_opt_get(struct tcphdr *th, unsigned int len, uint8_t type,
static void tcp_close_and_epoll_del(struct ctx *c, int s)
{
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL);
+ tcp_set_state(s, CLOSED);
close(s);
- tcp_act_fast_clear(s);
- tcp_act_slow_clear(s);
+ tcp_act_clear(s);
}
/**
@@ -541,7 +546,7 @@ static void tcp_rst(struct ctx *c, int s)
tcp_send_to_tap(c, s, RST, NULL, 0);
tcp_close_and_epoll_del(c, s);
- tc[s].s = CLOSED;
+ tcp_set_state(s, CLOSED);
}
/**
@@ -549,76 +554,70 @@ static void tcp_rst(struct ctx *c, int s)
* @c: Execution context
* @s: File descriptor number for socket
* @flags: TCP flags to set
- * @in: Input buffer, L4 header
- * @len: Buffer length, at L4
+ * @in: Payload buffer
+ * @len: Payload length
*
- * Return: -1 on error with connection reset, 0 otherwise
+ * Return: negative error code on connection reset, 0 otherwise
*/
static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
{
char buf[USHRT_MAX] = { 0 }, *data;
struct tcp_info info = { 0 };
socklen_t sl = sizeof(info);
- int ws = 0, have_info = 1;
struct tcphdr *th;
+ int ws = 0, err;
- if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) {
- if (!(flags & RST)) {
- tcp_rst(c, s);
- return -1;
- }
-
- have_info = 0;
+ if ((err = getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) &&
+ !(flags & RST)) {
+ tcp_rst(c, s);
+ return err;
}
th = (struct tcphdr *)buf;
data = (char *)(th + 1);
+ th->doff = sizeof(*th) / 4;
- if (flags & SYN && have_info) {
- if (tc[s].ws_allowed)
- ws = info.tcpi_snd_wscale;
-
+ if ((flags & SYN) && !err) {
/* Options: MSS, NOP and window scale if allowed (4-8 bytes) */
- *data++ = 2;
- *data++ = 4;
+ *data++ = OPT_MSS;
+ *data++ = OPT_MSS_LEN;
*(uint16_t *)data = htons(info.tcpi_snd_mss);
- data += 2;
+ data += OPT_MSS_LEN - 2;
+ th->doff += OPT_MSS_LEN / 4;
- if (ws) {
- *data++ = 1;
+ if (tc[s].ws_allowed && (ws = info.tcpi_snd_wscale)) {
+ *data++ = OPT_NOP;
- *data++ = 3;
- *data++ = 3;
- *data++ = ws;
+ *data++ = OPT_WS;
+ *data++ = OPT_WS_LEN;
+ *data = ws;
+ *data += OPT_WS_LEN - 2;
- th->doff = (20 + 8) / 4;
- } else {
- th->doff = (20 + 4) / 4;
+ th->doff += (1 + OPT_WS_LEN) / 4;
}
+ /* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */
th->seq = htonl(tc[s].seq_to_tap++);
} else {
- th->doff = 20 / 4;
-
th->seq = htonl(tc[s].seq_to_tap);
tc[s].seq_to_tap += len;
}
- if ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last || (flags & ACK) ||
- len) &&
- have_info) {
+ if (!err && ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last) ||
+ (flags & ACK) || len)) {
uint64_t ack_seq;
th->ack = 1;
- /* info.tcpi_bytes_acked already includes one byte for SYN, but
- * not for incoming connections.
- */
+
ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap;
- if (!info.tcpi_bytes_acked)
- ack_seq++;
- ack_seq &= (uint32_t)~0U;
- tc[s].seq_ack_to_tap = ack_seq;
+ tc[s].seq_ack_to_tap = ack_seq & (uint32_t)~0U;
+
+ if (tc[s].s == LAST_ACK) {
+ tc[s].seq_ack_to_tap = tc[s].seq_from_tap + 1;
+ th->seq = htonl(ntohl(th->seq) + 1);
+ }
+
th->ack_seq = htonl(tc[s].seq_ack_to_tap);
tc[s].tcpi_acked_last = info.tcpi_bytes_acked;
@@ -636,7 +635,7 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
th->source = tc[s].sock_port;
th->dest = tc[s].tap_port;
- if (have_info)
+ if (!err)
th->window = htons(info.tcpi_snd_wnd >> info.tcpi_snd_wscale);
else
th->window = WINDOW_DEFAULT;
@@ -656,23 +655,18 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len)
* @s: File descriptor number for socket
* @th: TCP header, from tap
* @len: Buffer length, at L4
+ * @init: Set if this is the very first segment from tap
*/
-static void tcp_clamp_window(int s, struct tcphdr *th, int len)
+static void tcp_clamp_window(int s, struct tcphdr *th, int len, int init)
{
- int ws;
-
- if (!tc[s].tap_window) {
- ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
- if (ws >= 0 && ws <= MAX_WS) {
- tc[s].ws_allowed = 1;
- tc[s].ws = ws;
- } else {
- tc[s].ws_allowed = 0;
- tc[s].ws = 0;
- }
-
- /* First value is not scaled. Also, don't clamp yet, to avoid
- * getting a zero scale just because we set a small window now.
+ if (init) {
+ tc[s].ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+ tc[s].ws_allowed = tc[s].ws >= 0 && tc[s].ws <= MAX_WS;
+ tc[s].ws *= tc[s].ws_allowed;
+
+ /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
+ * yet, to avoid getting a zero scale just because we set a
+ * small window now.
*/
tc[s].tap_window = ntohs(th->window);
} else {
@@ -718,25 +712,31 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
sl = sizeof(tc[s].mss_guest);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl);
- tcp_clamp_window(s, th, len);
+ tcp_clamp_window(s, th, len, 1);
if (af == AF_INET) {
- sa = (const struct sockaddr *)&addr4;
+ sa = (struct sockaddr *)&addr4;
sl = sizeof(addr4);
- memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
- memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
- memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a));
+ memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
+ memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
+ memcpy(&tc[s].a.a4.a, addr, sizeof(tc[s].a.a4.a));
} else {
- sa = (const struct sockaddr *)&addr6;
+ sa = (struct sockaddr *)&addr6;
sl = sizeof(addr6);
- memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6));
+ memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6));
}
tc[s].sock_port = th->dest;
tc[s].tap_port = th->source;
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap);
+
+ tcp_act_set(s);
+
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = s;
@@ -745,7 +745,8 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
/* TODO: RFC 6528 with SipHash, worth it? */
- tc[s].seq_ack_from_tap = tc[s].seq_to_tap = 0;
+ tc[s].seq_to_tap = 0;
+ tc[s].seq_ack_from_tap = tc[s].seq_to_tap;
if (connect(s, sa, sl)) {
if (errno != EINPROGRESS) {
@@ -754,17 +755,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
}
ev.events |= EPOLLOUT;
- tc[s].s = TAP_SYN_SENT;
+ tcp_set_state(s, TAP_SYN_SENT);
} else {
if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
return;
- tc[s].s = TAP_SYN_RCVD;
+ tcp_set_state(s, TAP_SYN_RCVD);
}
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
-
- return;
}
/**
@@ -773,7 +772,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
* @tap_port: tap-facing port
* @sock_port: Socket-facing port
*
- * Return: file descriptor number for socket, if found, -1 otherwise
+ * Return: file descriptor number for socket, if found, -ENOENT otherwise
*/
static int tcp_sock_lookup(int af, void *addr,
in_port_t tap_port, in_port_t sock_port)
@@ -797,7 +796,7 @@ static int tcp_sock_lookup(int af, void *addr,
return i;
}
- return -1;
+ return -ENOENT;
}
/**
@@ -808,10 +807,8 @@ static int tcp_sock_lookup(int af, void *addr,
static void tcp_conn_from_sock(struct ctx *c, int fd)
{
struct sockaddr_storage sa_r, sa_l;
- socklen_t sa_len = sizeof(sa_r);
+ socklen_t sa_len = sizeof(sa_l);
struct epoll_event ev = { 0 };
- struct sockaddr_in6 *sa6;
- struct sockaddr_in *sa4;
int s;
if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len))
@@ -822,41 +819,41 @@ static void tcp_conn_from_sock(struct ctx *c, int fd)
return;
if (sa_l.ss_family == AF_INET) {
- sa4 = (struct sockaddr_in *)&sa_r;
+ struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r;
memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero));
memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one));
memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a));
tc[s].sock_port = sa4->sin_port;
-
- sa4 = (struct sockaddr_in *)&sa_l;
- tc[s].tap_port = sa4->sin_port;
-
+ tc[s].tap_port = ((struct sockaddr_in *)&sa_l)->sin_port;
} else if (sa_l.ss_family == AF_INET6) {
- sa6 = (struct sockaddr_in6 *)&sa_r;
+ struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa_r;
memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6));
tc[s].sock_port = sa6->sin6_port;
-
- sa6 = (struct sockaddr_in6 *)&sa_l;
- tc[s].tap_port = sa6->sin6_port;
+ tc[s].tap_port = ((struct sockaddr_in6 *)&sa_l)->sin6_port;
}
/* TODO: RFC 6528 with SipHash, worth it? */
tc[s].seq_to_tap = 0;
+ tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1;
+ tc[s].tap_window = WINDOW_DEFAULT;
tc[s].ws_allowed = 1;
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap);
+
+ tcp_act_set(s);
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = s;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
- tc[s].s = SOCK_SYN_SENT;
+ tcp_set_state(s, SOCK_SYN_SENT);
tcp_send_to_tap(c, s, SYN, NULL, 0);
}
@@ -864,14 +861,13 @@ static void tcp_conn_from_sock(struct ctx *c, int fd)
* tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence
* @c: Execution context
* @s: File descriptor number for socket
- * @seq: Previous TCP sequence, host order
* @data: Data buffer
* @len: Length at L4
* @extra_flags: Additional flags for send(), if any
*
- * Return: -1 on socket error with connection reset, 0 otherwise
+ * Return: negative on socket error with connection reset, 0 otherwise
*/
-static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len,
+static int tcp_send_to_sock(struct ctx *c, int s, char *data, int len,
int extra_flags)
{
int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags);
@@ -884,28 +880,28 @@ static int tcp_send_to_sock(struct ctx *c, int s, int seq, char *data, int len,
return 0;
}
+ err = errno;
tcp_rst(c, s);
- return -1;
+ return -err;
}
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_sock);
- tc[s].seq_from_tap = seq + len;
+ tc[s].seq_from_tap += len;
return 0;
}
/**
- * tcp_check_dupack() - Check if given ACK number is duplicated, update counter
+ * tcp_is_dupack() - Check if given ACK number is duplicated, update counter
* @s: File descriptor number for socket
* @ack_seq: ACK sequence, host order
*
- * Return: 1 on two duplicated ACKs observed, with counter reset, 0 otherwise
+ * Return: -EAGAIN on duplicated ACKs observed, with counter reset, 0 otherwise
*/
-static int tcp_check_dupack(int s, uint32_t ack_seq)
+static int tcp_is_dupack(int s, uint32_t ack_seq)
{
if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) {
tc[s].dup_acks = 0;
- return 1;
+ return -EAGAIN;
}
return 0;
@@ -916,7 +912,7 @@ static int tcp_check_dupack(int s, uint32_t ack_seq)
* @s: File descriptor number for socket
* @ack_seq: ACK sequence, host order
*
- * Return: -1 on invalid sequence, 0 otherwise
+ * Return: negative on invalid sequence, 0 otherwise
*/
static int tcp_sock_consume(int s, uint32_t ack_seq)
{
@@ -926,7 +922,7 @@ static int tcp_sock_consume(int s, uint32_t ack_seq)
to_ack = ack_seq - tc[s].seq_ack_from_tap;
if (to_ack < 0)
- return -1;
+ return -EIO;
recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC);
tc[s].seq_ack_from_tap = ack_seq;
@@ -939,27 +935,29 @@ static int tcp_sock_consume(int s, uint32_t ack_seq)
* @c: Execution context
* @s: File descriptor number for socket
*
- * Return: non-zero on socket error or pending data, 0 otherwise
+ * Return: negative on connection reset, 1 on pending data, 0 otherwise
*/
static int tcp_data_from_sock(struct ctx *c, int s)
{
- int len, offset, left, send;
+ int len, err, offset, left, send;
/* Don't dequeue until acknowledged by guest */
- len = recv(s, tcp_in_buf, sizeof(tcp_in_buf), MSG_DONTWAIT | MSG_PEEK);
+ len = recv(s, sock_buf, sizeof(sock_buf), MSG_DONTWAIT | MSG_PEEK);
if (len < 0) {
- if (errno != EAGAIN && errno != EWOULDBLOCK)
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
tcp_rst(c, s);
- return 1;
+ return -errno;
+ }
+ return 0;
}
if (len == 0) {
if (tc[s].s >= ESTABLISHED_SOCK_FIN)
return 0;
- tc[s].s = ESTABLISHED_SOCK_FIN;
- if (tcp_send_to_tap(c, s, FIN | ACK, NULL, 0))
- return 0;
+ tcp_set_state(s, ESTABLISHED_SOCK_FIN);
+ if ((err = tcp_send_to_tap(c, s, FIN | ACK, NULL, 0)))
+ return err;
left = 0;
goto out;
@@ -973,16 +971,15 @@ static int tcp_data_from_sock(struct ctx *c, int s)
else
send = tc[s].mss_guest;
- if (tcp_send_to_tap(c, s, 0, tcp_in_buf + offset, send))
- return 0;
+ if ((err = tcp_send_to_tap(c, s, 0, sock_buf + offset, send)))
+ return err;
offset += send;
left -= send;
}
out:
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
- tcp_act_slow_set(s);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_sock);
return !!left;
}
@@ -997,7 +994,7 @@ out:
void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
{
struct tcphdr *th = (struct tcphdr *)in;
- size_t off;
+ size_t off, skip = 0;
int s, ws;
if (len < sizeof(*th))
@@ -1007,9 +1004,7 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
if (off < sizeof(*th) || off > len)
return;
- s = tcp_sock_lookup(af, addr, th->source, th->dest);
-
- if (s < 0) {
+ if ((s = tcp_sock_lookup(af, addr, th->source, th->dest)) < 0) {
if (th->syn)
tcp_conn_from_tap(c, af, addr, th, len);
return;
@@ -1020,15 +1015,19 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
return;
}
- tcp_clamp_window(s, th, len);
+ tcp_clamp_window(s, th, len, th->syn && th->ack);
- if (th->ack)
- clock_gettime(CLOCK_MONOTONIC, &tc[s].last_ts_tap);
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_tap);
+
+ if (ntohl(th->seq) < tc[s].seq_from_tap)
+ skip = tc[s].seq_from_tap - ntohl(th->seq);
switch (tc[s].s) {
case SOCK_SYN_SENT:
- if (!th->syn || !th->ack)
+ if (!th->syn || !th->ack) {
+ tcp_rst(c, s);
return;
+ }
tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
if (tc[s].mss_guest < 0)
@@ -1045,19 +1044,20 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
return;
}
- tc[s].seq_from_tap = tc[s].seq_init_from_tap = ntohl(th->seq);
+ /* info.tcpi_bytes_acked already includes one byte for SYN, but
+ * not for incoming connections.
+ */
+ tc[s].seq_init_from_tap = ntohl(th->seq) + 1;
+ tc[s].seq_from_tap = tc[s].seq_init_from_tap;
tc[s].seq_ack_to_tap = tc[s].seq_from_tap;
- tc[s].s = ESTABLISHED;
+ tcp_set_state(s, ESTABLISHED);
tcp_send_to_tap(c, s, ACK, NULL, 0);
break;
- case TAP_SYN_SENT:
- break;
case TAP_SYN_RCVD:
if (th->fin) {
shutdown(s, SHUT_WR);
- tc[s].s = FIN_WAIT_1;
-
+ tcp_set_state(s, FIN_WAIT_1);
break;
}
@@ -1066,83 +1066,81 @@ void tcp_tap_handler(struct ctx *c, int af, void *addr, char *in, size_t len)
return;
}
- tc[s].seq_ack_from_tap = ntohl(th->ack_seq);
-
- tc[s].s = ESTABLISHED;
+ tcp_set_state(s, ESTABLISHED);
break;
case ESTABLISHED:
+ case ESTABLISHED_SOCK_FIN:
+ clock_gettime(CLOCK_MONOTONIC, &tc[s].ts_ack_tap);
+
+ if (ntohl(th->seq) > tc[s].seq_from_tap) {
+ tc[s].seq_from_tap = tc[s].seq_ack_to_tap;
+ tcp_send_to_tap(c, s, ACK, NULL, 0);
+ break;
+ }
+
if (th->ack) {
int retrans = 0;
- if (len == th->doff)
- retrans = tcp_check_dupack(s, th->ack_seq);
+ if (len == off)
+ retrans = tcp_is_dupack(s, ntohl(th->ack_seq));
if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
tcp_rst(c, s);
return;
}
- if (retrans) {
+ tc[s].seq_ack_from_tap = ntohl(th->ack_seq);
+
+ if (retrans)
tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
- tcp_data_from_sock(c, s);
+
+ if (tc[s].s == ESTABLISHED_SOCK_FIN) {
+ if (!tcp_data_from_sock(c, s))
+ tcp_set_state(s, CLOSE_WAIT);
}
}
- if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off,
+ if (skip < len - off &&
+ tcp_send_to_sock(c, s, in + off + skip, len - off - skip,
th->psh ? 0 : MSG_MORE))
break;
if (th->fin) {
shutdown(s, SHUT_WR);
- tc[s].s = FIN_WAIT_1;
+ if (tc[s].s == ESTABLISHED)
+ tcp_set_state(s, FIN_WAIT_1);
+ else
+ tcp_set_state(s, LAST_ACK);
}
break;
- case ESTABLISHED_SOCK_FIN:
- if (tcp_send_to_sock(c, s, ntohl(th->seq), in + off, len - off,
- th->psh ? 0 : MSG_MORE) < 0)
- break;
-
- if (th->ack) {
- shutdown(s, SHUT_RD);
- if (!tcp_data_from_sock(c, s))
- tc[s].s = CLOSE_WAIT;
-
- if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
- tcp_rst(c, s);
- return;
- }
- }
-
- break;
-
case CLOSE_WAIT:
if (tcp_sock_consume(s, ntohl(th->ack_seq))) {
tcp_rst(c, s);
return;
}
+ if (skip < len - off &&
+ tcp_send_to_sock(c, s, in + off + skip, len - off - skip,
+ th->psh ? 0 : MSG_MORE))
+ break;
+
if (th->fin) {
shutdown(s, SHUT_WR);
- tc[s].s = LAST_ACK;
+ tcp_set_state(s, LAST_ACK);
}
break;
+ case FIN_WAIT_1_SOCK_FIN:
+ if (th->ack)
+ tcp_close_and_epoll_del(c, s);
+ break;
case FIN_WAIT_1:
+ case TAP_SYN_SENT:
case LAST_ACK:
case CLOSED: /* ;) */
break;
}
-
- if (tc[s].seq_to_tap > tc[s].seq_ack_from_tap)
- tcp_act_slow_set(s);
- else
- tcp_act_slow_clear(s);
-
- if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap)
- tcp_act_fast_set(s);
- else
- tcp_act_fast_clear(s);
}
/**
@@ -1162,14 +1160,15 @@ static void tcp_connect_finish(struct ctx *c, int s)
return;
}
- if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0) < 0)
+ if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0))
return;
+ /* Drop EPOLLOUT, only used to wait for connect() to complete */
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
ev.data.fd = s;
epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev);
- tc[s].s = TAP_SYN_RCVD;
+ tcp_set_state(s, TAP_SYN_RCVD);
}
/**
@@ -1184,6 +1183,7 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events)
int so;
if (tc[s].s == LAST_ACK) {
+ tcp_send_to_tap(c, s, ACK, NULL, 0);
tcp_close_and_epoll_del(c, s);
return;
}
@@ -1210,21 +1210,21 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events)
tcp_data_from_sock(c, s);
if (events & EPOLLRDHUP || events & EPOLLHUP) {
- if (tc[s].s == ESTABLISHED)
- tc[s].s = ESTABLISHED_SOCK_FIN;
-
- tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
-
- if (tc[s].s == FIN_WAIT_1) {
+ if (tc[s].s == ESTABLISHED) {
+ tcp_set_state(s, ESTABLISHED_SOCK_FIN);
+ shutdown(s, SHUT_RD);
+ tcp_data_from_sock(c, s);
+ tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
+ } else if (tc[s].s == FIN_WAIT_1) {
+ tcp_set_state(s, FIN_WAIT_1_SOCK_FIN);
shutdown(s, SHUT_RD);
+ tcp_data_from_sock(c, s);
+ tcp_send_to_tap(c, s, FIN | ACK, NULL, 0);
- if (tcp_sock_consume(s, ntohl(tc[s].seq_ack_from_tap))) {
+ if (tcp_sock_consume(s, tc[s].seq_ack_from_tap)) {
tcp_rst(c, s);
return;
}
-
- tcp_close_and_epoll_del(c, s);
- tc[s].s = CLOSED;
}
}
}
@@ -1240,9 +1240,9 @@ int tcp_sock_init(struct ctx *c)
in_port_t port;
for (port = 0; port < (1 << 15) + (1 << 14); port++) {
- if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, htons(port)) < 0)
+ if (c->v4 && sock_l4_add(c, 4, IPPROTO_TCP, port) < 0)
return -1;
- if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, htons(port)) < 0)
+ if (c->v6 && sock_l4_add(c, 6, IPPROTO_TCP, port) < 0)
return -1;
}
@@ -1250,118 +1250,92 @@ int tcp_sock_init(struct ctx *c)
}
/**
- * tcp_periodic_fast_one() - Handler for "fast" timeout events on one socket
+ * tcp_timer_one() - Handler for timed events on one socket
* @c: Execution context
* @s: File descriptor number for socket
* @ts: Timestamp from caller
- *
- * Return: 0 if socket needs to be monitored further, non-zero otherwise
- */
-int tcp_periodic_fast_one(struct ctx *c, int s, struct timespec *ts)
-{
- if (timespec_diff_ms(ts, &tc[s].last_ts_sock) < SOCK_ACK_INTERVAL)
- return 0;
-
- tc[s].last_ts_sock = *ts;
-
- tcp_send_to_tap(c, s, 0, NULL, 0);
-
- return tc[s].seq_from_tap == tc[s].seq_ack_to_tap;
-}
-
-/**
- * tcp_periodic_fast() - Handle sockets in "fast" event bitmap, clear as needed
- * @c: Execution context
*/
-void tcp_periodic_fast(struct ctx *c)
+static void tcp_timer_one(struct ctx *c, int s, struct timespec *ts)
{
- long *word = (long *)tcp_act_fast, tmp;
- struct timespec now;
- unsigned int i;
- int n, s;
-
- clock_gettime(CLOCK_MONOTONIC, &now);
-
- for (i = 0; i < sizeof(tcp_act_fast) / sizeof(long); i++, word++) {
- tmp = *word;
- while ((n = ffsl(tmp))) {
- tmp &= ~(1UL << (n - 1));
+ int ack_tap_ms = timespec_diff_ms(ts, &tc[s].ts_ack_tap);
+ int sock_ms = timespec_diff_ms(ts, &tc[s].ts_tap);
+ int tap_ms = timespec_diff_ms(ts, &tc[s].ts_tap);
- s = i * sizeof(long) * 8 + n - 1;
-
- if (tcp_periodic_fast_one(c, s, &now))
- *word &= ~(1UL << (n - 1));
- }
- }
-}
-
-/**
- * tcp_periodic_fast_one() - Handler for "slow" timeout events on one socket
- * @c: Execution context
- * @s: File descriptor number for socket
- * @ts: Timestamp from caller
- */
-void tcp_periodic_slow_one(struct ctx *c, int s, struct timespec *ts)
-{
switch (tc[s].s) {
case SOCK_SYN_SENT:
- case TAP_SYN_SENT:
case TAP_SYN_RCVD:
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > SYN_TIMEOUT)
+ if (ack_tap_ms > SYN_TIMEOUT)
tcp_rst(c, s);
+
break;
case ESTABLISHED_SOCK_FIN:
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT) {
+ if (ack_tap_ms > FIN_TIMEOUT) {
tcp_rst(c, s);
break;
}
/* Falls through */
case ESTABLISHED:
- if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap &&
- timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACK_TIMEOUT) {
- tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
- tcp_data_from_sock(c, s);
+ if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT)
+ tcp_rst(c, s);
+
+ if (tc[s].seq_to_tap == tc[s].seq_ack_from_tap &&
+ tc[s].seq_from_tap == tc[s].seq_ack_to_tap) {
+ tc[s].ts_sock = *ts;
+ break;
}
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > ACT_TIMEOUT &&
- timespec_diff_ms(ts, &tc[s].last_ts_sock) > ACT_TIMEOUT)
- tcp_rst(c, s);
+ if (sock_ms > ACK_INTERVAL) {
+ if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap)
+ tcp_send_to_tap(c, s, 0, NULL, 0);
+ }
+
+ if (ack_tap_ms > ACK_TIMEOUT) {
+ if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap) {
+ tc[s].seq_to_tap = tc[s].seq_ack_from_tap;
+ tc[s].ts_ack_tap = *ts;
+ tcp_data_from_sock(c, s);
+ }
+ }
+
+ if (tc[s].seq_from_tap == tc[s].seq_ack_to_tap)
+ tc[s].ts_sock = *ts;
break;
case CLOSE_WAIT:
case FIN_WAIT_1:
- if (timespec_diff_ms(ts, &tc[s].last_ts_tap) > FIN_TIMEOUT)
+ if (sock_ms > FIN_TIMEOUT)
+ tcp_rst(c, s);
+ break;
+ case FIN_WAIT_1_SOCK_FIN:
+ if (ack_tap_ms > FIN_TIMEOUT)
tcp_rst(c, s);
break;
case LAST_ACK:
- if (timespec_diff_ms(ts, &tc[s].last_ts_sock) >
- LAST_ACK_TIMEOUT)
+ if (sock_ms > LAST_ACK_TIMEOUT)
tcp_rst(c, s);
break;
+ case TAP_SYN_SENT:
case CLOSED:
break;
}
}
/**
- * tcp_periodic_slow() - Handle sockets in "slow" event bitmap
+ * tcp_timer() - Scan activity bitmap for sockets waiting for timed events
* @c: Execution context
+ * @ts: Timestamp from caller
*/
-void tcp_periodic_slow(struct ctx *c)
+void tcp_timer(struct ctx *c, struct timespec *ts)
{
- long *word = (long *)tcp_act_slow, tmp;
- struct timespec now;
+ long *word = (long *)tcp_act, tmp;
unsigned int i;
int n;
- clock_gettime(CLOCK_MONOTONIC, &now);
-
- for (i = 0; i < sizeof(tcp_act_slow) / sizeof(long); i++, word++) {
+ for (i = 0; i < sizeof(tcp_act) / sizeof(long); i++, word++) {
tmp = *word;
while ((n = ffsl(tmp))) {
tmp &= ~(1UL << (n - 1));
- tcp_periodic_slow_one(c, i * sizeof(long) * 8 + n - 1,
- &now);
+ tcp_timer_one(c, i * sizeof(long) * 8 + n - 1, ts);
}
}
}