aboutgitcodebugslistschat
path: root/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'tcp.c')
-rw-r--r--tcp.c442
1 files changed, 247 insertions, 195 deletions
diff --git a/tcp.c b/tcp.c
index 802d6d8..e03561a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -303,6 +303,9 @@
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
+#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
+
#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
@@ -440,6 +443,7 @@ struct tcp_conn {
#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1)
#define TCP_WS_BITS 4 /* RFC 7323 */
+#define TCP_WS_MAX 14
unsigned int ws_from_tap :TCP_WS_BITS;
unsigned int ws_to_tap :TCP_WS_BITS;
@@ -476,7 +480,6 @@ struct tcp_conn {
uint32_t seq_init_from_tap;
};
-#define CONN_IS_CLOSED(conn) (conn->events == CLOSED)
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -699,7 +702,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLET;
if (conn_flags & STALLED)
- return EPOLLIN | EPOLLRDHUP | EPOLLET;
+ return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
return EPOLLIN | EPOLLRDHUP;
}
@@ -733,8 +736,11 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
.r.p.tcp.tcp.v6 = CONN_V6(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 };
- if (CONN_IS_CLOSED(conn)) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ if (conn->events == CLOSED) {
+ if (conn->flags & IN_EPOLL)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ if (conn->timer != -1)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
return 0;
}
@@ -745,6 +751,18 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
conn->flags |= IN_EPOLL; /* No need to log this */
+ if (conn->timer != -1) {
+ union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
+ .r.s = conn->sock,
+ .r.p.tcp.tcp.timer = 1,
+ .r.p.tcp.tcp.index = conn - tc };
+ struct epoll_event ev_t = { .data.u64 = ref_t.u64,
+ .events = EPOLLIN | EPOLLET };
+
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ return -errno;
+ }
+
return 0;
}
@@ -759,6 +777,9 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
{
struct itimerspec it = { { 0 }, { 0 } };
+ if (conn->events == CLOSED)
+ return;
+
if (conn->timer == -1) {
union epoll_ref ref = { .r.proto = IPPROTO_TCP,
.r.s = conn->sock,
@@ -783,15 +804,11 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
}
}
- if (conn->events == CLOSED) {
- it.it_value.tv_nsec = 1;
- } else if (conn->flags & ACK_TO_TAP_DUE) {
+ if (conn->flags & ACK_TO_TAP_DUE) {
it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED))
it.it_value.tv_sec = SYN_TIMEOUT;
- else if (conn->events & TAP_FIN_SENT)
- it.it_value.tv_sec = FIN_TIMEOUT;
else
it.it_value.tv_sec = ACK_TIMEOUT;
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
@@ -834,7 +851,9 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
if (flag == STALLED || flag == ~STALLED)
tcp_epoll_ctl(c, conn);
- if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE)
+ if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
+ (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
+ (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
tcp_timer_ctl(c, conn);
}
@@ -888,7 +907,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn,
else
tcp_epoll_ctl(c, conn);
- if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
+ if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn);
}
@@ -1182,36 +1201,32 @@ static void tcp_sock6_iov_init(void)
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
- * @th: Pointer to TCP header
- * @len: Length of buffer, including TCP header
+ * @opts: Pointer to start of TCP options in header
+ * @len: Length of buffer, excluding TCP header -- NOT checked here!
* @type_find: Option type to look for
* @optlen_set: Optional, filled with option length if passed
* @value_set: Optional, set to start of option value if passed
*
* Return: option value, meaningful for up to 4 bytes, -1 if not found
*/
-static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
+static int tcp_opt_get(char *opts, size_t len, uint8_t type_find,
uint8_t *optlen_set, char **value_set)
{
uint8_t type, optlen;
- char *p;
-
- if (len > (size_t)th->doff * 4)
- len = (size_t)th->doff * 4;
- len -= sizeof(*th);
- p = (char *)(th + 1);
+ if (!len)
+ return -1;
- for (; len >= 2; p += optlen, len -= optlen) {
- switch (*p) {
+ for (; len >= 2; opts += optlen, len -= optlen) {
+ switch (*opts) {
case OPT_EOL:
return -1;
case OPT_NOP:
optlen = 1;
break;
default:
- type = *(p++);
- optlen = *(p++) - 2;
+ type = *(opts++);
+ optlen = *(opts++) - 2;
len -= 2;
if (type != type_find)
@@ -1220,17 +1235,17 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
if (optlen_set)
*optlen_set = optlen;
if (value_set)
- *value_set = p;
+ *value_set = opts;
switch (optlen) {
case 0:
return 0;
case 1:
- return *p;
+ return *opts;
case 2:
- return ntohs(*(uint16_t *)p);
+ return ntohs(*(uint16_t *)opts);
default:
- return ntohl(*(uint32_t *)p);
+ return ntohl(*(uint32_t *)opts);
}
}
}
@@ -1415,12 +1430,12 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
if ((hole - tc) == --c->tcp.conn_count) {
debug("TCP: hash table compaction: index %i (%p) was max index",
hole - tc, hole);
+ memset(hole, 0, sizeof(*hole));
return;
}
from = CONN(c->tcp.conn_count);
memcpy(hole, from, sizeof(*hole));
- from->flags = from->events = 0;
to = hole;
tcp_hash_update(from, to);
@@ -1430,25 +1445,23 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
debug("TCP: hash table compaction: old index %i, new index %i, "
"sock %i, from: %p, to: %p",
from - tc, to - tc, from->sock, from, to);
+
+ memset(from, 0, sizeof(*from));
}
/**
- * tcp_conn_destroy() - Close connection, drop from epoll file descriptor
+ * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction
* @c: Execution context
* @conn: Connection pointer
*/
static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
{
- if (CONN_IS_CLOSED(conn))
- return;
-
- conn_event(c, conn, CLOSED);
- conn->flags = 0;
close(conn->sock);
+ if (conn->timer != -1)
+ close(conn->timer);
- /* Removal from hash table and connection table compaction deferred to
- * timer.
- */
+ tcp_hash_remove(conn);
+ tcp_table_compact(c, conn);
}
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);
@@ -1582,9 +1595,23 @@ static void tcp_l2_data_buf_flush(struct ctx *c)
*/
void tcp_defer_handler(struct ctx *c)
{
+ int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
+ int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
+ struct tcp_conn *conn;
+
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
+
tcp_splice_defer_handler(c);
+
+ if (c->tcp.conn_count < MIN(max_files, max_conns))
+ return;
+
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ if (conn->events == CLOSED)
+ tcp_conn_destroy(c, conn);
+ }
+
}
/**
@@ -1605,13 +1632,19 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn,
size_t ip_len, eth_len;
#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \
- do { \
- b->th.source = htons(conn->sock_port); \
- b->th.dest = htons(conn->tap_port); \
- b->th.seq = htonl(seq); \
- b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
- b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \
- } while (0)
+do { \
+ b->th.source = htons(conn->sock_port); \
+ b->th.dest = htons(conn->tap_port); \
+ b->th.seq = htonl(seq); \
+ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
+ if (conn->events & ESTABLISHED) { \
+ b->th.window = htons(conn->wnd_to_tap); \
+ } else { \
+ unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \
+ \
+ b->th.window = htons(MIN(wnd, USHRT_MAX)); \
+ } \
+} while (0)
if (CONN_V6(conn)) {
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
@@ -1692,7 +1725,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
conn->seq_ack_to_tap = prev_ack_to_tap;
#else
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
- || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) {
+ || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) {
@@ -1712,17 +1745,19 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
if (!KERNEL_REPORTS_SND_WND(c)) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
- conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap;
+ conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
+ USHRT_MAX);
goto out;
}
if (!tinfo) {
- if (prev_wnd_to_tap > WINDOW_DEFAULT)
+ if (prev_wnd_to_tap > WINDOW_DEFAULT) {
goto out;
-
+}
tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
goto out;
+}
}
#ifdef HAS_SND_WND
@@ -1735,10 +1770,15 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
}
#endif
- conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap;
+ new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
+ if (!(conn->events & ESTABLISHED))
+ new_wnd_to_tap = MAX(new_wnd_to_tap, WINDOW_DEFAULT);
+
+ conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, USHRT_MAX);
if (!conn->wnd_to_tap)
conn_flag(c, conn, ACK_TO_TAP_DUE);
+
out:
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
@@ -1772,10 +1812,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
return 0;
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return -ECONNRESET;
}
+#ifdef HAS_SND_WND
+ if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
+ c->tcp.kernel_snd_wnd = 1;
+#endif
+
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
@@ -1825,11 +1870,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
data += OPT_MSS_LEN - 2;
th->doff += OPT_MSS_LEN / 4;
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*data++ = OPT_NOP;
@@ -1854,10 +1894,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
NULL, conn->seq_to_tap);
iov->iov_len = eth_len + sizeof(uint32_t);
- /* First value is not scaled: scale now */
- if (flags & SYN)
- conn->wnd_to_tap >>= conn->ws_to_tap;
-
if (CONN_V4(conn))
tcp4_l2_flags_buf_bytes += iov->iov_len;
else
@@ -1905,68 +1941,55 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
*/
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
{
- if (CONN_IS_CLOSED(conn))
+ if (conn->events == CLOSED)
return;
if (!tcp_send_flag(c, conn, RST))
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
}
/**
- * tcp_clamp_window() - Set window and scaling from option, clamp on socket
+ * tcp_get_tap_ws() - Get Window Scaling option for connection from tap/guest
* @conn: Connection pointer
- * @th: TCP header, from tap, can be NULL if window is passed
- * @len: Buffer length, at L4, can be 0 if no header is passed
- * @window: Window value, host order, unscaled, if no header is passed
- * @init: Set if this is the very first segment from tap
- */
-static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, int len, unsigned int window,
- int init)
+ * @opts: Pointer to start of TCP options
+ * @optlen: Bytes in options: caller MUST ensure available length
+ */
+static void tcp_get_tap_ws(struct tcp_conn *conn, char *opts, size_t optlen)
{
- if (init && th) {
- int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+ int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL);
- conn->ws_from_tap = ws & 0xf;
+ if (ws >= 0 && ws <= TCP_WS_MAX)
+ conn->ws_from_tap = ws;
+ else
+ conn->ws_from_tap = 0;
+}
- /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
- * yet, to avoid getting a zero scale just because we set a
- * small window now.
- */
- conn->wnd_from_tap = ntohs(th->window);
- } else {
- uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+/**
+ * tcp_clamp_window() - Set new window for connection, clamp on socket
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @window: Window value, host order, unscaled
+ */
+static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, unsigned wnd)
+{
+ uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
- if (th)
- window = ntohs(th->window) << conn->ws_from_tap;
- else
- window <<= conn->ws_from_tap;
-
- window = MIN(MAX_WINDOW, window);
-
- if (conn->flags & WND_CLAMPED) {
- if (prev_scaled == window)
- return;
-
- /* Discard +/- 1% updates to spare some syscalls. */
- if ((window > prev_scaled &&
- window * 99 / 100 < prev_scaled) ||
- (window < prev_scaled &&
- window * 101 / 100 > prev_scaled)) {
- conn->wnd_from_tap = window >>
- conn->ws_from_tap;
- return;
- }
- }
+ wnd <<= conn->ws_from_tap;
+ wnd = MIN(MAX_WINDOW, wnd);
- if (window < 256)
- window = 256;
+ if (conn->flags & WND_CLAMPED) {
+ if (prev_scaled == wnd)
+ return;
- conn->wnd_from_tap = window >> conn->ws_from_tap;
- setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
- &window, sizeof(window));
- conn_flag(c, conn, WND_CLAMPED);
+ /* Discard +/- 1% updates to spare some syscalls. */
+ if ((wnd > prev_scaled && wnd * 99 / 100 < prev_scaled) ||
+ (wnd < prev_scaled && wnd * 101 / 100 > prev_scaled))
+ return;
}
+
+ conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
+ setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &wnd, sizeof(wnd));
+ conn_flag(c, conn, WND_CLAMPED);
}
/**
@@ -2059,18 +2082,18 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af)
* tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest
* @c: Execution context
* @conn: Connection pointer
- * @th: TCP header send by tap/guest
- * @len: L4 packet length, host order
+ * @opts: Pointer to start of TCP options
+ * @optlen: Bytes in options: caller MUST ensure available length
*
* Return: clamped MSS value
*/
static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, size_t len)
+ char *opts, size_t optlen)
{
unsigned int mss;
int ret;
- if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
+ if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT;
else
mss = ret;
@@ -2091,12 +2114,13 @@ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
- * @th: TCP header from tap
- * @len: Packet length at L4
+ * @th: TCP header from tap: caller MUST ensure it's there
+ * @opts: Pointer to start of options
+ * @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
- struct tcphdr *th, size_t len,
+ struct tcphdr *th, char *opts, size_t optlen,
struct timespec *now)
{
struct sockaddr_in addr4 = {
@@ -2142,16 +2166,21 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
conn = CONN(c->tcp.conn_count++);
conn->sock = s;
conn->timer = -1;
- conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
- mss = tcp_conn_tap_mss(c, conn, th, len);
+ mss = tcp_conn_tap_mss(c, conn, opts, optlen);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
MSS_SET(conn, mss);
- tcp_clamp_window(c, conn, th, len, 0, 1);
+ tcp_get_tap_ws(conn, opts, optlen);
+
+ /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp yet, to
+ * avoid getting a zero scale just because we set a small window now.
+ */
+ if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
+ conn->wnd_from_tap = 1;
if (af == AF_INET) {
sa = (struct sockaddr *)&addr4;
@@ -2395,53 +2424,52 @@ zero_len:
}
/**
- * tcp_data_from_tap() - tap data for established connection
+ * tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
* @conn: Connection pointer
- * @msg: Array of messages from tap
- * @count: Count of messages
+ * @p: Pool of TCP packets, with TCP headers
*
* #syscalls sendmsg
*/
static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
- struct tap_l4_msg *msg, int count)
+ struct pool *p)
{
- int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
+ int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
uint32_t max_ack_seq = conn->seq_ack_from_tap;
uint32_t seq_from_tap = conn->seq_from_tap;
struct msghdr mh = { .msg_iov = tcp_iov };
- int partial_send = 0;
- uint16_t len;
+ size_t len;
ssize_t n;
- for (i = 0, iov_i = 0; i < count; i++) {
+ for (i = 0, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
struct tcphdr *th;
char *data;
size_t off;
- th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset);
- len = msg[i].l4_len;
-
- if (len < sizeof(*th)) {
+ packet_get(p, i, 0, 0, &len);
+ th = packet_get(p, i, 0, sizeof(*th), NULL);
+ if (!th) {
tcp_rst(c, conn);
return;
}
- off = (size_t)th->doff * 4;
+ off = th->doff * 4UL;
if (off < sizeof(*th) || off > len) {
tcp_rst(c, conn);
return;
}
if (th->rst) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return;
}
len -= off;
- data = (char *)th + off;
+ data = packet_get(p, i, off, len, NULL);
+ if (!data)
+ continue;
seq = ntohl(th->seq);
ack_seq = ntohl(th->ack_seq);
@@ -2511,7 +2539,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
i = keep - 1;
}
- tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0);
+ tcp_clamp_window(c, conn, max_ack_seq_wnd);
if (ack) {
if (max_ack_seq == conn->seq_to_tap) {
@@ -2595,14 +2623,22 @@ out:
* tcp_conn_from_sock_finish() - Complete connection setup after connect()
* @c: Execution context
* @conn: Connection pointer
- * @th: TCP header of SYN, ACK segment from tap/guest
- * @len: Packet length of SYN, ACK segment at L4, host order
+ * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there
+ * @opts: Pointer to start of options
+ * @optlen: Bytes in options: caller MUST ensure available length
*/
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, size_t len)
+ struct tcphdr *th,
+ char *opts, size_t optlen)
{
- tcp_clamp_window(c, conn, th, len, 0, 1);
- MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len));
+ tcp_clamp_window(c, conn, ntohs(th->window));
+ tcp_get_tap_ws(conn, opts, optlen);
+
+ /* First value is not scaled */
+ if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
+ conn->wnd_from_tap = 1;
+
+ MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap;
@@ -2622,32 +2658,42 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Destination address
- * @msg: Input messages
- * @count: Message count
+ * @p: Pool of TCP packets, with TCP headers
* @now: Current timestamp
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now)
+int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now)
{
- struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
- uint16_t len = msg[0].l4_len;
struct tcp_conn *conn;
+ size_t optlen, len;
+ struct tcphdr *th;
int ack_due = 0;
+ char *opts;
+
+ packet_get(p, 0, 0, 0, &len);
+ th = packet_get(p, 0, 0, sizeof(*th), NULL);
+ if (!th)
+ return 1;
+
+ optlen = th->doff * 4UL - sizeof(*th);
+ opts = packet_get(p, 0, sizeof(*th), optlen, NULL);
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
/* New connection from tap */
if (!conn) {
if (th->syn && !th->ack)
- tcp_conn_from_tap(c, af, addr, th, len, now);
+ tcp_conn_from_tap(c, af, addr, th, opts, optlen, now);
return 1;
}
+ trace("TCP: packet length %lu from tap for index %lu", len, conn - tc);
+
if (th->rst) {
- tcp_conn_destroy(c, conn);
- return count;
+ conn_event(c, conn, CLOSED);
+ return p->count;
}
if (th->ack) {
@@ -2660,7 +2706,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
/* Establishing connection from socket */
if (conn->events & SOCK_ACCEPTED) {
if (th->syn && th->ack && !th->fin)
- tcp_conn_from_sock_finish(c, conn, th, len);
+ tcp_conn_from_sock_finish(c, conn, th, opts, optlen);
else
tcp_rst(c, conn);
@@ -2671,7 +2717,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_SYN_RCVD) {
if (!(conn->events & TAP_SYN_ACK_SENT)) {
tcp_rst(c, conn);
- return count;
+ return p->count;
}
conn_event(c, conn, ESTABLISHED);
@@ -2683,17 +2729,19 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
tcp_send_flag(c, conn, ACK);
conn_event(c, conn, SOCK_FIN_SENT);
- return count;
+ return p->count;
}
if (!th->ack) {
tcp_rst(c, conn);
- return count;
+ return p->count;
}
- tcp_clamp_window(c, conn, th, len, 0, 0);
+ tcp_clamp_window(c, conn, ntohs(th->window));
+
+ tcp_data_from_sock(c, conn);
- if (count == 1)
+ if (p->count == 1)
return 1;
}
@@ -2701,13 +2749,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_FIN_RCVD) {
if (conn->events & SOCK_FIN_RCVD &&
conn->seq_ack_from_tap == conn->seq_to_tap)
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return 1;
}
/* Established connections accepting data from tap */
- tcp_data_from_tap(c, conn, msg, count);
+ tcp_data_from_tap(c, conn, p);
if (conn->seq_ack_to_tap != conn->seq_from_tap)
ack_due = 1;
@@ -2721,7 +2769,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (ack_due)
conn_flag(c, conn, ACK_TO_TAP_DUE);
- return count;
+ return p->count;
}
/**
@@ -2872,7 +2920,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
if (!(conn->events & ESTABLISHED)) {
debug("TCP: index %i, handshake timeout", conn - tc);
tcp_rst(c, conn);
- } else if (conn->events & TAP_FIN_SENT) {
+ } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
debug("TCP: index %i, FIN timeout", conn - tc);
tcp_rst(c, conn);
} else if (conn->retrans == TCP_MAX_RETRANS) {
@@ -2884,6 +2932,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2933,19 +2982,22 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index)))
return;
+ if (conn->events == CLOSED)
+ return;
+
if (events & EPOLLERR) {
tcp_rst(c, conn);
return;
}
if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return;
}
if (conn->events & ESTABLISHED) {
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
if (events & (EPOLLRDHUP | EPOLLHUP))
conn_event(c, conn, SOCK_FIN_RCVD);
@@ -3159,7 +3211,7 @@ static int tcp_sock_refill(void *arg)
*
* Return: 0 on success, -1 on failure
*/
-int tcp_sock_init(struct ctx *c, struct timespec *now)
+int tcp_sock_init(struct ctx *c)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
int i, port;
@@ -3215,7 +3267,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
- c->tcp.refill_ts = *now;
tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) {
@@ -3226,7 +3277,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
refill_arg.ns = 1;
NS_CALL(tcp_sock_refill, &refill_arg);
- c->tcp.port_detect_ts = *now;
+ tcp_splice_timer(c);
}
return 0;
@@ -3349,47 +3400,48 @@ static int tcp_port_rebind(void *arg)
}
/**
- * tcp_timer() - Scan activity bitmap for sockets waiting for timed events
+ * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
* @c: Execution context
- * @now: Timestamp from caller
+ * @ts: Unused
*/
-void tcp_timer(struct ctx *c, struct timespec *now)
+void tcp_timer(struct ctx *c, struct timespec *ts)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
+ struct tcp_conn *conn;
- if (c->mode == MODE_PASTA) {
- if (timespec_diff_ms(now, &c->tcp.port_detect_ts) >
- PORT_DETECT_INTERVAL) {
- struct tcp_port_detect_arg detect_arg = { c, 0 };
- struct tcp_port_rebind_arg rebind_arg = { c, 0 };
-
- if (c->tcp.init_detect_ports) {
- detect_arg.detect_in_ns = 0;
- tcp_port_detect(&detect_arg);
- rebind_arg.bind_in_ns = 1;
- NS_CALL(tcp_port_rebind, &rebind_arg);
- }
+ (void)ts;
- if (c->tcp.ns_detect_ports) {
- detect_arg.detect_in_ns = 1;
- NS_CALL(tcp_port_detect, &detect_arg);
- rebind_arg.bind_in_ns = 0;
- tcp_port_rebind(&rebind_arg);
- }
+ if (c->mode == MODE_PASTA) {
+ struct tcp_port_detect_arg detect_arg = { c, 0 };
+ struct tcp_port_rebind_arg rebind_arg = { c, 0 };
+
+ if (c->tcp.init_detect_ports) {
+ detect_arg.detect_in_ns = 0;
+ tcp_port_detect(&detect_arg);
+ rebind_arg.bind_in_ns = 1;
+ NS_CALL(tcp_port_rebind, &rebind_arg);
+ }
- c->tcp.port_detect_ts = *now;
+ if (c->tcp.ns_detect_ports) {
+ detect_arg.detect_in_ns = 1;
+ NS_CALL(tcp_port_detect, &detect_arg);
+ rebind_arg.bind_in_ns = 0;
+ tcp_port_rebind(&rebind_arg);
}
+ }
- tcp_splice_timer(c, now);
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ if (conn->events == CLOSED)
+ tcp_conn_destroy(c, conn);
}
- if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
- tcp_sock_refill(&refill_arg);
- if (c->mode == MODE_PASTA) {
- refill_arg.ns = 1;
- if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
- (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
- NS_CALL(tcp_sock_refill, &refill_arg);
- }
+ tcp_sock_refill(&refill_arg);
+ if (c->mode == MODE_PASTA) {
+ refill_arg.ns = 1;
+ if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
+ (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
+ NS_CALL(tcp_sock_refill, &refill_arg);
+
+ tcp_splice_timer(c);
}
}