aboutgitcodebugslistschat
path: root/tcp.c
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2022-03-25 13:02:47 +0100
committerStefano Brivio <sbrivio@redhat.com>2022-03-29 15:35:38 +0200
commitbb708111833e23cafda1a5dd377e13400fa1e452 (patch)
tree253e39ce109dc80e3ab13b2773212a460e4b5234 /tcp.c
parent3e4c2d10985027775683255e5d5fef5149ef8e0b (diff)
downloadpasst-bb708111833e23cafda1a5dd377e13400fa1e452.tar
passt-bb708111833e23cafda1a5dd377e13400fa1e452.tar.gz
passt-bb708111833e23cafda1a5dd377e13400fa1e452.tar.bz2
passt-bb708111833e23cafda1a5dd377e13400fa1e452.tar.lz
passt-bb708111833e23cafda1a5dd377e13400fa1e452.tar.xz
passt-bb708111833e23cafda1a5dd377e13400fa1e452.tar.zst
passt-bb708111833e23cafda1a5dd377e13400fa1e452.zip
treewide: Packet abstraction with mandatory boundary checks
Implement a packet abstraction providing boundary and size checks based on packet descriptors: packets stored in a buffer can be queued into a pool (without storage of its own), and data can be retrieved referring to an index in the pool, specifying offset and length. Checks ensure data is not read outside the boundaries of buffer and descriptors, and that packets added to a pool are within the buffer range with valid offset and indices. This implies a wider rework: usage of the "queueing" part of the abstraction mostly affects tap_handler_{passt,pasta}() functions and their callees, while the "fetching" part affects all the guest or tap facing implementations: TCP, UDP, ICMP, ARP, NDP, DHCP and DHCPv6 handlers. Suggested-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'tcp.c')
-rw-r--r--tcp.c442
1 files changed, 247 insertions, 195 deletions
diff --git a/tcp.c b/tcp.c
index 802d6d8..e03561a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -303,6 +303,9 @@
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
+#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
+
#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
@@ -440,6 +443,7 @@ struct tcp_conn {
#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1)
#define TCP_WS_BITS 4 /* RFC 7323 */
+#define TCP_WS_MAX 14
unsigned int ws_from_tap :TCP_WS_BITS;
unsigned int ws_to_tap :TCP_WS_BITS;
@@ -476,7 +480,6 @@ struct tcp_conn {
uint32_t seq_init_from_tap;
};
-#define CONN_IS_CLOSED(conn) (conn->events == CLOSED)
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -699,7 +702,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLET;
if (conn_flags & STALLED)
- return EPOLLIN | EPOLLRDHUP | EPOLLET;
+ return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
return EPOLLIN | EPOLLRDHUP;
}
@@ -733,8 +736,11 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
.r.p.tcp.tcp.v6 = CONN_V6(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 };
- if (CONN_IS_CLOSED(conn)) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ if (conn->events == CLOSED) {
+ if (conn->flags & IN_EPOLL)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ if (conn->timer != -1)
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
return 0;
}
@@ -745,6 +751,18 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn)
conn->flags |= IN_EPOLL; /* No need to log this */
+ if (conn->timer != -1) {
+ union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
+ .r.s = conn->sock,
+ .r.p.tcp.tcp.timer = 1,
+ .r.p.tcp.tcp.index = conn - tc };
+ struct epoll_event ev_t = { .data.u64 = ref_t.u64,
+ .events = EPOLLIN | EPOLLET };
+
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ return -errno;
+ }
+
return 0;
}
@@ -759,6 +777,9 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
{
struct itimerspec it = { { 0 }, { 0 } };
+ if (conn->events == CLOSED)
+ return;
+
if (conn->timer == -1) {
union epoll_ref ref = { .r.proto = IPPROTO_TCP,
.r.s = conn->sock,
@@ -783,15 +804,11 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn)
}
}
- if (conn->events == CLOSED) {
- it.it_value.tv_nsec = 1;
- } else if (conn->flags & ACK_TO_TAP_DUE) {
+ if (conn->flags & ACK_TO_TAP_DUE) {
it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED))
it.it_value.tv_sec = SYN_TIMEOUT;
- else if (conn->events & TAP_FIN_SENT)
- it.it_value.tv_sec = FIN_TIMEOUT;
else
it.it_value.tv_sec = ACK_TIMEOUT;
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
@@ -834,7 +851,9 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn,
if (flag == STALLED || flag == ~STALLED)
tcp_epoll_ctl(c, conn);
- if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE)
+ if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
+ (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
+ (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
tcp_timer_ctl(c, conn);
}
@@ -888,7 +907,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn,
else
tcp_epoll_ctl(c, conn);
- if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
+ if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn);
}
@@ -1182,36 +1201,32 @@ static void tcp_sock6_iov_init(void)
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
- * @th: Pointer to TCP header
- * @len: Length of buffer, including TCP header
+ * @opts: Pointer to start of TCP options in header
+ * @len: Length of buffer, excluding TCP header -- NOT checked here!
* @type_find: Option type to look for
* @optlen_set: Optional, filled with option length if passed
* @value_set: Optional, set to start of option value if passed
*
* Return: option value, meaningful for up to 4 bytes, -1 if not found
*/
-static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
+static int tcp_opt_get(char *opts, size_t len, uint8_t type_find,
uint8_t *optlen_set, char **value_set)
{
uint8_t type, optlen;
- char *p;
-
- if (len > (size_t)th->doff * 4)
- len = (size_t)th->doff * 4;
- len -= sizeof(*th);
- p = (char *)(th + 1);
+ if (!len)
+ return -1;
- for (; len >= 2; p += optlen, len -= optlen) {
- switch (*p) {
+ for (; len >= 2; opts += optlen, len -= optlen) {
+ switch (*opts) {
case OPT_EOL:
return -1;
case OPT_NOP:
optlen = 1;
break;
default:
- type = *(p++);
- optlen = *(p++) - 2;
+ type = *(opts++);
+ optlen = *(opts++) - 2;
len -= 2;
if (type != type_find)
@@ -1220,17 +1235,17 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find,
if (optlen_set)
*optlen_set = optlen;
if (value_set)
- *value_set = p;
+ *value_set = opts;
switch (optlen) {
case 0:
return 0;
case 1:
- return *p;
+ return *opts;
case 2:
- return ntohs(*(uint16_t *)p);
+ return ntohs(*(uint16_t *)opts);
default:
- return ntohl(*(uint32_t *)p);
+ return ntohl(*(uint32_t *)opts);
}
}
}
@@ -1415,12 +1430,12 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
if ((hole - tc) == --c->tcp.conn_count) {
debug("TCP: hash table compaction: index %i (%p) was max index",
hole - tc, hole);
+ memset(hole, 0, sizeof(*hole));
return;
}
from = CONN(c->tcp.conn_count);
memcpy(hole, from, sizeof(*hole));
- from->flags = from->events = 0;
to = hole;
tcp_hash_update(from, to);
@@ -1430,25 +1445,23 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
debug("TCP: hash table compaction: old index %i, new index %i, "
"sock %i, from: %p, to: %p",
from - tc, to - tc, from->sock, from, to);
+
+ memset(from, 0, sizeof(*from));
}
/**
- * tcp_conn_destroy() - Close connection, drop from epoll file descriptor
+ * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction
* @c: Execution context
* @conn: Connection pointer
*/
static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn)
{
- if (CONN_IS_CLOSED(conn))
- return;
-
- conn_event(c, conn, CLOSED);
- conn->flags = 0;
close(conn->sock);
+ if (conn->timer != -1)
+ close(conn->timer);
- /* Removal from hash table and connection table compaction deferred to
- * timer.
- */
+ tcp_hash_remove(conn);
+ tcp_table_compact(c, conn);
}
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn);
@@ -1582,9 +1595,23 @@ static void tcp_l2_data_buf_flush(struct ctx *c)
*/
void tcp_defer_handler(struct ctx *c)
{
+ int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
+ int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
+ struct tcp_conn *conn;
+
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
+
tcp_splice_defer_handler(c);
+
+ if (c->tcp.conn_count < MIN(max_files, max_conns))
+ return;
+
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ if (conn->events == CLOSED)
+ tcp_conn_destroy(c, conn);
+ }
+
}
/**
@@ -1605,13 +1632,19 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn,
size_t ip_len, eth_len;
#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \
- do { \
- b->th.source = htons(conn->sock_port); \
- b->th.dest = htons(conn->tap_port); \
- b->th.seq = htonl(seq); \
- b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
- b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \
- } while (0)
+do { \
+ b->th.source = htons(conn->sock_port); \
+ b->th.dest = htons(conn->tap_port); \
+ b->th.seq = htonl(seq); \
+ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
+ if (conn->events & ESTABLISHED) { \
+ b->th.window = htons(conn->wnd_to_tap); \
+ } else { \
+ unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \
+ \
+ b->th.window = htons(MIN(wnd, USHRT_MAX)); \
+ } \
+} while (0)
if (CONN_V6(conn)) {
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
@@ -1692,7 +1725,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
conn->seq_ack_to_tap = prev_ack_to_tap;
#else
if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
- || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) {
+ || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
conn->seq_ack_to_tap = conn->seq_from_tap;
} else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
if (!tinfo) {
@@ -1712,17 +1745,19 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
if (!KERNEL_REPORTS_SND_WND(c)) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
- conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap;
+ conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
+ USHRT_MAX);
goto out;
}
if (!tinfo) {
- if (prev_wnd_to_tap > WINDOW_DEFAULT)
+ if (prev_wnd_to_tap > WINDOW_DEFAULT) {
goto out;
-
+}
tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
goto out;
+}
}
#ifdef HAS_SND_WND
@@ -1735,10 +1770,15 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn,
}
#endif
- conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap;
+ new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
+ if (!(conn->events & ESTABLISHED))
+ new_wnd_to_tap = MAX(new_wnd_to_tap, WINDOW_DEFAULT);
+
+ conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, USHRT_MAX);
if (!conn->wnd_to_tap)
conn_flag(c, conn, ACK_TO_TAP_DUE);
+
out:
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
@@ -1772,10 +1812,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
return 0;
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return -ECONNRESET;
}
+#ifdef HAS_SND_WND
+ if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
+ c->tcp.kernel_snd_wnd = 1;
+#endif
+
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
@@ -1825,11 +1870,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
data += OPT_MSS_LEN - 2;
th->doff += OPT_MSS_LEN / 4;
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
*data++ = OPT_NOP;
@@ -1854,10 +1894,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
NULL, conn->seq_to_tap);
iov->iov_len = eth_len + sizeof(uint32_t);
- /* First value is not scaled: scale now */
- if (flags & SYN)
- conn->wnd_to_tap >>= conn->ws_to_tap;
-
if (CONN_V4(conn))
tcp4_l2_flags_buf_bytes += iov->iov_len;
else
@@ -1905,68 +1941,55 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags)
*/
static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn)
{
- if (CONN_IS_CLOSED(conn))
+ if (conn->events == CLOSED)
return;
if (!tcp_send_flag(c, conn, RST))
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
}
/**
- * tcp_clamp_window() - Set window and scaling from option, clamp on socket
+ * tcp_get_tap_ws() - Get Window Scaling option for connection from tap/guest
* @conn: Connection pointer
- * @th: TCP header, from tap, can be NULL if window is passed
- * @len: Buffer length, at L4, can be 0 if no header is passed
- * @window: Window value, host order, unscaled, if no header is passed
- * @init: Set if this is the very first segment from tap
- */
-static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, int len, unsigned int window,
- int init)
+ * @opts: Pointer to start of TCP options
+ * @optlen: Bytes in options: caller MUST ensure available length
+ */
+static void tcp_get_tap_ws(struct tcp_conn *conn, char *opts, size_t optlen)
{
- if (init && th) {
- int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
+ int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL);
- conn->ws_from_tap = ws & 0xf;
+ if (ws >= 0 && ws <= TCP_WS_MAX)
+ conn->ws_from_tap = ws;
+ else
+ conn->ws_from_tap = 0;
+}
- /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
- * yet, to avoid getting a zero scale just because we set a
- * small window now.
- */
- conn->wnd_from_tap = ntohs(th->window);
- } else {
- uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+/**
+ * tcp_clamp_window() - Set new window for connection, clamp on socket
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @window: Window value, host order, unscaled
+ */
+static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, unsigned wnd)
+{
+ uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap;
- if (th)
- window = ntohs(th->window) << conn->ws_from_tap;
- else
- window <<= conn->ws_from_tap;
-
- window = MIN(MAX_WINDOW, window);
-
- if (conn->flags & WND_CLAMPED) {
- if (prev_scaled == window)
- return;
-
- /* Discard +/- 1% updates to spare some syscalls. */
- if ((window > prev_scaled &&
- window * 99 / 100 < prev_scaled) ||
- (window < prev_scaled &&
- window * 101 / 100 > prev_scaled)) {
- conn->wnd_from_tap = window >>
- conn->ws_from_tap;
- return;
- }
- }
+ wnd <<= conn->ws_from_tap;
+ wnd = MIN(MAX_WINDOW, wnd);
- if (window < 256)
- window = 256;
+ if (conn->flags & WND_CLAMPED) {
+ if (prev_scaled == wnd)
+ return;
- conn->wnd_from_tap = window >> conn->ws_from_tap;
- setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
- &window, sizeof(window));
- conn_flag(c, conn, WND_CLAMPED);
+ /* Discard +/- 1% updates to spare some syscalls. */
+ if ((wnd > prev_scaled && wnd * 99 / 100 < prev_scaled) ||
+ (wnd < prev_scaled && wnd * 101 / 100 > prev_scaled))
+ return;
}
+
+ conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
+ setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &wnd, sizeof(wnd));
+ conn_flag(c, conn, WND_CLAMPED);
}
/**
@@ -2059,18 +2082,18 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af)
* tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest
* @c: Execution context
* @conn: Connection pointer
- * @th: TCP header send by tap/guest
- * @len: L4 packet length, host order
+ * @opts: Pointer to start of TCP options
+ * @optlen: Bytes in options: caller MUST ensure available length
*
* Return: clamped MSS value
*/
static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, size_t len)
+ char *opts, size_t optlen)
{
unsigned int mss;
int ret;
- if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0)
+ if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT;
else
mss = ret;
@@ -2091,12 +2114,13 @@ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Remote address, pointer to sin_addr or sin6_addr
- * @th: TCP header from tap
- * @len: Packet length at L4
+ * @th: TCP header from tap: caller MUST ensure it's there
+ * @opts: Pointer to start of options
+ * @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
- struct tcphdr *th, size_t len,
+ struct tcphdr *th, char *opts, size_t optlen,
struct timespec *now)
{
struct sockaddr_in addr4 = {
@@ -2142,16 +2166,21 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
conn = CONN(c->tcp.conn_count++);
conn->sock = s;
conn->timer = -1;
- conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
- mss = tcp_conn_tap_mss(c, conn, th, len);
+ mss = tcp_conn_tap_mss(c, conn, opts, optlen);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
MSS_SET(conn, mss);
- tcp_clamp_window(c, conn, th, len, 0, 1);
+ tcp_get_tap_ws(conn, opts, optlen);
+
+ /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp yet, to
+ * avoid getting a zero scale just because we set a small window now.
+ */
+ if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
+ conn->wnd_from_tap = 1;
if (af == AF_INET) {
sa = (struct sockaddr *)&addr4;
@@ -2395,53 +2424,52 @@ zero_len:
}
/**
- * tcp_data_from_tap() - tap data for established connection
+ * tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
* @conn: Connection pointer
- * @msg: Array of messages from tap
- * @count: Count of messages
+ * @p: Pool of TCP packets, with TCP headers
*
* #syscalls sendmsg
*/
static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
- struct tap_l4_msg *msg, int count)
+ struct pool *p)
{
- int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
+ int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
uint32_t max_ack_seq = conn->seq_ack_from_tap;
uint32_t seq_from_tap = conn->seq_from_tap;
struct msghdr mh = { .msg_iov = tcp_iov };
- int partial_send = 0;
- uint16_t len;
+ size_t len;
ssize_t n;
- for (i = 0, iov_i = 0; i < count; i++) {
+ for (i = 0, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
struct tcphdr *th;
char *data;
size_t off;
- th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset);
- len = msg[i].l4_len;
-
- if (len < sizeof(*th)) {
+ packet_get(p, i, 0, 0, &len);
+ th = packet_get(p, i, 0, sizeof(*th), NULL);
+ if (!th) {
tcp_rst(c, conn);
return;
}
- off = (size_t)th->doff * 4;
+ off = th->doff * 4UL;
if (off < sizeof(*th) || off > len) {
tcp_rst(c, conn);
return;
}
if (th->rst) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return;
}
len -= off;
- data = (char *)th + off;
+ data = packet_get(p, i, off, len, NULL);
+ if (!data)
+ continue;
seq = ntohl(th->seq);
ack_seq = ntohl(th->ack_seq);
@@ -2511,7 +2539,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn,
i = keep - 1;
}
- tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0);
+ tcp_clamp_window(c, conn, max_ack_seq_wnd);
if (ack) {
if (max_ack_seq == conn->seq_to_tap) {
@@ -2595,14 +2623,22 @@ out:
* tcp_conn_from_sock_finish() - Complete connection setup after connect()
* @c: Execution context
* @conn: Connection pointer
- * @th: TCP header of SYN, ACK segment from tap/guest
- * @len: Packet length of SYN, ACK segment at L4, host order
+ * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there
+ * @opts: Pointer to start of options
+ * @optlen: Bytes in options: caller MUST ensure available length
*/
static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
- struct tcphdr *th, size_t len)
+ struct tcphdr *th,
+ char *opts, size_t optlen)
{
- tcp_clamp_window(c, conn, th, len, 0, 1);
- MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len));
+ tcp_clamp_window(c, conn, ntohs(th->window));
+ tcp_get_tap_ws(conn, opts, optlen);
+
+ /* First value is not scaled */
+ if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
+ conn->wnd_from_tap = 1;
+
+ MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap;
@@ -2622,32 +2658,42 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn,
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @addr: Destination address
- * @msg: Input messages
- * @count: Message count
+ * @p: Pool of TCP packets, with TCP headers
* @now: Current timestamp
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, int af, void *addr,
- struct tap_l4_msg *msg, int count, struct timespec *now)
+int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p,
+ struct timespec *now)
{
- struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset);
- uint16_t len = msg[0].l4_len;
struct tcp_conn *conn;
+ size_t optlen, len;
+ struct tcphdr *th;
int ack_due = 0;
+ char *opts;
+
+ packet_get(p, 0, 0, 0, &len);
+ th = packet_get(p, 0, 0, sizeof(*th), NULL);
+ if (!th)
+ return 1;
+
+ optlen = th->doff * 4UL - sizeof(*th);
+ opts = packet_get(p, 0, sizeof(*th), optlen, NULL);
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
/* New connection from tap */
if (!conn) {
if (th->syn && !th->ack)
- tcp_conn_from_tap(c, af, addr, th, len, now);
+ tcp_conn_from_tap(c, af, addr, th, opts, optlen, now);
return 1;
}
+ trace("TCP: packet length %lu from tap for index %lu", len, conn - tc);
+
if (th->rst) {
- tcp_conn_destroy(c, conn);
- return count;
+ conn_event(c, conn, CLOSED);
+ return p->count;
}
if (th->ack) {
@@ -2660,7 +2706,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
/* Establishing connection from socket */
if (conn->events & SOCK_ACCEPTED) {
if (th->syn && th->ack && !th->fin)
- tcp_conn_from_sock_finish(c, conn, th, len);
+ tcp_conn_from_sock_finish(c, conn, th, opts, optlen);
else
tcp_rst(c, conn);
@@ -2671,7 +2717,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_SYN_RCVD) {
if (!(conn->events & TAP_SYN_ACK_SENT)) {
tcp_rst(c, conn);
- return count;
+ return p->count;
}
conn_event(c, conn, ESTABLISHED);
@@ -2683,17 +2729,19 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
tcp_send_flag(c, conn, ACK);
conn_event(c, conn, SOCK_FIN_SENT);
- return count;
+ return p->count;
}
if (!th->ack) {
tcp_rst(c, conn);
- return count;
+ return p->count;
}
- tcp_clamp_window(c, conn, th, len, 0, 0);
+ tcp_clamp_window(c, conn, ntohs(th->window));
+
+ tcp_data_from_sock(c, conn);
- if (count == 1)
+ if (p->count == 1)
return 1;
}
@@ -2701,13 +2749,13 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->events & TAP_FIN_RCVD) {
if (conn->events & SOCK_FIN_RCVD &&
conn->seq_ack_from_tap == conn->seq_to_tap)
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return 1;
}
/* Established connections accepting data from tap */
- tcp_data_from_tap(c, conn, msg, count);
+ tcp_data_from_tap(c, conn, p);
if (conn->seq_ack_to_tap != conn->seq_from_tap)
ack_due = 1;
@@ -2721,7 +2769,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (ack_due)
conn_flag(c, conn, ACK_TO_TAP_DUE);
- return count;
+ return p->count;
}
/**
@@ -2872,7 +2920,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
if (!(conn->events & ESTABLISHED)) {
debug("TCP: index %i, handshake timeout", conn - tc);
tcp_rst(c, conn);
- } else if (conn->events & TAP_FIN_SENT) {
+ } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
debug("TCP: index %i, FIN timeout", conn - tc);
tcp_rst(c, conn);
} else if (conn->retrans == TCP_MAX_RETRANS) {
@@ -2884,6 +2932,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2933,19 +2982,22 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index)))
return;
+ if (conn->events == CLOSED)
+ return;
+
if (events & EPOLLERR) {
tcp_rst(c, conn);
return;
}
if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
return;
}
if (conn->events & ESTABLISHED) {
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
- tcp_conn_destroy(c, conn);
+ conn_event(c, conn, CLOSED);
if (events & (EPOLLRDHUP | EPOLLHUP))
conn_event(c, conn, SOCK_FIN_RCVD);
@@ -3159,7 +3211,7 @@ static int tcp_sock_refill(void *arg)
*
* Return: 0 on success, -1 on failure
*/
-int tcp_sock_init(struct ctx *c, struct timespec *now)
+int tcp_sock_init(struct ctx *c)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
int i, port;
@@ -3215,7 +3267,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
- c->tcp.refill_ts = *now;
tcp_sock_refill(&refill_arg);
if (c->mode == MODE_PASTA) {
@@ -3226,7 +3277,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now)
refill_arg.ns = 1;
NS_CALL(tcp_sock_refill, &refill_arg);
- c->tcp.port_detect_ts = *now;
+ tcp_splice_timer(c);
}
return 0;
@@ -3349,47 +3400,48 @@ static int tcp_port_rebind(void *arg)
}
/**
- * tcp_timer() - Scan activity bitmap for sockets waiting for timed events
+ * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill
* @c: Execution context
- * @now: Timestamp from caller
+ * @ts: Unused
*/
-void tcp_timer(struct ctx *c, struct timespec *now)
+void tcp_timer(struct ctx *c, struct timespec *ts)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
+ struct tcp_conn *conn;
- if (c->mode == MODE_PASTA) {
- if (timespec_diff_ms(now, &c->tcp.port_detect_ts) >
- PORT_DETECT_INTERVAL) {
- struct tcp_port_detect_arg detect_arg = { c, 0 };
- struct tcp_port_rebind_arg rebind_arg = { c, 0 };
-
- if (c->tcp.init_detect_ports) {
- detect_arg.detect_in_ns = 0;
- tcp_port_detect(&detect_arg);
- rebind_arg.bind_in_ns = 1;
- NS_CALL(tcp_port_rebind, &rebind_arg);
- }
+ (void)ts;
- if (c->tcp.ns_detect_ports) {
- detect_arg.detect_in_ns = 1;
- NS_CALL(tcp_port_detect, &detect_arg);
- rebind_arg.bind_in_ns = 0;
- tcp_port_rebind(&rebind_arg);
- }
+ if (c->mode == MODE_PASTA) {
+ struct tcp_port_detect_arg detect_arg = { c, 0 };
+ struct tcp_port_rebind_arg rebind_arg = { c, 0 };
+
+ if (c->tcp.init_detect_ports) {
+ detect_arg.detect_in_ns = 0;
+ tcp_port_detect(&detect_arg);
+ rebind_arg.bind_in_ns = 1;
+ NS_CALL(tcp_port_rebind, &rebind_arg);
+ }
- c->tcp.port_detect_ts = *now;
+ if (c->tcp.ns_detect_ports) {
+ detect_arg.detect_in_ns = 1;
+ NS_CALL(tcp_port_detect, &detect_arg);
+ rebind_arg.bind_in_ns = 0;
+ tcp_port_rebind(&rebind_arg);
}
+ }
- tcp_splice_timer(c, now);
+ for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) {
+ if (conn->events == CLOSED)
+ tcp_conn_destroy(c, conn);
}
- if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) {
- tcp_sock_refill(&refill_arg);
- if (c->mode == MODE_PASTA) {
- refill_arg.ns = 1;
- if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
- (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
- NS_CALL(tcp_sock_refill, &refill_arg);
- }
+ tcp_sock_refill(&refill_arg);
+ if (c->mode == MODE_PASTA) {
+ refill_arg.ns = 1;
+ if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
+ (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
+ NS_CALL(tcp_sock_refill, &refill_arg);
+
+ tcp_splice_timer(c);
}
}