aboutgitcodebugslistschat
path: root/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'tcp.c')
-rw-r--r--tcp.c696
1 files changed, 364 insertions, 332 deletions
diff --git a/tcp.c b/tcp.c
index fa95f6b..a0d7cd8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -190,22 +190,27 @@
* - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and
* RTO is less than this, re-initialise RTO to this for data retransmissions
*
- * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
- * with TAP_FIN_SENT event), and no ACK is received within this time, reset
- * the connection
+ * - RTT / 2 elapsed after data segment received from tap without having
+ * sent an ACK segment, or zero-sized window advertised to tap/guest (flag
+ * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
*
- * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
- * segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
- * TAP_FIN_ACKED), but no socket activity is detected from the socket within
- * this time, reset the connection
+ * RTT, here, is an approximation of the RTT value reported by the kernel via
+ * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
+ * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
*
- * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
- * either side, the connection is reset
+ * We also use a global interval timer for an activity timeout which doesn't
+ * require precision:
*
- * - ACK_INTERVAL elapsed after data segment received from tap without having
- * sent an ACK segment, or zero-sized window advertised to tap/guest (flag
- * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
+ * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire
+ * interval, close and reset it. This means that idle connections (without
+ * keepalives) will be removed between INACTIVITY_INTERVAL s and
+ * 2*INACTIVITY_INTERVAL s after the last activity.
*
+ * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an
+ * entire interval, send a tap-side keepalive. If the endpoint is no longer
+ * aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2
+ * state) that should trigger an RST, so we won't keep track of connections
+ * that the guest endpoint no longer cares about.
*
* Summary of data flows (with ESTABLISHED event)
* ----------------------------------------------
@@ -297,8 +302,6 @@
#include "ip.h"
#include "passt.h"
#include "tap.h"
-#include "siphash.h"
-#include "pcap.h"
#include "tcp_splice.h"
#include "log.h"
#include "inany.h"
@@ -341,15 +344,25 @@ enum {
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#define ACK_INTERVAL 10 /* ms */
#define RTO_INIT 1 /* s, RFC 6298 */
#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */
-#define FIN_TIMEOUT 60
-#define ACT_TIMEOUT 7200
+
+#define INACTIVITY_INTERVAL 7200 /* s */
+#define KEEPALIVE_INTERVAL 30 /* s */
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
+/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */
+#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */
+/* ...examples: 5 MB sent * 500 us RTT, 250 kB * 10 ms, 8 kB * 300 ms */
+#define SNDBUF_BOOST_FACTOR 150 /* % */
+#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */
+/* 12 MB sent * 500 us RTT, 600 kB * 10 ms, 20 kB * 300 ms */
+
+/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
+#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */
+
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@@ -401,10 +414,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
"ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED",
};
-/* Listening sockets, used for automatic port forwarding in pasta mode only */
-static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
-static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
-
/* Table of our guest side addresses with very low RTT (assumed to be local to
* the host), LRU
*/
@@ -423,11 +432,13 @@ socklen_t tcp_info_size;
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
-#define snd_wnd_cap tcp_info_cap(snd_wnd)
+#define snd_wnd_cap tcp_info_cap(snd_wnd)
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
-#define bytes_acked_cap tcp_info_cap(bytes_acked)
+#define bytes_acked_cap tcp_info_cap(bytes_acked)
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
-#define min_rtt_cap tcp_info_cap(min_rtt)
+#define min_rtt_cap tcp_info_cap(min_rtt)
+/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
+#define delivery_rate_cap tcp_info_cap(delivery_rate)
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -508,47 +519,30 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
/**
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
- * @c: Execution context
* @conn: Connection pointer
*
* Return: 0 on success, negative error code on failure (not on deletion)
*/
-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
{
- int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
- union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
- .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
- struct epoll_event ev = { .data.u64 = ref.u64 };
- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
- : c->epollfd;
+ uint32_t events;
if (conn->events == CLOSED) {
- if (flow_in_epoll(&conn->f))
- epoll_del(epollfd, conn->sock);
+ int epollfd = flow_epollfd(&conn->f);
+
+ epoll_del(epollfd, conn->sock);
if (conn->timer != -1)
epoll_del(epollfd, conn->timer);
+
return 0;
}
- ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
+ events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(epollfd, m, conn->sock, &ev))
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock,
+ !TAPSIDE(conn)) < 0)
return -errno;
- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
-
- if (conn->timer != -1) {
- union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
- .fd = conn->sock,
- .flow = FLOW_IDX(conn) };
- struct epoll_event ev_t = { .data.u64 = ref_t.u64,
- .events = EPOLLIN | EPOLLET };
-
- if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD,
- conn->timer, &ev_t))
- return -errno;
- }
-
return 0;
}
@@ -556,8 +550,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
* tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
* @c: Execution context
* @conn: Connection pointer
- *
- * #syscalls timerfd_create timerfd_settime
+ * #syscalls timerfd_create timerfd_settime|timerfd_settime32
*/
static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
@@ -567,34 +560,38 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
return;
if (conn->timer == -1) {
- union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
- .fd = conn->sock,
- .flow = FLOW_IDX(conn) };
- struct epoll_event ev = { .data.u64 = ref.u64,
- .events = EPOLLIN | EPOLLET };
- int epollfd = flow_epollfd(&conn->f);
+ union epoll_ref ref;
int fd;
fd = timerfd_create(CLOCK_MONOTONIC, 0);
- if (fd == -1 || fd > FD_REF_MAX) {
+ if (fd == -1) {
flow_dbg_perror(conn, "failed to get timer");
- if (fd > -1)
- close(fd);
- conn->timer = -1;
return;
}
- conn->timer = fd;
+ if (fd > FD_REF_MAX) {
+ flow_dbg(conn, "timer fd overflow (%d > %d)",
+ fd, FD_REF_MAX);
+ close(fd);
+ return;
+ }
- if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
- flow_dbg_perror(conn, "failed to add timer");
- close(conn->timer);
- conn->timer = -1;
+ ref.type = EPOLL_TYPE_TCP_TIMER;
+ ref.flow = FLOW_IDX(conn);
+ ref.fd = fd;
+ if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET,
+ ref) < 0) {
+ flow_dbg(conn, "failed to add timer");
+ close(fd);
return;
}
+
+ conn->timer = fd;
}
if (conn->flags & ACK_TO_TAP_DUE) {
- it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
+ it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000);
+ it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) *
+ 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
int exp = conn->retries, timeout = RTO_INIT;
if (!(conn->events & ESTABLISHED))
@@ -603,15 +600,23 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES);
timeout <<= MAX(exp, 0);
it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max);
- } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
- it.it_value.tv_sec = FIN_TIMEOUT;
} else {
- it.it_value.tv_sec = ACT_TIMEOUT;
+ /* Disarm */
+ it.it_value.tv_sec = 0;
+ it.it_value.tv_nsec = 0;
}
- flow_dbg(conn, "timer expires in %llu.%03llus",
- (unsigned long long)it.it_value.tv_sec,
- (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+ if (conn->flags & ACK_TO_TAP_DUE) {
+ flow_trace(conn, "timer expires in %llu.%02llums",
+ (unsigned long long)it.it_value.tv_sec * 1000 +
+ it.it_value.tv_nsec / 1000 / 1000,
+ (unsigned long long)it.it_value.tv_nsec
+ / 1000 / 10 % 100);
+ } else {
+ flow_dbg(conn, "timer expires in %llu.%03llus",
+ (unsigned long long)it.it_value.tv_sec,
+ (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+ }
if (timerfd_settime(conn->timer, 0, &it, NULL))
flow_perror(conn, "failed to set timer");
@@ -657,7 +662,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
}
if (flag == STALLED || flag == ~STALLED)
- tcp_epoll_ctl(c, conn);
+ tcp_epoll_ctl(conn);
if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
(flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
@@ -714,11 +719,8 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
} else {
if (event == CLOSED)
flow_hash_remove(c, TAP_SIDX(conn));
- tcp_epoll_ctl(c, conn);
+ tcp_epoll_ctl(conn);
}
-
- if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
- tcp_timer_ctl(c, conn);
}
/**
@@ -774,7 +776,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
}
/**
- * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
+ * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage)
* @conn: Connection pointer
*/
static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
@@ -789,11 +791,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
return;
}
- v = sndbuf;
- if (v >= SNDBUF_BIG)
- v /= 2;
- else if (v > SNDBUF_SMALL)
- v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
+ v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75);
SNDBUF_SET(conn, MIN(INT_MAX, v));
}
@@ -940,7 +938,6 @@ static void tcp_fill_header(struct tcphdr *th,
* tcp_fill_headers() - Fill 802.3, IP, TCP headers
* @c: Execution context
* @conn: Connection pointer
- * @taph: tap backend specific header
* @eh: Pointer to Ethernet header
* @ip4h: Pointer to IPv4 header, or NULL
* @ip6h: Pointer to IPv6 header, or NULL
@@ -949,12 +946,15 @@ static void tcp_fill_header(struct tcphdr *th,
* @ip4_check: IPv4 checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
+ *
+ * Return: frame length (including L2 headers)
*/
-void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
- struct tap_hdr *taph, struct ethhdr *eh,
- struct iphdr *ip4h, struct ipv6hdr *ip6h,
- struct tcphdr *th, struct iov_tail *payload,
- const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
+size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct ethhdr *eh,
+ struct iphdr *ip4h, struct ipv6hdr *ip6h,
+ struct tcphdr *th, struct iov_tail *payload,
+ const uint16_t *ip4_check, uint32_t seq,
+ bool no_tcp_csum)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = iov_tail_size(payload) + sizeof(*th);
@@ -1020,7 +1020,36 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
else
tcp_update_csum(psum, th, payload);
- tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+ return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN);
+}
+
+/**
+ * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning
+ * @conn: Connection pointer
+ * @tinfo: tcp_info from kernel, must be pre-fetched
+ *
+ * Return: increased sending buffer to use as a limit for advertised window
+ */
+static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn,
+ const struct tcp_info_linux *tinfo)
+{
+ unsigned long bytes_rtt_product;
+
+ if (!bytes_acked_cap)
+ return SNDBUF_GET(conn);
+
+ /* This is *not* a bandwidth-delay product, but it's somewhat related:
+ * as we send more data (usually at the beginning of a connection), we
+ * try to make the sending buffer progressively grow, with the RTT as a
+ * factor (longer delay, bigger buffer needed).
+ */
+ bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked *
+ tinfo->tcpi_rtt / 1000 / 1000;
+
+ return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product,
+ SNDBUF_BOOST_BYTES_RTT_LO,
+ SNDBUF_BOOST_BYTES_RTT_HI,
+ SNDBUF_BOOST_FACTOR);
}
/**
@@ -1031,6 +1060,8 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
* @tinfo: tcp_info from kernel, can be NULL if not pre-fetched
*
* Return: 1 if sequence or window were updated, 0 otherwise
+ *
+ * #syscalls ioctl
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo)
@@ -1041,6 +1072,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
socklen_t sl = sizeof(*tinfo);
struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
+ bool ack_everything = true;
int s = conn->sock;
/* At this point we could ack all the data we've accepted for forwarding
@@ -1050,7 +1082,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
* control behaviour.
*
* For it to be possible and worth it we need:
- * - The TCP_INFO Linux extension which gives us the peer acked bytes
+ * - The TCP_INFO Linux extensions which give us the peer acked bytes
+ * and the delivery rate (outbound bandwidth at receiver)
* - Not to be told not to (force_seq)
* - Not half-closed in the peer->guest direction
* With no data coming from the peer, we might not get events which
@@ -1060,13 +1093,19 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
* Data goes from socket to socket, with nothing meaningfully "in
* flight".
* - Not a pseudo-local connection (e.g. to a VM on the same host)
- * - Large enough send buffer
- * In these cases, there's not enough in flight to bother.
+ * If it is, there's not enough in flight to bother.
+ * - Sending buffer significantly larger than bandwidth * delay product
+ * Meaning we're not bandwidth-bound and this is likely to be
+ * interactive traffic where we want to preserve transparent
+ * connection behaviour and latency.
+ *
+ * Otherwise, we probably want to maximise throughput, which needs
+ * sending buffer auto-tuning, triggered in turn by filling up the
+ * outbound socket queue.
*/
- if (bytes_acked_cap && !force_seq &&
+ if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
!CONN_IS_CLOSING(conn) &&
- !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn) &&
- (unsigned)SNDBUF_GET(conn) >= SNDBUF_SMALL) {
+ !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
if (!tinfo) {
tinfo = &tinfo_new;
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
@@ -1075,14 +1114,24 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
/* This trips a cppcheck bug in some versions, including
* cppcheck 2.18.3.
- * https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/
+ * https://trac.cppcheck.net/ticket/14191
*/
/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
- conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
- conn->seq_init_from_tap;
- } else {
+ if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
+ tinfo->tcpi_delivery_rate /
+ 1000 / 1000 *
+ SNDBUF_TO_BW_DELAY_INTERACTIVE)
+ ack_everything = false;
+ }
+
+ if (ack_everything) {
/* Fall back to acknowledging everything we got */
conn->seq_ack_to_tap = conn->seq_from_tap;
+ } else {
+ /* cppcheck bug 14191 again, see above */
+ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+ conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+ conn->seq_init_from_tap;
}
/* It's occasionally possible for us to go from using the fallback above
@@ -1113,9 +1162,54 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
+ unsigned rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
+ uint32_t sendq;
+ int limit;
+
+ if (ioctl(s, SIOCOUTQ, &sendq)) {
+ debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
+ sendq = 0;
+ }
tcp_get_sndbuf(conn);
- new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
- SNDBUF_GET(conn));
+
+ if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
+ limit = 0;
+ else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn))
+ limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq;
+ else
+ limit = SNDBUF_GET(conn) - (int)sendq;
+
+ /* If the sender uses mechanisms to prevent Silly Window
+ * Syndrome (SWS, described in RFC 813 Section 3) it's critical
+ * that, should the window ever become less than the MSS, we
+ * advertise a new value once it increases again to be above it.
+ *
+ * The mechanism to avoid SWS in the kernel is, implicitly,
+ * implemented by Nagle's algorithm (which was proposed after
+ * RFC 813).
+ *
+ * To this end, for simplicity, approximate a window value below
+ * the MSS to zero, as we already have mechanisms in place to
+ * force updates after the window becomes zero. This matches the
+ * suggestion from RFC 813, Section 4.
+ *
+ * But don't do this if, either:
+ *
+ * - there's nothing in the outbound queue: the size of the
+ * sending buffer is limiting us, and it won't increase if we
+ * don't send data, so there's no point in waiting, or
+ *
+ * - we haven't sent data in a while (somewhat arbitrarily, ten
+ * times the RTT), as that might indicate that the receiver
+ * will only process data in batches that are large enough,
+ * but we won't send enough to fill one because we're stuck
+ * with pending data in the outbound queue
+ */
+ if (limit < MSS_GET(conn) && sendq &&
+ tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
+ limit = 0;
+
+ new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
}
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
@@ -1135,6 +1229,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
conn_flag(c, conn, ACK_TO_TAP_DUE);
out:
+ /* Opportunistically store RTT approximation on valid TCP_INFO data */
+ if (tinfo)
+ RTT_SET(conn, tinfo->tcpi_rtt);
+
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
}
@@ -1256,7 +1354,8 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
th->fin = !!(flags & FIN);
if (th->ack) {
- if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
+ if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
+ conn->wnd_to_tap)
conn_flag(c, conn, ~ACK_TO_TAP_DUE);
else
conn_flag(c, conn, ACK_TO_TAP_DUE);
@@ -1290,7 +1389,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
}
/**
- * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
+ * tcp_sock_rst() - Close TCP connection forcing RST on socket side
+ * @c: Execution context
+ * @conn: Connection pointer
+ */
+static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ const struct linger linger0 = {
+ .l_onoff = 1,
+ .l_linger = 0,
+ };
+
+ /* Force RST on socket to inform the peer
+ *
+ * We do this by setting SO_LINGER with 0 timeout, which means that
+ * close() will send an RST (unless the connection is already closed in
+ * both directions).
+ */
+ if (setsockopt(conn->sock, SOL_SOCKET,
+ SO_LINGER, &linger0, sizeof(linger0)) < 0) {
+ flow_dbg_perror(conn,
+ "SO_LINGER failed, may not send RST to peer");
+ }
+
+ conn_event(c, conn, CLOSED);
+}
+
+/**
+ * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close
* @c: Execution context
* @conn: Connection pointer
*/
@@ -1299,8 +1425,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED)
return;
+ /* Send RST on tap */
tcp_send_flag(c, conn, RST);
- conn_event(c, conn, CLOSED);
+
+ tcp_sock_rst(c, conn);
}
/**
@@ -1543,7 +1671,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
ini = flow_initiate_af(flow, PIF_TAP,
af, saddr, srcport, daddr, dstport);
- if (!(tgt = flow_target(c, flow, IPPROTO_TCP)))
+ if (!(tgt = flow_target(c, flow, FWD_NO_HINT, IPPROTO_TCP)))
goto cancel;
if (flow->f.pif[TGTSIDE] != PIF_HOST) {
@@ -1592,7 +1720,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
conn->sock = s;
conn->timer = -1;
- conn->listening_sock = -1;
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) {
+ flow_perror(flow, "Can't register with epoll");
+ goto cancel;
+ }
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
@@ -1636,7 +1768,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
conn_event(c, conn, TAP_SYN_ACK_SENT);
}
- tcp_epoll_ctl(c, conn);
+ tcp_epoll_ctl(conn);
if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
socklen_t sl = sizeof(sa);
@@ -1771,7 +1903,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
return -1;
if (th->rst) {
- conn_event(c, conn, CLOSED);
+ tcp_sock_rst(c, conn);
return 1;
}
@@ -1787,6 +1919,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_send_flag(c, conn, ACK);
tcp_timer_ctl(c, conn);
+ if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
+ &((int){ 1 }), sizeof(int)))
+ flow_trace(conn, "failed to set SO_KEEPALIVE");
+
if (p->count == 1) {
tcp_tap_window_update(c, conn,
ntohs(th->window));
@@ -1913,20 +2049,17 @@ eintr:
goto eintr;
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- tcp_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK | DUP_ACK);
return p->count - idx;
}
return -1;
}
- if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
+ if (n < (int)(seq_from_tap - conn->seq_from_tap))
partial_send = 1;
- conn->seq_from_tap += n;
- tcp_send_flag(c, conn, ACK_IF_NEEDED);
- } else {
- conn->seq_from_tap += n;
- }
+
+ conn->seq_from_tap += n;
out:
if (keep != -1 || partial_send) {
@@ -2134,10 +2267,13 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
flow_trace(conn, "packet length %zu from tap", l4len);
if (th->rst) {
- conn_event(c, conn, CLOSED);
+ tcp_sock_rst(c, conn);
return 1;
}
+ conn->inactive = false;
+ conn->tap_inactive = false;
+
if (th->ack && !(conn->events & ESTABLISHED))
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
@@ -2166,7 +2302,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (th->fin) {
conn->seq_from_tap++;
- shutdown(conn->sock, SHUT_WR);
+ if (shutdown(conn->sock, SHUT_WR) < 0) {
+ flow_dbg_perror(conn, "shutdown() failed");
+ goto reset;
+ }
+
tcp_send_flag(c, conn, ACK);
conn_event(c, conn, SOCK_FIN_SENT);
@@ -2241,7 +2381,11 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
socklen_t sl;
struct tcp_info tinfo;
- shutdown(conn->sock, SHUT_WR);
+ if (shutdown(conn->sock, SHUT_WR) < 0) {
+ flow_dbg_perror(conn, "shutdown() failed");
+ goto reset;
+ }
+
conn_event(c, conn, SOCK_FIN_SENT);
tcp_send_flag(c, conn, ACK);
ack_due = 0;
@@ -2315,6 +2459,15 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
conn->sock = s;
conn->timer = -1;
conn->ws_to_tap = conn->ws_from_tap = 0;
+
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) {
+ flow_perror(flow, "Can't register with epoll");
+ conn_flag(c, conn, CLOSING);
+ FLOW_ACTIVATE(conn);
+ return;
+ }
+
conn_event(c, conn, SOCK_ACCEPTED);
hash = flow_hash_insert(c, TAP_SIDX(conn));
@@ -2341,7 +2494,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
- struct tcp_tap_conn *conn;
union sockaddr_inany sa;
socklen_t sl = sizeof(sa);
struct flowside *ini;
@@ -2357,17 +2509,14 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
if (s < 0)
goto cancel;
- conn = (struct tcp_tap_conn *)flow;
- conn->listening_sock = ref.fd;
-
tcp_sock_set_nodelay(s);
/* FIXME: If useful: when the listening port has a specific bound
* address, record that as our address, as implemented for vhost-user
* mode only, below.
*/
- ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
- NULL, ref.tcp_listen.port);
+ ini = flow_initiate_sa(flow, ref.listen.pif, &sa,
+ NULL, ref.listen.port);
if (getsockname(s, &sa.sa, &sl) ||
inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
@@ -2381,7 +2530,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
goto cancel;
}
- if (!flow_target(c, flow, IPPROTO_TCP))
+ if (!flow_target(c, flow, ref.listen.rule, IPPROTO_TCP))
goto cancel;
switch (flow->f.pif[TGTSIDE]) {
@@ -2412,7 +2561,9 @@ cancel:
* @c: Execution context
* @ref: epoll reference of timer (not connection)
*
- * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls timerfd_gettime|timerfd_gettime64
+ * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls arm:timerfd_settime64 i686:timerfd_settime64
*/
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
@@ -2450,9 +2601,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
conn_flag(c, conn, SYN_RETRIED);
tcp_timer_ctl(c, conn);
}
- } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
- flow_dbg(conn, "FIN timeout");
- tcp_rst(c, conn);
} else if (conn->retries == TCP_MAX_RETRIES) {
flow_dbg(conn, "retransmissions count exceeded");
tcp_rst(c, conn);
@@ -2469,23 +2617,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_data_from_sock(c, conn);
tcp_timer_ctl(c, conn);
}
- } else {
- struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
- struct itimerspec old = { { 0 }, { 0 } };
-
- /* Activity timeout: if it was already set, reset the
- * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE
- * or ACK_FROM_TAP_DUE, so just set the long timeout in that
- * case. This avoids having to preemptively reset the timer on
- * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
- */
- if (timerfd_settime(conn->timer, 0, &new, &old))
- flow_perror(conn, "failed to set timer");
-
- if (old.it_value.tv_sec == ACT_TIMEOUT) {
- flow_dbg(conn, "activity timeout");
- tcp_rst(c, conn);
- }
}
}
@@ -2511,6 +2642,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
+ conn->inactive = false;
+
if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
@@ -2552,65 +2685,18 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
}
/**
- * tcp_sock_init_one() - Initialise listening socket for address and port
- * @c: Execution context
- * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE)
- * @addr: Pointer to address for binding, NULL for dual stack any
- * @ifname: Name of interface to bind to, NULL if not configured
- * @port: Port, host order
- *
- * Return: fd for the new listening socket, negative error code on failure
- *
- * If pif == PIF_SPLICE, the caller must have already entered the guest ns.
- */
-static int tcp_sock_init_one(const struct ctx *c, uint8_t pif,
- const union inany_addr *addr, const char *ifname,
- in_port_t port)
-{
- union tcp_listen_epoll_ref tref = {
- .port = port,
- .pif = pif,
- };
- const struct fwd_ports *fwd;
- int s;
-
- if (pif == PIF_HOST)
- fwd = &c->tcp.fwd_in;
- else
- fwd = &c->tcp.fwd_out;
-
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname,
- port, tref.u32);
-
- if (fwd->mode == FWD_AUTO) {
- int (*socks)[IP_VERSIONS] = pif == PIF_SPLICE ?
- tcp_sock_ns : tcp_sock_init_ext;
-
- if (!addr || inany_v4(addr))
- socks[port][V4] = s < 0 ? -1 : s;
- if (!addr || !inany_v4(addr))
- socks[port][V6] = s < 0 ? -1 : s;
- }
-
- if (s < 0)
- return s;
-
- return s;
-}
-
-/**
- * tcp_sock_init() - Create listening socket for a given host ("inbound") port
+ * tcp_listen() - Create listening socket
* @c: Execution context
* @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE)
- * @addr: Pointer to address for binding, NULL if not configured
- * @ifname: Name of interface to bind to, NULL if not configured
+ * @rule: Index of relevant forwarding rule
+ * @addr: Pointer to address for binding, NULL for any
+ * @ifname: Name of interface to bind to, NULL for any
* @port: Port, host order
*
- * Return: 0 on success, negative error code on failure
+ * Return: socket fd on success, negative error code on failure
*/
-int tcp_sock_init(const struct ctx *c, uint8_t pif,
- const union inany_addr *addr, const char *ifname,
- in_port_t port)
+int tcp_listen(const struct ctx *c, uint8_t pif, unsigned rule,
+ const union inany_addr *addr, const char *ifname, in_port_t port)
{
int s;
@@ -2621,69 +2707,19 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif,
/* Restrict to v6 only */
addr = &inany_any6;
else if (inany_v4(addr))
- /* Nothing to do */
- return 0;
+ return -EAFNOSUPPORT;
}
if (!c->ifi6) {
if (!addr)
/* Restrict to v4 only */
addr = &inany_any4;
else if (!inany_v4(addr))
- /* Nothing to do */
- return 0;
- }
-
- s = tcp_sock_init_one(c, pif, addr, ifname, port);
- if (s < 0)
- return s;
- if (s > FD_REF_MAX)
- return -EIO;
-
- return 0;
-}
-
-/**
- * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections
- * @c: Execution context
- * @port: Port, host order
- */
-static void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
-{
- ASSERT(!c->no_tcp);
-
- if (!c->no_bindtodevice) {
- tcp_sock_init(c, PIF_SPLICE, NULL, "lo", port);
- return;
+ return -EAFNOSUPPORT;
}
- if (c->ifi4)
- tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback4, NULL, port);
- if (c->ifi6)
- tcp_sock_init_one(c, PIF_SPLICE, &inany_loopback6, NULL, port);
-}
-
-/**
- * tcp_ns_socks_init() - Bind sockets in namespace for outbound connections
- * @arg: Execution context
- *
- * Return: 0
- */
-/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
-static int tcp_ns_socks_init(void *arg)
-{
- const struct ctx *c = (const struct ctx *)arg;
- unsigned port;
-
- ns_enter(c);
-
- for (port = 0; port < NUM_PORTS; port++) {
- if (!bitmap_isset(c->tcp.fwd_out.map, port))
- continue;
+ s = pif_listen(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname, port, rule);
- tcp_ns_sock_init(c, port);
- }
-
- return 0;
+ return s;
}
/**
@@ -2812,7 +2848,7 @@ static void tcp_get_rto_params(struct ctx *c)
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
*
- * Return: 0, doesn't return on failure
+ * Return: 0 on success, -1 on failure
*/
int tcp_init(struct ctx *c)
{
@@ -2824,15 +2860,16 @@ int tcp_init(struct ctx *c)
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
- memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
- memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
tcp_sock_refill_init(c);
+ if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0)
+ return -1;
if (c->mode == MODE_PASTA) {
tcp_splice_init(c);
-
- NS_CALL(tcp_ns_socks_init, c);
+ if (fwd_listen_sync(c, &c->tcp.fwd_out,
+ PIF_SPLICE, IPPROTO_TCP) < 0)
+ return -1;
}
peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) &&
@@ -2842,7 +2879,7 @@ int tcp_init(struct ctx *c)
tcp_info_size = tcp_probe_tcp_info();
#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
- STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+ STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
dbg_tcpi(snd_wnd);
dbg_tcpi(bytes_acked);
dbg_tcpi(min_rtt);
@@ -2852,74 +2889,59 @@ int tcp_init(struct ctx *c)
}
/**
- * tcp_port_rebind() - Rebind ports to match forward maps
- * @c: Execution context
- * @outbound: True to remap outbound forwards, otherwise inbound
- *
- * Must be called in namespace context if @outbound is true.
+ * tcp_keepalive() - Send keepalives for connections which need it
+ * @c: Execution context
*/
-static void tcp_port_rebind(struct ctx *c, bool outbound)
+static void tcp_keepalive(struct ctx *c, const struct timespec *now)
{
- const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
- int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
- unsigned port;
+ union flow *flow;
- for (port = 0; port < NUM_PORTS; port++) {
- if (!bitmap_isset(fmap, port)) {
- if (socks[port][V4] >= 0) {
- close(socks[port][V4]);
- socks[port][V4] = -1;
- }
+ if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL)
+ return;
- if (socks[port][V6] >= 0) {
- close(socks[port][V6]);
- socks[port][V6] = -1;
- }
+ c->tcp.keepalive_run = now->tv_sec;
- continue;
- }
+ flow_foreach_of_type(flow, FLOW_TCP) {
+ struct tcp_tap_conn *conn = &flow->tcp;
- if ((c->ifi4 && socks[port][V4] == -1) ||
- (c->ifi6 && socks[port][V6] == -1)) {
- if (outbound)
- tcp_ns_sock_init(c, port);
- else
- tcp_sock_init(c, PIF_HOST, NULL, NULL, port);
+ if (conn->tap_inactive) {
+ flow_dbg(conn, "No tap activity for least %us, send keepalive",
+ KEEPALIVE_INTERVAL);
+ tcp_send_flag(c, conn, KEEPALIVE);
}
+
+ /* Ready to check for next interval */
+ conn->tap_inactive = true;
}
}
/**
- * tcp_port_rebind_outbound() - Rebind ports in namespace
- * @arg: Execution context
- *
- * Called with NS_CALL()
- *
- * Return: 0
+ * tcp_inactivity() - Scan for and close long-inactive connections
+ * @c: Execution context
*/
-static int tcp_port_rebind_outbound(void *arg)
+static void tcp_inactivity(struct ctx *c, const struct timespec *now)
{
- struct ctx *c = (struct ctx *)arg;
+ union flow *flow;
- ns_enter(c);
- tcp_port_rebind(c, true);
+ if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL)
+ return;
- return 0;
-}
+ debug("TCP inactivity scan");
+ c->tcp.inactivity_run = now->tv_sec;
-/**
- * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
- * @c: Execution context
- */
-void tcp_port_rebind_all(struct ctx *c)
-{
- ASSERT(c->mode == MODE_PASTA && !c->no_tcp);
+ flow_foreach_of_type(flow, FLOW_TCP) {
+ struct tcp_tap_conn *conn = &flow->tcp;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- NS_CALL(tcp_port_rebind_outbound, c);
+ if (conn->inactive) {
+ /* No activity in this interval, reset */
+ flow_dbg(conn, "Inactive for at least %us, resetting",
+ INACTIVITY_INTERVAL);
+ tcp_rst(c, conn);
+ }
- if (c->tcp.fwd_in.mode == FWD_AUTO)
- tcp_port_rebind(c, false);
+ /* Ready to check for next interval */
+ conn->inactive = true;
+ }
}
/**
@@ -2927,13 +2949,14 @@ void tcp_port_rebind_all(struct ctx *c)
* @c: Execution context
* @now: Current timestamp
*/
-void tcp_timer(const struct ctx *c, const struct timespec *now)
+void tcp_timer(struct ctx *c, const struct timespec *now)
{
- (void)now;
-
tcp_sock_refill_init(c);
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
+
+ tcp_keepalive(c, now);
+ tcp_inactivity(c, now);
}
/**
@@ -3420,7 +3443,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
}
/**
- * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * tcp_flow_migrate_source() - Send data (flow table) for flow
* @fd: Descriptor for state migration
* @conn: Pointer to the TCP connection structure
*
@@ -3460,9 +3483,6 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
return rc;
}
- if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
- close(conn->listening_sock);
-
return 0;
}
@@ -3671,9 +3691,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
return rc;
}
- flow_epollid_clear(&conn->f);
conn->timer = -1;
- conn->listening_sock = -1;
return 0;
}
@@ -3731,14 +3749,19 @@ int tcp_flow_migrate_target(struct ctx *c, int fd)
if ((rc = tcp_flow_repair_socket(c, conn))) {
flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
- /* Can't leave the flow in an incomplete state */
- FLOW_ACTIVATE(conn);
- return 0;
+ goto out;
}
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->sock,
+ !TAPSIDE(conn)))
+ goto out; /* tcp_flow_migrate_target_ext() will clean this up */
+
flow_hash_insert(c, TAP_SIDX(conn));
- FLOW_ACTIVATE(conn);
+out:
+ /* Never leave the flow in an incomplete state */
+ FLOW_ACTIVATE(conn);
return 0;
}
@@ -3862,10 +3885,15 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
int v;
v = TCP_SEND_QUEUE;
- if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
flow_perror(conn, "Selecting repair queue");
- else
- shutdown(s, SHUT_WR);
+ } else {
+ if (shutdown(s, SHUT_WR) < 0) {
+ flow_perror(conn,
+ "Repair mode shutdown() failed");
+ goto fail;
+ }
+ }
}
if (tcp_flow_repair_wnd(conn, &t))
@@ -3892,8 +3920,12 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
* Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
* TCP_FIN_WAIT1.
*/
- if (t.tcpi_state == TCP_FIN_WAIT1)
- shutdown(s, SHUT_WR);
+ if (t.tcpi_state == TCP_FIN_WAIT1) {
+ if (shutdown(s, SHUT_WR) < 0) {
+ flow_perror(conn, "Post-repair shutdown() failed");
+ goto fail;
+ }
+ }
if (tcp_set_peek_offset(conn, peek_offset))
goto fail;
@@ -3901,7 +3933,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
tcp_send_flag(c, conn, ACK);
tcp_data_from_sock(c, conn);
- if ((rc = tcp_epoll_ctl(c, conn))) {
+ if ((rc = tcp_epoll_ctl(conn))) {
flow_dbg(conn,
"Failed to subscribe to epoll for migrated socket: %s",
strerror_(-rc));