Diffstat (limited to 'tcp.c'):
 tcp.c | 2123
 1 file changed, 1779 insertions(+), 344 deletions(-)
diff --git a/tcp.c b/tcp.c
index af6bd95..e7fa85f 100644
--- a/tcp.c
+++ b/tcp.c
@@ -179,14 +179,16 @@
*
* Timeouts are implemented by means of timerfd timers, set based on flags:
*
- * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
- * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
- * connection
+ * - RTO_INIT: if no ACK segment was received from tap/guest, either during
+ * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after
+ * sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data
+ * from the socket and reset sequence to what was acknowledged. This is the
+ * timeout for the first retry, in seconds. Retry TCP_MAX_RETRIES times for
+ * established connections, or (syn_retries + syn_linear_timeouts) times
+ * during the handshake, then reset the connection
*
- * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
- * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
- * socket and reset sequence to what was acknowledged. If this persists for
- * more than TCP_MAX_RETRANS times in a row, reset the connection
+ * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and
+ * RTO is less than this, re-initialise RTO to this for data retransmissions
*
* - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
* with TAP_FIN_SENT event), and no ACK is received within this time, reset
@@ -200,9 +202,13 @@
* - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
* either side, the connection is reset
*
- * - ACK_INTERVAL elapsed after data segment received from tap without having
+ * - RTT / 2 elapsed after data segment received from tap without having
* sent an ACK segment, or zero-sized window advertised to tap/guest (flag
- * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
+ * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
+ *
+ * RTT, here, is an approximation of the RTT value reported by the kernel via
+ * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
+ * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
*
*
* Summary of data flows (with ESTABLISHED event)
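[Illustration, not part of the patch] With the RTO_INIT of 1 s and the default rto_max of 120 s introduced later in this diff, the exponential backoff described above gives the schedule below for an established connection; during the handshake, the first syn_linear_timeouts retries keep the initial timeout before doubling starts. A minimal, self-contained sketch:

#include <stdio.h>

int main(void)
{
	const int rto_init = 1, rto_max = 120;	/* RTO_INIT, default rto_max (s) */
	int retries;

	for (retries = 0; retries < 10; retries++) {
		int timeout = rto_init << retries;	/* double on every retry */

		if (timeout > rto_max)			/* clamp to rto_max */
			timeout = rto_max;

		printf("retry %d: timeout %d s\n", retries, timeout);
	}

	return 0;	/* prints 1, 2, 4, 8, 16, 32, 64, 120, 120, 120 */
}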
@@ -279,7 +285,7 @@
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
-#include <sys/epoll.h>
+#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
@@ -287,6 +293,8 @@
#include <time.h>
#include <arpa/inet.h>
+#include <linux/sockios.h>
+
#include "checksum.h"
#include "util.h"
#include "iov.h"
@@ -299,26 +307,62 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "repair.h"
#include "linux_dep.h"
#include "flow_table.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
+#include "epoll_ctl.h"
+
+/*
+ * The size of the TCP header (including options) is given by doff (Data
+ * Offset), a 4-bit field specifying the number of 32-bit words in the header.
+ * The maximum value of doff is 15 [(1 << 4) - 1].
+ * The maximum length of options, in bytes, is therefore 15 minus the number
+ * of 32-bit words in the minimal TCP header (5), multiplied by the length of
+ * a 32-bit word (4).
+ */
+#define OPTLEN_MAX (((1UL << 4) - 1 - 5) * 4UL)
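[Illustration, not part of the patch] A quick check of the arithmetic above:

_Static_assert(OPTLEN_MAX == 40,
	       "TCP option space is (15 - 5) * 4 = 40 bytes");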
+
+#ifndef __USE_MISC
+/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
+struct tcp_repair_opt {
+ __u32 opt_code;
+ __u32 opt_val;
+};
+
+enum {
+ TCP_NO_QUEUE,
+ TCP_RECV_QUEUE,
+ TCP_SEND_QUEUE,
+ TCP_QUEUES_NR,
+};
+#endif
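[Sketch, not part of the patch] The fallback definitions above mirror the Linux TCP repair UAPI used by the migration code in this patch. Assuming a socket already switched to repair mode with the TCP_REPAIR socket option, the queue selectors are typically used as below to dump pending send-queue data (error handling omitted; the helper name is hypothetical):

#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t dump_send_queue(int s, void *buf, size_t len)
{
	int q = TCP_SEND_QUEUE;

	/* Select the send queue: recv() then reads data still queued there */
	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q)))
		return -1;

	/* Peek without dequeuing, so the source socket stays usable */
	return recv(s, buf, len, MSG_PEEK | MSG_DONTWAIT);
}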
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#define ACK_INTERVAL 10 /* ms */
-#define SYN_TIMEOUT 10 /* s */
-#define ACK_TIMEOUT 2
+#define RTO_INIT 1 /* s, RFC 6298 */
+#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */
#define FIN_TIMEOUT 60
#define ACT_TIMEOUT 7200
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
+/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */
+#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */
+/* ...examples: 5 MB sent * 500 us RTT, 250 kB * 10 ms, 8 kB * 300 ms */
+#define SNDBUF_BOOST_FACTOR 150 /* % */
+#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */
+/* 12 MB sent * 500 us RTT, 600 kB * 10 ms, 20 kB * 300 ms */
+
+/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
+#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */
+
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@@ -326,6 +370,28 @@
((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
+/* Buffers to migrate pending data from send and receive queues. No, they don't
+ * use memory if we don't use them. And we're going away after this, so splurge.
+ */
+#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20)
+#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20)
+uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX];
+uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
+
+#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */
+
+#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries"
+#define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts"
+#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms"
+
+#define SYN_RETRIES_DEFAULT 6
+#define SYN_LINEAR_TIMEOUTS_DEFAULT 4
+#define RTO_MAX_DEFAULT 120 /* s */
+#define MAX_SYNCNT 127 /* derived from kernel's limit */
+
+/* "Extended" data (not stored in the flow table) for TCP flow migration */
+static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
+
static const char *tcp_event_str[] __attribute((__unused__)) = {
"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
@@ -338,14 +404,14 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
"SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */
/* Passive close: */
- "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
+ "CLOSE_WAIT", "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK",
/* Active close (+5): */
"CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
};
static const char *tcp_flag_str[] __attribute((__unused__)) = {
"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
- "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS",
+ "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED",
};
/* Listening sockets, used for automatic port forwarding in pasta mode only */
@@ -357,7 +423,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-char tcp_buf_discard [MAX_WINDOW];
+char tcp_buf_discard [BUF_DISCARD_SIZE];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
@@ -370,11 +436,13 @@ socklen_t tcp_info_size;
sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
-#define snd_wnd_cap tcp_info_cap(snd_wnd)
+#define snd_wnd_cap tcp_info_cap(snd_wnd)
/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
-#define bytes_acked_cap tcp_info_cap(bytes_acked)
+#define bytes_acked_cap tcp_info_cap(bytes_acked)
/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
-#define min_rtt_cap tcp_info_cap(min_rtt)
+#define min_rtt_cap tcp_info_cap(min_rtt)
+/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
+#define delivery_rate_cap tcp_info_cap(delivery_rate)
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -402,19 +470,20 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
}
/**
- * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
- * @s: Socket to update
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported
+ * @conn: Pointer to the TCP connection structure
* @offset: Offset in bytes
*
- * Return: -1 when it fails, 0 otherwise.
+ * Return: -1 when it fails, 0 otherwise.
*/
-int tcp_set_peek_offset(int s, int offset)
+int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset)
{
if (!peek_offset_cap)
return 0;
- if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
- err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+ if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF,
+ &offset, sizeof(offset))) {
+ flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset);
return -1;
}
return 0;
@@ -423,7 +492,7 @@ int tcp_set_peek_offset(int s, int offset)
/**
* tcp_conn_epoll_events() - epoll events mask for given connection state
* @events: Current connection events
- * @conn_flags Connection flags
+ * @conn_flags: Connection flags
*
* Return: epoll events mask corresponding to implied connection state
*/
@@ -461,34 +530,37 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
struct epoll_event ev = { .data.u64 = ref.u64 };
+ int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
+ : c->epollfd;
if (conn->events == CLOSED) {
- if (conn->in_epoll)
- epoll_del(c, conn->sock);
+ if (flow_in_epoll(&conn->f))
+ epoll_del(epollfd, conn->sock);
if (conn->timer != -1)
- epoll_del(c, conn->timer);
+ epoll_del(epollfd, conn->timer);
return 0;
}
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+ if (epoll_ctl(epollfd, m, conn->sock, &ev))
return -errno;
- conn->in_epoll = true;
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
- .fd = conn->sock,
+ .fd = conn->timer,
.flow = FLOW_IDX(conn) };
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
.events = EPOLLIN | EPOLLET };
- if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD,
+ conn->timer, &ev_t))
return -errno;
}
@@ -499,8 +571,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
* tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
* @c: Execution context
* @conn: Connection pointer
- *
- * #syscalls timerfd_create timerfd_settime
+ * #syscalls timerfd_create timerfd_settime|timerfd_settime32
*/
static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
@@ -511,26 +582,25 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->timer == -1) {
union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
- .fd = conn->sock,
.flow = FLOW_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64,
.events = EPOLLIN | EPOLLET };
+ int epollfd = flow_epollfd(&conn->f);
int fd;
fd = timerfd_create(CLOCK_MONOTONIC, 0);
if (fd == -1 || fd > FD_REF_MAX) {
- flow_dbg(conn, "failed to get timer: %s",
- strerror_(errno));
+ flow_dbg_perror(conn, "failed to get timer");
if (fd > -1)
close(fd);
conn->timer = -1;
return;
}
conn->timer = fd;
+ ref.fd = conn->timer;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
- flow_dbg(conn, "failed to add timer: %s",
- strerror_(errno));
+ if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
+ flow_dbg_perror(conn, "failed to add timer");
close(conn->timer);
conn->timer = -1;
return;
@@ -538,24 +608,37 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
}
if (conn->flags & ACK_TO_TAP_DUE) {
- it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
+ it.it_value.tv_sec = RTT_GET(conn) / 2 / ((long)1000 * 1000);
+ it.it_value.tv_nsec = RTT_GET(conn) / 2 % ((long)1000 * 1000) *
+ 1000;
} else if (conn->flags & ACK_FROM_TAP_DUE) {
+ int exp = conn->retries, timeout = RTO_INIT;
if (!(conn->events & ESTABLISHED))
- it.it_value.tv_sec = SYN_TIMEOUT;
- else
- it.it_value.tv_sec = ACK_TIMEOUT;
+ exp -= c->tcp.syn_linear_timeouts;
+ else if (conn->flags & SYN_RETRIED)
+ timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES);
+ timeout <<= MAX(exp, 0);
+ it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max);
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
it.it_value.tv_sec = FIN_TIMEOUT;
} else {
it.it_value.tv_sec = ACT_TIMEOUT;
}
- flow_dbg(conn, "timer expires in %llu.%03llus",
- (unsigned long long)it.it_value.tv_sec,
- (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+ if (conn->flags & ACK_TO_TAP_DUE) {
+ flow_trace(conn, "timer expires in %llu.%02llums",
+ (unsigned long long)it.it_value.tv_sec * 1000 +
+ it.it_value.tv_nsec / 1000 / 1000,
+ (unsigned long long)it.it_value.tv_nsec
+ / 1000 / 10 % 100);
+ } else {
+ flow_dbg(conn, "timer expires in %llu.%03llus",
+ (unsigned long long)it.it_value.tv_sec,
+ (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
+ }
if (timerfd_settime(conn->timer, 0, &it, NULL))
- flow_err(conn, "failed to set timer: %s", strerror_(errno));
+ flow_perror(conn, "failed to set timer");
}
/**
@@ -650,12 +733,13 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
flow_dbg(conn, "%s",
num == -1 ? "CLOSED" : tcp_event_str[num]);
- if (event == CLOSED)
- flow_hash_remove(c, TAP_SIDX(conn));
- else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
+ if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) {
conn_flag(c, conn, ACTIVE_CLOSE);
- else
+ } else {
+ if (event == CLOSED)
+ flow_hash_remove(c, TAP_SIDX(conn));
tcp_epoll_ctl(c, conn);
+ }
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
tcp_timer_ctl(c, conn);
@@ -714,7 +798,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
}
/**
- * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
+ * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.75 usage)
* @conn: Connection pointer
*/
static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
@@ -729,34 +813,12 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
return;
}
- v = sndbuf;
- if (v >= SNDBUF_BIG)
- v /= 2;
- else if (v > SNDBUF_SMALL)
- v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
+ v = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL, SNDBUF_BIG, 75);
SNDBUF_SET(conn, MIN(INT_MAX, v));
}
/**
- * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
- * @s: Socket, can be -1 to avoid check in the caller
- */
-static void tcp_sock_set_bufsize(const struct ctx *c, int s)
-{
- int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */
-
- if (s == -1)
- return;
-
- if (!c->low_rmem && setsockopt(s, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)))
- trace("TCP: failed to set SO_RCVBUF to %i", v);
-
- if (!c->low_wmem && setsockopt(s, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)))
- trace("TCP: failed to set SO_SNDBUF to %i", v);
-}
-
-/**
* tcp_sock_set_nodelay() - Set TCP_NODELAY option (disable Nagle's algorithm)
* @s: Socket, can be -1 to avoid check in the caller
*/
@@ -775,7 +837,8 @@ static void tcp_sock_set_nodelay(int s)
* @th: TCP header (updated)
* @payload: TCP payload
*/
-void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload)
+static void tcp_update_csum(uint32_t psum, struct tcphdr *th,
+ struct iov_tail *payload)
{
th->check = 0;
psum = csum_unfolded(th, sizeof(*th), psum);
@@ -895,8 +958,10 @@ static void tcp_fill_header(struct tcphdr *th,
/**
* tcp_fill_headers() - Fill 802.3, IP, TCP headers
+ * @c: Execution context
* @conn: Connection pointer
* @taph: tap backend specific header
+ * @eh: Pointer to Ethernet header
* @ip4h: Pointer to IPv4 header, or NULL
* @ip6h: Pointer to IPv6 header, or NULL
* @th: Pointer to TCP header
@@ -905,14 +970,15 @@ static void tcp_fill_header(struct tcphdr *th,
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = iov_tail_size(payload) + sizeof(*th);
+ uint8_t *omac = conn->f.tap_omac;
size_t l3len = l4len;
uint32_t psum = 0;
@@ -938,6 +1004,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
*src4, *dst4);
}
+ eh->h_proto = htons_constant(ETH_P_IP);
}
if (ip6h) {
@@ -951,17 +1018,21 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
ip6h->version = 6;
ip6h->nexthdr = IPPROTO_TCP;
- ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
- ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
- ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
+ ip6_set_flow_lbl(ip6h, conn->sock);
if (!no_tcp_csum) {
psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
&ip6h->saddr,
&ip6h->daddr);
}
+ eh->h_proto = htons_constant(ETH_P_IPV6);
}
+ /* Find if neighbour table has a recorded MAC address */
+ if (MAC_IS_UNDEF(omac))
+ fwd_neigh_mac_get(c, &tapside->oaddr, omac);
+ eth_update_mac(eh, NULL, omac);
+
tcp_fill_header(th, conn, seq);
if (no_tcp_csum)
@@ -969,7 +1040,36 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
else
tcp_update_csum(psum, th, payload);
- tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+ tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN));
+}
+
+/**
+ * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning
+ * @conn: Connection pointer
+ * @tinfo: tcp_info from kernel, must be pre-fetched
+ *
+ * Return: increased sending buffer to use as a limit for advertised window
+ */
+static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn,
+ const struct tcp_info_linux *tinfo)
+{
+ unsigned long bytes_rtt_product;
+
+ if (!bytes_acked_cap)
+ return SNDBUF_GET(conn);
+
+ /* This is *not* a bandwidth-delay product, but it's somewhat related:
+ * as we send more data (usually at the beginning of a connection), we
+ * try to make the sending buffer progressively grow, with the RTT as a
+ * factor (longer delay, bigger buffer needed).
+ */
+ bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked *
+ tinfo->tcpi_rtt / 1000 / 1000;
+
+ return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product,
+ SNDBUF_BOOST_BYTES_RTT_LO,
+ SNDBUF_BOOST_BYTES_RTT_HI,
+ SNDBUF_BOOST_FACTOR);
}
/**
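[Sketch, not part of the patch] clamped_scale() itself is defined elsewhere in the tree. A plausible reading, consistent with its two call sites (tcp_get_sndbuf() scaling usage from 100% down to 75%, and the up-to-150% boost above), is a clamped linear interpolation of the scaling factor; the real helper may differ in detail:

static long clamped_scale(long v, long long x, long long lo, long long hi,
			  int pct)
{
	long long f;			/* scaling factor, in percent */

	if (x <= lo)			/* below the low threshold: no scaling */
		return v;

	if (x >= hi)			/* above the high threshold: full factor */
		f = pct;
	else				/* in between: interpolate 100% -> pct% */
		f = 100 + (pct - 100) * (x - lo) / (hi - lo);

	return v * f / 100;
}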
@@ -980,6 +1080,8 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
* @tinfo: tcp_info from kernel, can be NULL if not pre-fetched
*
* Return: 1 if sequence or window were updated, 0 otherwise
+ *
+ * #syscalls ioctl
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
bool force_seq, struct tcp_info_linux *tinfo)
@@ -990,32 +1092,75 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
socklen_t sl = sizeof(*tinfo);
struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
+ bool ack_everything = true;
int s = conn->sock;
- if (!bytes_acked_cap) {
- conn->seq_ack_to_tap = conn->seq_from_tap;
- if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
- conn->seq_ack_to_tap = prev_ack_to_tap;
- } else {
- if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
- tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
- (conn->flags & LOCAL) || force_seq) {
- conn->seq_ack_to_tap = conn->seq_from_tap;
- } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
- if (!tinfo) {
- tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
- return 0;
- }
+ /* At this point we could ack all the data we've accepted for forwarding
+ * (seq_from_tap). When possible, however, we want to only acknowledge
+ * what the peer has acknowledged. This makes it appear to the guest
+ * more like a direct connection to the peer, and may improve flow
+ * control behaviour.
+ *
+ * For it to be possible and worth it we need:
+ * - The TCP_INFO Linux extensions which give us the peer acked bytes
+ * and the delivery rate (outbound bandwidth at receiver)
+ * - Not to be told not to (force_seq)
+ * - Not half-closed in the peer->guest direction
+ * With no data coming from the peer, we might not get events which
+ * would prompt us to recheck bytes_acked. We could poll on a
+ * timer, but that's more trouble than it's worth.
+ * - Not a host local connection
+ * Data goes from socket to socket, with nothing meaningfully "in
+ * flight".
+ * - Not a pseudo-local connection (e.g. to a VM on the same host)
+ * If it is, there's not enough in flight to bother.
+ * - Sending buffer significantly larger than bandwidth * delay product
+ * Meaning we're not bandwidth-bound and this is likely to be
+ * interactive traffic where we want to preserve transparent
+ * connection behaviour and latency.
+ *
+ * Otherwise, we probably want to maximise throughput, which needs
+ * sending buffer auto-tuning, triggered in turn by filling up the
+ * outbound socket queue.
+ */
+ if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
+ !CONN_IS_CLOSING(conn) &&
+ !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
+ if (!tinfo) {
+ tinfo = &tinfo_new;
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ return 0;
+ }
- conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
- conn->seq_init_from_tap;
+ /* This trips a cppcheck bug in some versions, including
+ * cppcheck 2.18.3.
+ * https://trac.cppcheck.net/ticket/14191
+ */
+ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+ if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
+ tinfo->tcpi_delivery_rate /
+ 1000 / 1000 *
+ SNDBUF_TO_BW_DELAY_INTERACTIVE)
+ ack_everything = false;
+ }
- if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
- conn->seq_ack_to_tap = prev_ack_to_tap;
- }
+ if (ack_everything) {
+ /* Fall back to acknowledging everything we got */
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+ } else {
+ /* cppcheck bug 14191 again, see above */
+ /* cppcheck-suppress [uninitvar,unmatchedSuppression] */
+ conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+ conn->seq_init_from_tap;
}
+ /* It's occasionally possible for us to go from using the fallback above
+ * to the tcpi_bytes_acked method. In that case, we must be careful not
+ * to let our ACKed sequence go backwards.
+ */
+ if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+ conn->seq_ack_to_tap = prev_ack_to_tap;
+
if (!snd_wnd_cap) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
@@ -1037,9 +1182,53 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
+ uint32_t sendq;
+ int limit;
+
+ if (ioctl(s, SIOCOUTQ, &sendq)) {
+ debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
+ sendq = 0;
+ }
tcp_get_sndbuf(conn);
- new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
- SNDBUF_GET(conn));
+
+ if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
+ limit = 0;
+ else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn))
+ limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq;
+ else
+ limit = SNDBUF_GET(conn) - (int)sendq;
+
+ /* If the sender uses mechanisms to prevent Silly Window
+ * Syndrome (SWS, described in RFC 813 Section 3) it's critical
+ * that, should the window ever become less than the MSS, we
+ * advertise a new value once it increases again to be above it.
+ *
+ * The mechanism to avoid SWS in the kernel is, implicitly,
+ * implemented by Nagle's algorithm (which was proposed after
+ * RFC 813).
+ *
+ * To this end, for simplicity, approximate a window value below
+ * the MSS to zero, as we already have mechanisms in place to
+ * force updates after the window becomes zero. This matches the
+ * suggestion from RFC 813, Section 4.
+ *
+ * But don't do this if, either:
+ *
+ * - there's nothing in the outbound queue: the size of the
+ * sending buffer is limiting us, and it won't increase if we
+ * don't send data, so there's no point in waiting, or
+ *
+ * - we haven't sent data in a while (somewhat arbitrarily, ten
+ * times the RTT), as that might indicate that the receiver
+ * will only process data in batches that are large enough,
+ * but we won't send enough to fill one because we're stuck
+ * with pending data in the outbound queue
+ */
+ if (limit < MSS_GET(conn) && sendq &&
+ tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10)
+ limit = 0;
+
+ new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
}
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
@@ -1059,6 +1248,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
conn_flag(c, conn, ACK_TO_TAP_DUE);
out:
+ /* Opportunistically store RTT approximation on valid TCP_INFO data */
+ if (tinfo)
+ RTT_SET(conn, tinfo->tcpi_rtt);
+
return new_wnd_to_tap != prev_wnd_to_tap ||
conn->seq_ack_to_tap != prev_ack_to_tap;
}
@@ -1067,7 +1260,7 @@ out:
* tcp_update_seqack_from_tap() - ACK number from tap and related flags/counters
* @c: Execution context
* @conn: Connection pointer
- * @seq Current ACK sequence, host order
+ * @seq: Current ACK sequence, host order
*/
static void tcp_update_seqack_from_tap(const struct ctx *c,
struct tcp_tap_conn *conn, uint32_t seq)
@@ -1080,18 +1273,38 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
if (SEQ_LT(seq, conn->seq_to_tap))
conn_flag(c, conn, ACK_FROM_TAP_DUE);
- conn->retrans = 0;
+ conn->retries = 0;
conn->seq_ack_from_tap = seq;
}
}
/**
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: 0 on success, -1 on failure, with connection reset
+ */
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ conn->events &= ~TAP_FIN_SENT;
+
+ if (tcp_set_peek_offset(conn, 0)) {
+ tcp_rst(c, conn);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
* tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
* @th: TCP header to update
- * @data: buffer to store TCP option
+ * @opts: TCP option buffer (output parameter)
* @optlen: size of the TCP option buffer (output parameter)
*
* Return: < 0 error code on connection reset,
@@ -1127,7 +1340,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
if (flags & SYN) {
int mss;
- if (c->mtu == -1) {
+ if (!c->mtu) {
mss = tinfo.tcpi_snd_mss;
} else {
mss = c->mtu - sizeof(struct tcphdr);
@@ -1154,12 +1367,14 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
th->doff = (sizeof(*th) + *optlen) / 4;
th->ack = !!(flags & ACK);
+ th->psh = !!(flags & PSH);
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
if (th->ack) {
- if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
+ if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
+ conn->wnd_to_tap)
conn_flag(c, conn, ~ACK_TO_TAP_DUE);
else
conn_flag(c, conn, ACK_TO_TAP_DUE);
@@ -1202,8 +1417,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED)
return;
- if (!tcp_send_flag(c, conn, RST))
- conn_event(c, conn, CLOSED);
+ tcp_send_flag(c, conn, RST);
+ conn_event(c, conn, CLOSED);
}
/**
@@ -1225,30 +1440,41 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
/**
* tcp_tap_window_update() - Process an updated window from tap side
+ * @c: Execution context
* @conn: Connection pointer
- * @window: Window value, host order, unscaled
+ * @wnd: Window value, host order, unscaled
+ *
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
*/
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
+static bool tcp_tap_window_update(const struct ctx *c,
+ struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
/* Work-around for bug introduced in peer kernel code, commit
- * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
- * We don't update if window shrank to zero.
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
+ * update the window if it shrank to zero, so that we'll eventually
+ * retry to send data, but rewind the sequence as that obviously implies
+ * that no data beyond the updated window will be acknowledged.
*/
- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
- return;
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
+ tcp_rewind_seq(c, conn);
+ return false;
+ }
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
/* FIXME: reflect the tap-side receiver's window back to the sock-side
* sender by adjusting SO_RCVBUF? */
+ return true;
}
/**
* tcp_init_seq() - Calculate initial sequence number according to RFC 6528
* @hash: Hash of connection details
* @now: Current timestamp
+ *
+ * Return: the calculated 32-bit initial sequence number.
*/
static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now)
{
@@ -1278,12 +1504,11 @@ int tcp_conn_pool_sock(int pool[])
/**
* tcp_conn_new_sock() - Open and prepare new socket for connection
- * @c: Execution context
* @af: Address family
*
* Return: socket number on success, negative code if socket creation failed
*/
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+static int tcp_conn_new_sock(sa_family_t af)
{
int s;
@@ -1297,7 +1522,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
if (s < 0)
return -errno;
- tcp_sock_set_bufsize(c, s);
tcp_sock_set_nodelay(s);
return s;
@@ -1305,12 +1529,11 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
/**
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
- * @c: Execution context
* @af: Address family (AF_INET or AF_INET6)
*
- * Return: Socket fd on success, -errno on failure
+ * Return: socket fd on success, -errno on failure
*/
-int tcp_conn_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_sock(sa_family_t af)
{
int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
int s;
@@ -1321,7 +1544,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
/* If the pool is empty we just open a new one without refilling the
* pool to keep latency down.
*/
- if ((s = tcp_conn_new_sock(c, af)) >= 0)
+ if ((s = tcp_conn_new_sock(af)) >= 0)
return s;
err("TCP: Unable to open socket for new connection: %s",
@@ -1367,18 +1590,17 @@ static void tcp_bind_outbound(const struct ctx *c,
{
const struct flowside *tgt = &conn->f.side[TGTSIDE];
union sockaddr_inany bind_sa;
- socklen_t sl;
- pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport);
+ pif_sockaddr(c, &bind_sa, PIF_HOST, &tgt->oaddr, tgt->oport);
if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) {
- if (bind(s, &bind_sa.sa, sl)) {
+ if (bind(s, &bind_sa.sa, socklen_inany(&bind_sa))) {
char sstr[INANY_ADDRSTRLEN];
- flow_dbg(conn,
- "Can't bind TCP outbound socket to %s:%hu: %s",
- inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
- tgt->oport, strerror_(errno));
+ flow_dbg_perror(conn,
+ "Can't bind TCP outbound socket to %s:%hu",
+ inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+ tgt->oport);
}
}
@@ -1387,9 +1609,9 @@ static void tcp_bind_outbound(const struct ctx *c,
if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
c->ip4.ifname_out,
strlen(c->ip4.ifname_out))) {
- flow_dbg(conn, "Can't bind IPv4 TCP socket to"
- " interface %s: %s", c->ip4.ifname_out,
- strerror_(errno));
+ flow_dbg_perror(conn,
+ "Can't bind IPv4 TCP socket to interface %s",
+ c->ip4.ifname_out);
}
}
} else if (bind_sa.sa_family == AF_INET6) {
@@ -1397,9 +1619,9 @@ static void tcp_bind_outbound(const struct ctx *c,
if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
c->ip6.ifname_out,
strlen(c->ip6.ifname_out))) {
- flow_dbg(conn, "Can't bind IPv6 TCP socket to"
- " interface %s: %s", c->ip6.ifname_out,
- strerror_(errno));
+ flow_dbg_perror(conn,
+ "Can't bind IPv6 TCP socket to interface %s",
+ c->ip6.ifname_out);
}
}
}
@@ -1432,7 +1654,6 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
union flow *flow;
int s = -1, mss;
uint64_t hash;
- socklen_t sl;
if (!(flow = flow_alloc()))
return;
@@ -1462,10 +1683,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
goto cancel;
}
- if ((s = tcp_conn_sock(c, af)) < 0)
+ if ((s = tcp_conn_sock(af)) < 0)
goto cancel;
- pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
+ pif_sockaddr(c, &sa, PIF_HOST, &tgt->eaddr, tgt->eport);
/* Use bind() to check if the target address is local (EADDRINUSE or
* similar) and already bound, and set the LOCAL flag in that case.
@@ -1477,18 +1698,19 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
*
* So, if bind() succeeds, close the socket, get a new one, and proceed.
*/
- if (bind(s, &sa.sa, sl)) {
+ if (bind(s, &sa.sa, socklen_inany(&sa))) {
if (errno != EADDRNOTAVAIL && errno != EACCES)
conn_flag(c, conn, LOCAL);
} else {
/* Not a local, bound destination, inconclusive test */
close(s);
- if ((s = tcp_conn_sock(c, af)) < 0)
+ if ((s = tcp_conn_sock(af)) < 0)
goto cancel;
}
conn->sock = s;
conn->timer = -1;
+ conn->listening_sock = -1;
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
@@ -1516,7 +1738,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
tcp_bind_outbound(c, conn, s);
- if (connect(s, &sa.sa, sl)) {
+ if (connect(s, &sa.sa, socklen_inany(&sa))) {
if (errno != EINPROGRESS) {
tcp_rst(c, conn);
goto cancel;
@@ -1535,13 +1757,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
tcp_epoll_ctl(c, conn);
if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
- sl = sizeof(sa);
- if (!getsockname(s, &sa.sa, &sl)) {
- inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa);
- } else {
- err("Failed to get local address for socket: %s",
- strerror_(errno));
- }
+ socklen_t sl = sizeof(sa);
+
+ if (getsockname(s, &sa.sa, &sl) ||
+ inany_from_sockaddr(&tgt->oaddr, &tgt->oport, &sa) < 0)
+ err_perror("Can't get local address for socket %i", s);
}
FLOW_ACTIVATE(conn);
@@ -1604,6 +1824,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
}
/**
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
+ * @th: Pointer to TCP header
+ * @l4len: TCP packet length, including TCP header
+ *
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
+ */
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
+{
+ size_t off = th->doff * 4UL;
+
+ if (off < sizeof(*th) || off > l4len)
+ return -1;
+
+ return l4len - off;
+}
+
+/**
* tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
* @conn: Connection pointer
@@ -1632,16 +1869,22 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
for (i = idx, iov_i = 0; i < (int)p->count; i++) {
uint32_t seq, seq_offset, ack_seq;
+ struct tcphdr th_storage;
const struct tcphdr *th;
- char *data;
- size_t off;
+ struct iov_tail data;
+ size_t off, size;
+ int count;
+
+ if (!packet_get(p, i, &data))
+ return -1;
- th = packet_get(p, i, 0, sizeof(*th), &len);
+ th = IOV_PEEK_HEADER(&data, th_storage);
if (!th)
return -1;
- len += sizeof(*th);
+ len = iov_tail_size(&data);
off = th->doff * 4UL;
+
if (off < sizeof(*th) || off > len)
return -1;
@@ -1651,9 +1894,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
}
len -= off;
- data = packet_get(p, i, off, len, NULL);
- if (!data)
- continue;
+ iov_drop_header(&data, off);
seq = ntohl(th->seq);
if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
@@ -1664,8 +1905,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_send_flag(c, conn, ACK);
tcp_timer_ctl(c, conn);
- if (p->count == 1)
+ if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
+ &((int){ 1 }), sizeof(int)))
+ flow_trace(conn, "failed to set SO_KEEPALIVE");
+
+ if (p->count == 1) {
+ tcp_tap_window_update(c, conn,
+ ntohs(th->window));
return 1;
+ }
continue;
}
@@ -1682,12 +1930,21 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ack_seq == max_ack_seq &&
ntohs(th->window) == max_ack_seq_wnd;
+ /* See tcp_tap_window_update() for details. On
+ * top of that, we also need to check here if a
+ * zero-window update is contained in a batch of
+ * packets that includes a non-zero window as
+ * well.
+ */
+ if (!ntohs(th->window))
+ tcp_rewind_seq(c, conn);
+
max_ack_seq_wnd = ntohs(th->window);
max_ack_seq = ack_seq;
}
}
- if (th->fin)
+ if (th->fin && seq == seq_from_tap)
fin = 1;
if (!len)
@@ -1725,10 +1982,14 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
continue;
}
- tcp_iov[iov_i].iov_base = data + seq_offset;
- tcp_iov[iov_i].iov_len = len - seq_offset;
- seq_from_tap += tcp_iov[iov_i].iov_len;
- iov_i++;
+ iov_drop_header(&data, seq_offset);
+ size = len - seq_offset;
+ count = iov_tail_clone(&tcp_iov[iov_i], UIO_MAXIOV - iov_i,
+ &data);
+ if (count < 0)
+ break;
+ seq_from_tap += size;
+ iov_i += count;
if (keep == i)
keep = -1;
@@ -1741,17 +2002,16 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
if (ack && !tcp_sock_consume(conn, max_ack_seq))
tcp_update_seqack_from_tap(c, conn, max_ack_seq);
- tcp_tap_window_update(conn, max_ack_seq_wnd);
+ tcp_tap_window_update(c, conn, max_ack_seq_wnd);
if (retr) {
flow_trace(conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
- max_ack_seq, conn->seq_to_tap);
- conn->seq_to_tap = max_ack_seq;
- if (tcp_set_peek_offset(conn->sock, 0)) {
- tcp_rst(c, conn);
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
return -1;
- }
+
tcp_data_from_sock(c, conn);
}
@@ -1775,23 +2035,20 @@ eintr:
goto eintr;
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- tcp_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK | DUP_ACK);
return p->count - idx;
}
return -1;
}
- if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
+ if (n < (int)(seq_from_tap - conn->seq_from_tap))
partial_send = 1;
- conn->seq_from_tap += n;
- tcp_send_flag(c, conn, ACK_IF_NEEDED);
- } else {
- conn->seq_from_tap += n;
- }
+
+ conn->seq_from_tap += n;
out:
- if (keep != -1) {
+ if (keep != -1 || partial_send) {
/* We use an 8-bit approximation here: the associated risk is
* that we skip a duplicate ACK on 8-bit sequence number
* collision. Fast retransmit is a SHOULD in RFC 5681, 3.2.
@@ -1831,7 +2088,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
- tcp_tap_window_update(conn, ntohs(th->window));
+ tcp_tap_window_update(c, conn, ntohs(th->window));
tcp_get_tap_ws(conn, opts, optlen);
/* First value is not scaled */
@@ -1845,7 +2102,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
conn->seq_ack_to_tap = conn->seq_from_tap;
conn_event(c, conn, ESTABLISHED);
- if (tcp_set_peek_offset(conn->sock, 0)) {
+ if (tcp_set_peek_offset(conn, 0)) {
tcp_rst(c, conn);
return;
}
@@ -1859,12 +2116,84 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
}
/**
+ * tcp_rst_no_conn() - Send RST in response to a packet with no connection
+ * @c: Execution context
+ * @af: Address family, AF_INET or AF_INET6
+ * @saddr: Source address of the packet we're responding to
+ * @daddr: Destination address of the packet we're responding to
+ * @flow_lbl: IPv6 flow label (ignored for IPv4)
+ * @th: TCP header of the packet we're responding to
+ * @l4len: Packet length, including TCP header
+ */
+static void tcp_rst_no_conn(const struct ctx *c, int af,
+ const void *saddr, const void *daddr,
+ uint32_t flow_lbl,
+ const struct tcphdr *th, size_t l4len)
+{
+ struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
+ struct tcphdr *rsth;
+ char buf[USHRT_MAX];
+ uint32_t psum = 0;
+ size_t rst_l2len;
+
+ /* Don't respond to RSTs without a connection */
+ if (th->rst)
+ return;
+
+ if (af == AF_INET) {
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IP);
+ const struct in_addr *rst_src = daddr;
+ const struct in_addr *rst_dst = saddr;
+
+ rsth = tap_push_ip4h(ip4h, *rst_src, *rst_dst,
+ sizeof(*rsth), IPPROTO_TCP);
+ psum = proto_ipv4_header_psum(sizeof(*rsth), IPPROTO_TCP,
+ *rst_src, *rst_dst);
+
+ } else {
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IPV6);
+ const struct in6_addr *rst_src = daddr;
+ const struct in6_addr *rst_dst = saddr;
+
+ rsth = tap_push_ip6h(ip6h, rst_src, rst_dst,
+ sizeof(*rsth), IPPROTO_TCP, flow_lbl);
+ psum = proto_ipv6_header_psum(sizeof(*rsth), IPPROTO_TCP,
+ rst_src, rst_dst);
+ }
+
+ memset(rsth, 0, sizeof(*rsth));
+
+ rsth->source = th->dest;
+ rsth->dest = th->source;
+ rsth->rst = 1;
+ rsth->doff = sizeof(*rsth) / 4UL;
+
+ /* Sequence matching logic from RFC 9293 section 3.10.7.1 */
+ if (th->ack) {
+ rsth->seq = th->ack_seq;
+ } else {
+ size_t dlen = l4len - th->doff * 4UL;
+ uint32_t ack = ntohl(th->seq) + dlen;
+
+ rsth->ack_seq = htonl(ack);
+ rsth->ack = 1;
+ }
+
+ tcp_update_csum(psum, rsth, &payload);
+ rst_l2len = ((char *)rsth - buf) + sizeof(*rsth);
+ tap_send_single(c, buf, rst_l2len);
+}
+
+/**
* tcp_tap_handler() - Handle packets from tap and state transitions
* @c: Execution context
* @pif: pif on which the packet is arriving
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
+ * @flow_lbl: IPv6 flow label (ignored for IPv4)
* @p: Pool of TCP packets, with TCP headers
* @idx: Index of first packet in pool to process
* @now: Current timestamp
@@ -1872,12 +2201,15 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
* Return: count of consumed packets
*/
int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
- const void *saddr, const void *daddr,
+ const void *saddr, const void *daddr, uint32_t flow_lbl,
const struct pool *p, int idx, const struct timespec *now)
{
struct tcp_tap_conn *conn;
+ struct tcphdr th_storage;
const struct tcphdr *th;
- size_t optlen, len;
+ char optsc[OPTLEN_MAX];
+ struct iov_tail data;
+ size_t optlen, l4len;
const char *opts;
union flow *flow;
flow_sidx_t sidx;
@@ -1886,15 +2218,19 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
(void)pif;
- th = packet_get(p, idx, 0, sizeof(*th), &len);
+ if (!packet_get(p, idx, &data))
+ return 1;
+
+ l4len = iov_tail_size(&data);
+
+ th = IOV_REMOVE_HEADER(&data, th_storage);
if (!th)
return 1;
- len += sizeof(*th);
optlen = th->doff * 4UL - sizeof(*th);
/* Static checkers might fail to see this: */
- optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
- opts = packet_get(p, idx, sizeof(*th), optlen, NULL);
+ optlen = MIN(optlen, OPTLEN_MAX);
+ opts = (char *)iov_remove_header_(&data, &optsc[0], optlen, 1);
sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr,
ntohs(th->source), ntohs(th->dest));
@@ -1905,6 +2241,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (opts && th->syn && !th->ack)
tcp_conn_from_tap(c, af, saddr, daddr, th,
opts, optlen, now);
+ else
+ tcp_rst_no_conn(c, af, saddr, daddr, flow_lbl, th, l4len);
return 1;
}
@@ -1912,7 +2250,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ASSERT(pif_at_sidx(sidx) == PIF_TAP);
conn = &flow->tcp;
- flow_trace(conn, "packet length %zu from tap", len);
+ flow_trace(conn, "packet length %zu from tap", l4len);
if (th->rst) {
conn_event(c, conn, CLOSED);
@@ -1941,7 +2279,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
goto reset;
conn_event(c, conn, ESTABLISHED);
- if (tcp_set_peek_offset(conn->sock, 0))
+ if (tcp_set_peek_offset(conn, 0))
goto reset;
if (th->fin) {
@@ -1957,9 +2295,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
if (!th->ack)
goto reset;
- tcp_tap_window_update(conn, ntohs(th->window));
-
- tcp_data_from_sock(c, conn);
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
+ tcp_data_from_sock(c, conn);
if (p->count - idx == 1)
return 1;
@@ -1967,11 +2304,44 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
/* Established connections not accepting data from tap */
if (conn->events & TAP_FIN_RCVD) {
- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+ size_t dlen;
+ bool retr;
- if (conn->events & SOCK_FIN_RCVD &&
- conn->seq_ack_from_tap == conn->seq_to_tap)
- conn_event(c, conn, CLOSED);
+ if ((dlen = tcp_packet_data_len(th, l4len))) {
+ flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)",
+ dlen);
+ }
+
+ retr = th->ack && !th->fin &&
+ ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
+ ntohs(th->window) == conn->wnd_from_tap;
+
+ /* On socket flush failure, pretend there was no ACK, try again
+ * later
+ */
+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
+
+ if (retr) {
+ flow_trace(conn,
+ "fast re-transmit, ACK: %u, previous sequence: %u",
+ ntohl(th->ack_seq), conn->seq_to_tap);
+
+ if (tcp_rewind_seq(c, conn))
+ return -1;
+ }
+
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
+ tcp_data_from_sock(c, conn);
+
+ if (conn->seq_ack_from_tap == conn->seq_to_tap) {
+ if (th->ack && conn->events & TAP_FIN_SENT)
+ conn_event(c, conn, TAP_FIN_ACKED);
+
+ if (conn->events & SOCK_FIN_RCVD &&
+ conn->events & TAP_FIN_ACKED)
+ conn_event(c, conn, CLOSED);
+ }
return 1;
}
@@ -1987,10 +2357,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ack_due = 1;
if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
+ socklen_t sl;
+ struct tcp_info tinfo;
+
shutdown(conn->sock, SHUT_WR);
conn_event(c, conn, SOCK_FIN_SENT);
tcp_send_flag(c, conn, ACK);
ack_due = 0;
+
+ /* If we received a FIN, but the socket is in TCP_ESTABLISHED
+ * state, it must be a migrated socket. The kernel saw the FIN
+ * on the source socket, but not on the target socket.
+ *
+ * Approximate the effect of that FIN: as we're sending a FIN
+ * out ourselves, the socket is now in a state equivalent to
+ * LAST_ACK. Now that we sent the FIN out, close it with a RST.
+ */
+ sl = sizeof(tinfo);
+ getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl);
+ if (tinfo.tcpi_state == TCP_ESTABLISHED &&
+ conn->events & SOCK_FIN_RCVD)
+ goto reset;
}
if (ack_due)
@@ -2073,9 +2460,10 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
- const struct flowside *ini;
+ struct tcp_tap_conn *conn;
union sockaddr_inany sa;
socklen_t sl = sizeof(sa);
+ struct flowside *ini;
union flow *flow;
int s;
@@ -2088,14 +2476,21 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
if (s < 0)
goto cancel;
- tcp_sock_set_bufsize(c, s);
+ conn = (struct tcp_tap_conn *)flow;
+ conn->listening_sock = ref.fd;
+
tcp_sock_set_nodelay(s);
- /* FIXME: When listening port has a specific bound address, record that
- * as our address
+ /* FIXME: If useful: when the listening port has a specific bound
+ * address, record that as our address, as implemented for vhost-user
+ * mode only, below.
*/
ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
- ref.tcp_listen.port);
+ NULL, ref.tcp_listen.port);
+
+ if (getsockname(s, &sa.sa, &sl) ||
+ inany_from_sockaddr(&ini->oaddr, &ini->oport, &sa) < 0)
+ err_perror("Can't get local address for socket %i", s);
if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
char sastr[SOCKADDR_STRLEN];
@@ -2136,7 +2531,9 @@ cancel:
* @c: Execution context
* @ref: epoll reference of timer (not connection)
*
- * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls timerfd_gettime|timerfd_gettime64
+ * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64
+ * #syscalls arm:timerfd_settime64 i686:timerfd_settime64
*/
void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
@@ -2151,7 +2548,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* and we just set the timer to a new point in the future: discard it.
*/
if (timerfd_gettime(conn->timer, &check_armed))
- flow_err(conn, "failed to read timer: %s", strerror_(errno));
+ flow_perror(conn, "failed to read timer");
if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
return;
@@ -2161,24 +2558,37 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_timer_ctl(c, conn);
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED)) {
- flow_dbg(conn, "handshake timeout");
- tcp_rst(c, conn);
+ int max;
+ max = c->tcp.syn_retries + c->tcp.syn_linear_timeouts;
+ max = MIN(TCP_MAX_RETRIES, max);
+ if (conn->retries >= max) {
+ flow_dbg(conn, "handshake timeout");
+ tcp_rst(c, conn);
+ } else {
+ flow_trace(conn, "SYN timeout, retry");
+ tcp_send_flag(c, conn, SYN);
+ conn->retries++;
+ conn_flag(c, conn, SYN_RETRIED);
+ tcp_timer_ctl(c, conn);
+ }
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
flow_dbg(conn, "FIN timeout");
tcp_rst(c, conn);
- } else if (conn->retrans == TCP_MAX_RETRANS) {
+ } else if (conn->retries == TCP_MAX_RETRIES) {
flow_dbg(conn, "retransmissions count exceeded");
tcp_rst(c, conn);
} else {
flow_dbg(conn, "ACK timeout, retry");
- conn->retrans++;
- conn->seq_to_tap = conn->seq_ack_from_tap;
- if (tcp_set_peek_offset(conn->sock, 0)) {
- tcp_rst(c, conn);
- } else {
- tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
- }
+
+ if (!conn->wnd_from_tap)
+ conn->wnd_from_tap = 1; /* Zero-window probe */
+
+ conn->retries++;
+ if (tcp_rewind_seq(c, conn))
+ return;
+
+ tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2191,8 +2601,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
* ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
*/
if (timerfd_settime(conn->timer, 0, &new, &old))
- flow_err(conn, "failed to set timer: %s",
- strerror_(errno));
+ flow_perror(conn, "failed to set timer");
if (old.it_value.tv_sec == ACT_TIMEOUT) {
flow_dbg(conn, "activity timeout");
@@ -2223,7 +2632,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
}
@@ -2264,136 +2673,88 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
}
/**
- * tcp_sock_init_one() - Initialise listening socket for address and port
+ * tcp_listen() - Create listening socket
* @c: Execution context
- * @addr: Pointer to address for binding, NULL for dual stack any
- * @ifname: Name of interface to bind to, NULL if not configured
+ * @pif: Interface to open the socket for (PIF_HOST or PIF_SPLICE)
+ * @addr: Pointer to address for binding, NULL for any
+ * @ifname: Name of interface to bind to, NULL for any
* @port: Port, host order
*
- * Return: fd for the new listening socket, negative error code on failure
+ * Return: 0 on success, negative error code on failure
*/
-static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
- const char *ifname, in_port_t port)
+int tcp_listen(const struct ctx *c, uint8_t pif,
+ const union inany_addr *addr, const char *ifname, in_port_t port)
{
union tcp_listen_epoll_ref tref = {
.port = port,
- .pif = PIF_HOST,
+ .pif = pif,
};
+ const struct fwd_ports *fwd;
+ int (*socks)[IP_VERSIONS];
int s;
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
- ifname, port, tref.u32);
-
- if (c->tcp.fwd_in.mode == FWD_AUTO) {
- if (!addr || inany_v4(addr))
- tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
- if (!addr || !inany_v4(addr))
- tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
- }
-
- if (s < 0)
- return s;
-
- return s;
-}
-
-/**
- * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
- * @c: Execution context
- * @addr: Pointer to address for binding, NULL if not configured
- * @ifname: Name of interface to bind to, NULL if not configured
- * @port: Port, host order
- *
- * Return: 0 on (partial) success, negative error code on (complete) failure
- */
-int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
- const char *ifname, in_port_t port)
-{
- int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
-
ASSERT(!c->no_tcp);
- if (!addr && c->ifi4 && c->ifi6)
- /* Attempt to get a dual stack socket */
- if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
+ if (!c->ifi4) {
+ if (!addr)
+ /* Restrict to v6 only */
+ addr = &inany_any6;
+ else if (inany_v4(addr))
+ /* Nothing to do */
return 0;
+ }
+ if (!c->ifi6) {
+ if (!addr)
+ /* Restrict to v4 only */
+ addr = &inany_any4;
+ else if (!inany_v4(addr))
+ /* Nothing to do */
+ return 0;
+ }
- /* Otherwise create a socket per IP version */
- if ((!addr || inany_v4(addr)) && c->ifi4)
- r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
- ifname, port);
-
- if ((!addr || !inany_v4(addr)) && c->ifi6)
- r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
- ifname, port);
-
- if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
- return 0;
-
- return r4 < 0 ? r4 : r6;
-}
-
-/**
- * tcp_ns_sock_init4() - Init socket to listen for outbound IPv4 connections
- * @c: Execution context
- * @port: Port, host order
- */
-static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
-{
- union tcp_listen_epoll_ref tref = {
- .port = port,
- .pif = PIF_SPLICE,
- };
- int s;
-
- ASSERT(c->mode == MODE_PASTA);
-
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
- NULL, port, tref.u32);
- if (s < 0)
- s = -1;
+ if (pif == PIF_HOST) {
+ fwd = &c->tcp.fwd_in;
+ socks = tcp_sock_init_ext;
+ } else {
+ ASSERT(pif == PIF_SPLICE);
+ fwd = &c->tcp.fwd_out;
+ socks = tcp_sock_ns;
+ }
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_ns[port][V4] = s;
-}
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, pif, addr, ifname,
+ port, tref.u32);
-/**
- * tcp_ns_sock_init6() - Init socket to listen for outbound IPv6 connections
- * @c: Execution context
- * @port: Port, host order
- */
-static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
-{
- union tcp_listen_epoll_ref tref = {
- .port = port,
- .pif = PIF_SPLICE,
- };
- int s;
-
- ASSERT(c->mode == MODE_PASTA);
+ if (fwd->mode == FWD_AUTO) {
+ if (!addr || inany_v4(addr))
+ socks[port][V4] = s < 0 ? -1 : s;
+ if (!addr || !inany_v4(addr))
+ socks[port][V6] = s < 0 ? -1 : s;
+ }
- s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
- NULL, port, tref.u32);
if (s < 0)
- s = -1;
+ return s;
- if (c->tcp.fwd_out.mode == FWD_AUTO)
- tcp_sock_ns[port][V6] = s;
+ return 0;
}
/**
- * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections
+ * tcp_ns_listen() - Init socket to listen for spliced outbound connections
* @c: Execution context
* @port: Port, host order
*/
-void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
+static void tcp_ns_listen(const struct ctx *c, in_port_t port)
{
ASSERT(!c->no_tcp);
+ if (!c->no_bindtodevice) {
+ tcp_listen(c, PIF_SPLICE, NULL, "lo", port);
+ return;
+ }
+
if (c->ifi4)
- tcp_ns_sock_init4(c, port);
+ tcp_listen(c, PIF_SPLICE, &inany_loopback4, NULL, port);
if (c->ifi6)
- tcp_ns_sock_init6(c, port);
+ tcp_listen(c, PIF_SPLICE, &inany_loopback6, NULL, port);
}
/**
@@ -2414,7 +2775,7 @@ static int tcp_ns_socks_init(void *arg)
if (!bitmap_isset(c->tcp.fwd_out.map, port))
continue;
- tcp_ns_sock_init(c, port);
+ tcp_ns_listen(c, port);
}
return 0;
@@ -2422,13 +2783,12 @@ static int tcp_ns_socks_init(void *arg)
/**
* tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
- * @c: Execution context
* @pool: Pool of sockets to refill
* @af: Address family to use
*
* Return: 0 on success, negative error code if there was at least one error
*/
-int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
+int tcp_sock_refill_pool(int pool[], sa_family_t af)
{
int i;
@@ -2438,7 +2798,7 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
if (pool[i] >= 0)
continue;
- if ((fd = tcp_conn_new_sock(c, af)) < 0)
+ if ((fd = tcp_conn_new_sock(af)) < 0)
return fd;
pool[i] = fd;
@@ -2454,13 +2814,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af)
static void tcp_sock_refill_init(const struct ctx *c)
{
if (c->ifi4) {
- int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+ int rc = tcp_sock_refill_pool(init_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 host socket pool: %s",
strerror_(-rc));
}
if (c->ifi6) {
- int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+ int rc = tcp_sock_refill_pool(init_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 host socket pool: %s",
strerror_(-rc));
@@ -2493,7 +2853,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
/**
* tcp_probe_tcp_info() - Check what data TCP_INFO reports
*
- * Return: Number of bytes returned by TCP_INFO getsockopt()
+ * Return: number of bytes returned by TCP_INFO getsockopt()
*/
static socklen_t tcp_probe_tcp_info(void)
{
@@ -2519,6 +2879,31 @@ static socklen_t tcp_probe_tcp_info(void)
}
/**
+ * tcp_get_rto_params() - Get host kernel RTO parameters
+ * @c: Execution context
+ */
+static void tcp_get_rto_params(struct ctx *c)
+{
+ intmax_t v;
+
+ v = read_file_integer(SYN_RETRIES, SYN_RETRIES_DEFAULT);
+ c->tcp.syn_retries = MIN(v, MAX_SYNCNT);
+
+ v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT);
+ c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT);
+
+ v = read_file_integer(RTO_MAX_MS, (intmax_t)(RTO_MAX_DEFAULT * 1000));
+ c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX);
+
+ debug("Using TCP RTO parameters, syn_retries: %"PRIu8
+ ", syn_linear_timeouts: %"PRIu8
+ ", rto_max: %d",
+ c->tcp.syn_retries,
+ c->tcp.syn_linear_timeouts,
+ c->tcp.rto_max);
+}
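
The values above are read from kernel sysctl files (SYN_RETRIES, SYN_LINEAR_TIMEOUTS and RTO_MAX_MS are assumed to name procfs paths defined earlier in the file), with a fallback to a built-in default. As a rough sketch of what a read_file_integer()-style helper is assumed to do, and not the actual helper used here:

	#include <stdint.h>
	#include <stdio.h>

	/* Sketch only: parse a single integer from a procfs/sysctl file,
	 * falling back to 'def' if the file can't be opened or parsed.
	 */
	static intmax_t read_file_integer_sketch(const char *path, intmax_t def)
	{
		intmax_t v;
		FILE *f = fopen(path, "r");

		if (!f)
			return def;

		if (fscanf(f, "%jd", &v) != 1)
			v = def;

		fclose(f);
		return v;
	}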
+
+/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
*
@@ -2528,6 +2913,8 @@ int tcp_init(struct ctx *c)
{
ASSERT(!c->no_tcp);
+ tcp_get_rto_params(c);
+
tcp_sock_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
@@ -2550,7 +2937,7 @@ int tcp_init(struct ctx *c)
tcp_info_size = tcp_probe_tcp_info();
#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
- STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+ STRINGIFY(f_), tcp_info_cap(f_) ? "" : " not")
dbg_tcpi(snd_wnd);
dbg_tcpi(bytes_acked);
dbg_tcpi(min_rtt);
@@ -2569,7 +2956,6 @@ int tcp_init(struct ctx *c)
static void tcp_port_rebind(struct ctx *c, bool outbound)
{
const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map;
- const uint8_t *rmap = outbound ? c->tcp.fwd_in.map : c->tcp.fwd_out.map;
int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext;
unsigned port;
@@ -2588,16 +2974,12 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
continue;
}
- /* Don't loop back our own ports */
- if (bitmap_isset(rmap, port))
- continue;
-
if ((c->ifi4 && socks[port][V4] == -1) ||
(c->ifi6 && socks[port][V6] == -1)) {
if (outbound)
- tcp_ns_sock_init(c, port);
+ tcp_ns_listen(c, port);
else
- tcp_sock_init(c, NULL, NULL, port);
+ tcp_listen(c, PIF_HOST, NULL, NULL, port);
}
}
}
@@ -2621,27 +3003,1080 @@ static int tcp_port_rebind_outbound(void *arg)
}
/**
+ * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns)
+ * @c: Execution context
+ */
+void tcp_port_rebind_all(struct ctx *c)
+{
+ ASSERT(c->mode == MODE_PASTA && !c->no_tcp);
+
+ if (c->tcp.fwd_out.mode == FWD_AUTO)
+ NS_CALL(tcp_port_rebind_outbound, c);
+
+ if (c->tcp.fwd_in.mode == FWD_AUTO)
+ tcp_port_rebind(c, false);
+}
+
+/**
 * tcp_timer() - Periodic tasks: refill pools of pre-opened sockets
* @c: Execution context
* @now: Current timestamp
*/
-void tcp_timer(struct ctx *c, const struct timespec *now)
+void tcp_timer(const struct ctx *c, const struct timespec *now)
{
(void)now;
- if (c->mode == MODE_PASTA) {
- if (c->tcp.fwd_out.mode == FWD_AUTO) {
- fwd_scan_ports_tcp(&c->tcp.fwd_out, &c->tcp.fwd_in);
- NS_CALL(tcp_port_rebind_outbound, c);
+ tcp_sock_refill_init(c);
+ if (c->mode == MODE_PASTA)
+ tcp_splice_refill(c);
+}
+
+/**
+ * tcp_flow_is_established() - Was the connection established? Includes closing
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: true if the connection was established, false otherwise
+ */
+bool tcp_flow_is_established(const struct tcp_tap_conn *conn)
+{
+ return conn->events & ESTABLISHED;
+}
+
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+ int rc = 0;
+
+ if (conn->sock < 0)
+ return 0;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+ err("Failed to set TCP_REPAIR");
+
+ return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+ int rc = 0;
+
+ if (conn->sock < 0)
+ return 0;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+ err("Failed to clear TCP_REPAIR");
+
+ return rc;
+}
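
Both helpers delegate the actual socket option to repair_set() (with repair_flush() applied elsewhere), possibly through a privileged helper, since entering repair mode requires CAP_NET_ADMIN. At the kernel interface level it boils down to a single TCP_REPAIR setsockopt(); a minimal standalone sketch of that knob, not how passt itself applies it:

	#include <errno.h>
	#include <stdbool.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	#ifndef TCP_REPAIR
	#define TCP_REPAIR 19	/* Linux UAPI value, if libc headers lack it */
	#endif

	/* Sketch: toggle repair mode directly on a connected TCP socket */
	static int tcp_repair_toggle_sketch(int s, bool on)
	{
		int v = on ? 1 /* TCP_REPAIR_ON */ : 0 /* TCP_REPAIR_OFF */;

		if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &v, sizeof(v)))
			return -errno;	/* typically -EPERM without CAP_NET_ADMIN */

		return 0;
	}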
+
+/**
+ * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_tinfo(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ struct tcp_info tinfo;
+ socklen_t sl;
+
+ sl = sizeof(tinfo);
+ if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Querying TCP_INFO");
+ return rc;
+ }
+
+ t->snd_ws = tinfo.tcpi_snd_wscale;
+ t->rcv_ws = tinfo.tcpi_rcv_wscale;
+ t->tcpi_state = tinfo.tcpi_state;
+ t->tcpi_options = tinfo.tcpi_options;
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_mss(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ socklen_t sl = sizeof(t->mss);
+ int val;
+
+ if (getsockopt(conn->sock, SOL_TCP, TCP_MAXSEG, &val, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Getting MSS");
+ return rc;
+ }
+
+ t->mss = (uint32_t)val;
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_timestamp() - Dump RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data (tcpi_options must be populated)
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_timestamp(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ int val = 0;
+
+ if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+ socklen_t sl = sizeof(val);
+
+ if (getsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP, &val, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Getting RFC 7323 timestamp");
+ return rc;
}
+ }
+
+ t->timestamp = (uint32_t)val;
+ return 0;
+}
- if (c->tcp.fwd_in.mode == FWD_AUTO) {
- fwd_scan_ports_tcp(&c->tcp.fwd_in, &c->tcp.fwd_out);
- tcp_port_rebind(c, false);
+/**
+ * tcp_flow_repair_timestamp() - Restore RFC 7323 timestamp via TCP_TIMESTAMP
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_timestamp(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
+{
+ int val = (int)t->timestamp;
+
+ if (t->tcpi_options & TCPI_OPT_TIMESTAMPS) {
+ if (setsockopt(conn->sock, SOL_TCP, TCP_TIMESTAMP,
+ &val, sizeof(val))) {
+ int rc = -errno;
+ flow_perror(conn, "Setting RFC 7323 timestamp");
+ return rc;
}
}
- tcp_sock_refill_init(c);
- if (c->mode == MODE_PASTA)
- tcp_splice_refill(c);
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_wnd(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ struct tcp_repair_window wnd;
+ socklen_t sl = sizeof(wnd);
+
+ if (getsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Getting window repair data");
+ return rc;
+ }
+
+ t->snd_wl1 = wnd.snd_wl1;
+ t->snd_wnd = wnd.snd_wnd;
+ t->max_window = wnd.max_window;
+ t->rcv_wnd = wnd.rcv_wnd;
+ t->rcv_wup = wnd.rcv_wup;
+
+ /* If we received a FIN, we also need to adjust window parameters.
+ *
+ * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+ */
+ if (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK) {
+ t->rcv_wup--;
+ t->rcv_wnd++;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Restore window parameters from extended data
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_wnd(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
+{
+ struct tcp_repair_window wnd;
+
+ wnd.snd_wl1 = t->snd_wl1;
+ wnd.snd_wnd = t->snd_wnd;
+ wnd.max_window = t->max_window;
+ wnd.rcv_wnd = t->rcv_wnd;
+ wnd.rcv_wup = t->rcv_wup;
+
+ if (setsockopt(conn->sock, IPPROTO_TCP, TCP_REPAIR_WINDOW,
+ &wnd, sizeof(wnd))) {
+ int rc = -errno;
+ flow_perror(conn, "Setting window data");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_select_queue() - Select queue (receive or send) for next operation
+ * @conn: Connection to select queue for
+ * @queue: TCP_RECV_QUEUE or TCP_SEND_QUEUE
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_select_queue(const struct tcp_tap_conn *conn, int queue)
+{
+ if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_QUEUE,
+ &queue, sizeof(queue))) {
+ int rc = -errno;
+		flow_perror(conn, "Selecting repair queue");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
+ * @conn: Connection to dump queue for
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_sndqueue(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ int s = conn->sock;
+ ssize_t rc;
+
+ if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
+ rc = -errno;
+ flow_perror(conn, "Getting send queue size");
+ return rc;
+ }
+
+ if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
+ rc = -errno;
+ flow_perror(conn, "Getting not sent count");
+ return rc;
+ }
+
+ /* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the
+ * actual pending queue length, because they are based on the sequence
+ * numbers, not directly on the buffer contents.
+ *
+ * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+ */
+ if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 ||
+ t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) {
+ if (t->sndq)
+ t->sndq--;
+ if (t->notsent)
+ t->notsent--;
+ }
+
+ if (t->notsent > t->sndq) {
+ flow_err(conn,
+ "Invalid notsent count socket %i, send: %u, not sent: %u",
+ s, t->sndq, t->notsent);
+ return -EINVAL;
+ }
+
+ if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
+ flow_err(conn,
+ "Send queue too large to migrate socket %i: %u bytes",
+ s, t->sndq);
+ return -ENOBUFS;
+ }
+
+ rc = recv(s, tcp_migrate_snd_queue,
+ MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK);
+ if (rc < 0) {
+ if (errno == EAGAIN) { /* EAGAIN means empty */
+ rc = 0;
+ } else {
+ rc = -errno;
+ flow_perror(conn, "Can't read send queue");
+ return rc;
+ }
+ }
+
+ if ((uint32_t)rc < t->sndq) {
+ flow_err(conn, "Short read migrating send queue");
+ return -ENXIO;
+ }
+
+ t->notsent = MIN(t->notsent, t->sndq);
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
+ * @conn: Connection to repair queue for
+ * @len: Length of data to be restored
+ * @buf: Buffer with content of pending data queue
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
+ size_t len, uint8_t *buf)
+{
+ size_t chunk = len;
+ uint8_t *p = buf;
+
+ if (conn->sock < 0) {
+ flow_err(conn, "Invalid socket descriptor for repair queue");
+ return -EBADF;
+ }
+
+ while (len > 0) {
+ ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
+
+ if (rc < 0) {
+ if ((errno == ENOBUFS || errno == ENOMEM) &&
+ chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+ chunk /= 2;
+ continue;
+ }
+
+ rc = -errno;
+ flow_perror(conn, "Can't write queue");
+ return rc;
+ }
+
+ len -= rc;
+ p += rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
+ * @conn: Pointer to the TCP connection structure
+ * @v: Sequence value, set on return
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_seq(const struct tcp_tap_conn *conn, uint32_t *v)
+{
+ socklen_t sl = sizeof(*v);
+
+ if (getsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, &sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Dumping sequence");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
+ * @conn: Connection to repair sequences for
+ * @v: Sequence value to be set
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(const struct tcp_tap_conn *conn,
+ const uint32_t *v)
+{
+ if (setsockopt(conn->sock, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+ int rc = -errno;
+ flow_perror(conn, "Setting sequence");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_rcvqueue(const struct tcp_tap_conn *conn,
+ struct tcp_tap_transfer_ext *t)
+{
+ int s = conn->sock;
+ ssize_t rc;
+
+ if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
+ rc = -errno;
+		flow_perror(conn, "Getting receive queue size");
+ return rc;
+ }
+
+ /* If we received a FIN, SIOCINQ is one greater than the actual number
+ * of bytes on the queue, because it's based on the sequence number
+ * rather than directly on the buffer contents.
+ *
+ * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+ */
+ if (t->rcvq &&
+ (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK))
+ t->rcvq--;
+
+ if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+ flow_err(conn,
+ "Receive queue too large to migrate socket: %u bytes",
+ t->rcvq);
+ return -ENOBUFS;
+ }
+
+ rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK);
+ if (rc < 0) {
+ if (errno == EAGAIN) { /* EAGAIN means empty */
+ rc = 0;
+ } else {
+ rc = -errno;
+ flow_perror(conn, "Can't read receive queue");
+ return rc;
+ }
+ }
+
+ if ((uint32_t)rc < t->rcvq) {
+ flow_err(conn, "Short read migrating receive queue");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
+ * @conn: Pointer to the TCP connection structure
+ * @t: Extended migration data
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
+ const struct tcp_tap_transfer_ext *t)
+{
+ const struct tcp_repair_opt opts[] = {
+ { TCPOPT_WINDOW, t->snd_ws + (t->rcv_ws << 16) },
+ { TCPOPT_MAXSEG, t->mss },
+ { TCPOPT_SACK_PERMITTED, 0 },
+ { TCPOPT_TIMESTAMP, 0 },
+ };
+ socklen_t sl;
+
+ sl = sizeof(opts[0]) * (2 +
+ !!(t->tcpi_options & TCPI_OPT_SACK) +
+ !!(t->tcpi_options & TCPI_OPT_TIMESTAMPS));
+
+ if (setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+ int rc = -errno;
+ flow_perror(conn, "Setting repair options");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send flow table entry, close listening socket
+ * @fd: Descriptor for state migration
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer t = {
+ .retries = conn->retries,
+ .ws_from_tap = conn->ws_from_tap,
+ .ws_to_tap = conn->ws_to_tap,
+ .events = conn->events,
+
+ .tap_mss = htonl(MSS_GET(conn)),
+
+ .sndbuf = htonl(conn->sndbuf),
+
+ .flags = conn->flags,
+ .seq_dup_ack_approx = conn->seq_dup_ack_approx,
+
+ .wnd_from_tap = htons(conn->wnd_from_tap),
+ .wnd_to_tap = htons(conn->wnd_to_tap),
+
+ .seq_to_tap = htonl(conn->seq_to_tap),
+ .seq_ack_from_tap = htonl(conn->seq_ack_from_tap),
+ .seq_from_tap = htonl(conn->seq_from_tap),
+ .seq_ack_to_tap = htonl(conn->seq_ack_to_tap),
+ .seq_init_from_tap = htonl(conn->seq_init_from_tap),
+ };
+
+ memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+ memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+ if (write_all_buf(fd, &t, sizeof(t))) {
+ int rc = -errno;
+ err_perror("Can't write migration data, socket %i", conn->sock);
+ return rc;
+ }
+
+ if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
+ close(conn->listening_sock);
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c: Execution context
+ * @fd: Descriptor for state migration
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
+ */
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+ int fd, const struct tcp_tap_conn *conn)
+{
+ uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+ struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
+ int s = conn->sock;
+ int rc;
+
+	/* Disable SO_PEEK_OFF, as it would interfere with accessing the queues
+	 * in repair mode.
+	 */
+ if (tcp_set_peek_offset(conn, -1)) {
+ rc = -errno;
+ goto fail;
+ }
+
+ if ((rc = tcp_flow_dump_tinfo(conn, t)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_mss(conn, t)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_timestamp(conn, t)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_wnd(conn, t)))
+ goto fail;
+
+ if ((rc = tcp_flow_select_queue(conn, TCP_SEND_QUEUE)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_sndqueue(conn, t)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_seq(conn, &t->seq_snd)))
+ goto fail;
+
+ if ((rc = tcp_flow_select_queue(conn, TCP_RECV_QUEUE)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_rcvqueue(conn, t)))
+ goto fail;
+
+ if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
+ goto fail;
+
+ if (c->migrate_no_linger)
+ close(s);
+ else
+ epoll_del(flow_epollfd(&conn->f), s);
+
+ /* Adjustments unrelated to FIN segments: sequence numbers we dumped are
+ * based on the end of the queues.
+ */
+ t->seq_rcv -= t->rcvq;
+ t->seq_snd -= t->sndq;
+
+ flow_dbg(conn, "Extended migration data, socket %i sequences send %u receive %u",
+ s, t->seq_snd, t->seq_rcv);
+ flow_dbg(conn, " pending queues: send %u not sent %u receive %u",
+ t->sndq, t->notsent, t->rcvq);
+ flow_dbg(conn, " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+ flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+ /* Endianness fix-ups */
+ t->seq_snd = htonl(t->seq_snd);
+ t->seq_rcv = htonl(t->seq_rcv);
+ t->sndq = htonl(t->sndq);
+ t->notsent = htonl(t->notsent);
+ t->rcvq = htonl(t->rcvq);
+ t->mss = htonl(t->mss);
+ t->timestamp = htonl(t->timestamp);
+
+ t->snd_wl1 = htonl(t->snd_wl1);
+ t->snd_wnd = htonl(t->snd_wnd);
+ t->max_window = htonl(t->max_window);
+ t->rcv_wnd = htonl(t->rcv_wnd);
+ t->rcv_wup = htonl(t->rcv_wup);
+
+ if (write_all_buf(fd, t, sizeof(*t))) {
+ flow_perror(conn, "Failed to write extended data");
+ return -EIO;
+ }
+
+ if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) {
+ flow_perror(conn, "Failed to write send queue data");
+ return -EIO;
+ }
+
+ if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) {
+ flow_perror(conn, "Failed to write receive queue data");
+ return -EIO;
+ }
+
+ return 0;
+
+fail:
+ /* For any type of failure dumping data, write an invalid extended data
+ * descriptor that allows us to keep the stream in sync, but tells the
+ * target to skip the flow. If we fail to transfer data, that's fatal:
+ * return -EIO in that case (and only in that case).
+ */
+ t->tcpi_state = 0; /* Not defined: tell the target to skip this flow */
+
+ if (write_all_buf(fd, t, sizeof(*t))) {
+ flow_perror(conn, "Failed to write extended data");
+ return -EIO;
+ }
+
+	if (rc == -EIO) /* a dump failure, not a transfer failure: remap it */
+ return -ENODATA;
+
+ return rc;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+ int s, rc;
+
+ if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ IPPROTO_TCP)) < 0) {
+ rc = -errno;
+ flow_perror(conn, "Failed to create socket for migrated flow");
+ return rc;
+ }
+ s = conn->sock;
+
+ if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
+ flow_dbg_perror(conn, "Failed to set SO_REUSEADDR on socket %i",
+ s);
+
+ tcp_sock_set_nodelay(s);
+
+ if ((rc = tcp_flow_repair_on(c, conn)))
+ goto err;
+
+ return 0;
+
+err:
+ close(s);
+ conn->sock = -1;
+ return rc;
+}
+
+/**
+ * tcp_flow_repair_bind() - Bind socket in repair mode
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_bind(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ const struct flowside *sockside = HOSTFLOW(conn);
+ union sockaddr_inany a;
+
+ pif_sockaddr(c, &a, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+ if (bind(conn->sock, &a.sa, socklen_inany(&a))) {
+ int rc = -errno;
+ flow_perror(conn, "Failed to bind socket for migrated flow");
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+ struct tcp_tap_conn *conn)
+{
+ const struct flowside *tgt = HOSTFLOW(conn);
+ int rc;
+
+ rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+ if (rc) {
+ rc = -errno;
+ flow_perror(conn, "Failed to connect migrated socket");
+ return rc;
+ }
+
+ flow_epollid_clear(&conn->f);
+ conn->timer = -1;
+ conn->listening_sock = -1;
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c: Execution context
+ * @fd: Descriptor for state migration
+ *
+ * Return: 0 on success or single-flow failure, negative only on fatal failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+ struct tcp_tap_transfer t;
+ struct tcp_tap_conn *conn;
+ union flow *flow;
+ int rc;
+
+ if (!(flow = flow_alloc())) {
+ err("Flow table full on migration target");
+ return 0;
+ }
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		rc = -errno;
+		flow_perror(flow, "Failed to receive migration data");
+		flow_alloc_cancel(flow);
+		return rc;
+	}
+
+ flow->f.state = FLOW_STATE_TGT;
+ memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+ memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+ conn->retries = t.retries;
+ conn->ws_from_tap = t.ws_from_tap;
+ conn->ws_to_tap = t.ws_to_tap;
+ conn->events = t.events;
+
+ conn->sndbuf = htonl(t.sndbuf);
+
+ conn->flags = t.flags;
+ conn->seq_dup_ack_approx = t.seq_dup_ack_approx;
+
+ MSS_SET(conn, ntohl(t.tap_mss));
+
+ conn->wnd_from_tap = ntohs(t.wnd_from_tap);
+ conn->wnd_to_tap = ntohs(t.wnd_to_tap);
+
+ conn->seq_to_tap = ntohl(t.seq_to_tap);
+ conn->seq_ack_from_tap = ntohl(t.seq_ack_from_tap);
+ conn->seq_from_tap = ntohl(t.seq_from_tap);
+ conn->seq_ack_to_tap = ntohl(t.seq_ack_to_tap);
+ conn->seq_init_from_tap = ntohl(t.seq_init_from_tap);
+
+ if ((rc = tcp_flow_repair_socket(c, conn))) {
+ flow_err(flow, "Can't set up socket: %s, drop", strerror_(-rc));
+ /* Can't leave the flow in an incomplete state */
+ FLOW_ACTIVATE(conn);
+ return 0;
+ }
+
+ flow_hash_insert(c, TAP_SIDX(conn));
+ FLOW_ACTIVATE(conn);
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c: Execution context
+ * @conn: Connection entry to complete with extra data
+ * @fd: Descriptor for state migration
+ *
+ * Return: 0 on success or single-flow failure, negative only on fatal failure
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd)
+{
+ uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+ struct tcp_tap_transfer_ext t;
+ int s = conn->sock, rc;
+
+ if (read_all_buf(fd, &t, sizeof(t))) {
+ rc = -errno;
+ flow_perror(conn, "Failed to read extended data");
+ return rc;
+ }
+
+ if (!t.tcpi_state) { /* Source wants us to skip this flow */
+ flow_err(conn, "Dropping as requested by source");
+ goto fail;
+ }
+
+ /* Endianness fix-ups */
+ t.seq_snd = ntohl(t.seq_snd);
+ t.seq_rcv = ntohl(t.seq_rcv);
+ t.sndq = ntohl(t.sndq);
+ t.notsent = ntohl(t.notsent);
+ t.rcvq = ntohl(t.rcvq);
+ t.mss = ntohl(t.mss);
+ t.timestamp = ntohl(t.timestamp);
+
+ t.snd_wl1 = ntohl(t.snd_wl1);
+ t.snd_wnd = ntohl(t.snd_wnd);
+ t.max_window = ntohl(t.max_window);
+ t.rcv_wnd = ntohl(t.rcv_wnd);
+ t.rcv_wup = ntohl(t.rcv_wup);
+
+ flow_dbg(conn,
+ "Extended migration data, socket %i sequences send %u receive %u",
+ s, t.seq_snd, t.seq_rcv);
+ flow_dbg(conn, " pending queues: send %u not sent %u receive %u",
+ t.sndq, t.notsent, t.rcvq);
+ flow_dbg(conn,
+ " window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+ flow_dbg(conn, " SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+ if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
+ t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+ flow_err(conn,
+ "Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+ s, t.sndq, t.notsent, t.rcvq);
+ return -EINVAL;
+ }
+
+ if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
+ rc = -errno;
+ flow_perror(conn, "Failed to read send queue data");
+ return rc;
+ }
+
+ if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
+ rc = -errno;
+ flow_perror(conn, "Failed to read receive queue data");
+ return rc;
+ }
+
+ if (conn->sock < 0)
+ /* We weren't able to create the socket, discard flow */
+ goto fail;
+
+ if (tcp_flow_repair_bind(c, conn))
+ goto fail;
+
+ if (tcp_flow_repair_timestamp(conn, &t))
+ goto fail;
+
+ if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+ goto fail;
+
+ if (tcp_flow_repair_seq(conn, &t.seq_snd))
+ goto fail;
+
+ if (tcp_flow_select_queue(conn, TCP_RECV_QUEUE))
+ goto fail;
+
+ if (tcp_flow_repair_seq(conn, &t.seq_rcv))
+ goto fail;
+
+ if (tcp_flow_repair_connect(c, conn))
+ goto fail;
+
+ if (tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue))
+ goto fail;
+
+ if (tcp_flow_select_queue(conn, TCP_SEND_QUEUE))
+ goto fail;
+
+ if (tcp_flow_repair_queue(conn, t.sndq - t.notsent,
+ tcp_migrate_snd_queue))
+ goto fail;
+
+ if (tcp_flow_repair_opt(conn, &t))
+ goto fail;
+
+	/* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
+ * send it out, because we already sent it for sure.
+ *
+ * Call shutdown(x, SHUT_WR) in repair mode, so that we move to
+ * FIN_WAIT_1 (tcp_shutdown()) without sending anything
+ * (goto in tcp_write_xmit()).
+ */
+ if (t.tcpi_state == TCP_FIN_WAIT2) {
+ int v;
+
+ v = TCP_SEND_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+ flow_perror(conn, "Selecting repair queue");
+ else
+ shutdown(s, SHUT_WR);
+ }
+
+ if (tcp_flow_repair_wnd(conn, &t))
+ goto fail;
+
+ tcp_flow_repair_off(c, conn);
+ repair_flush(c);
+
+ if (t.notsent) {
+ if (tcp_flow_repair_queue(conn, t.notsent,
+ tcp_migrate_snd_queue +
+ (t.sndq - t.notsent))) {
+ /* This sometimes seems to fail for unclear reasons.
+ * Don't fail the whole migration, just reset the flow
+ * and carry on to the next one.
+ */
+ goto fail;
+ }
+ }
+
+ /* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send
+ * it out, because we don't know if we already sent it.
+ *
+ * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
+ * TCP_FIN_WAIT1.
+ */
+ if (t.tcpi_state == TCP_FIN_WAIT1)
+ shutdown(s, SHUT_WR);
+
+ if (tcp_set_peek_offset(conn, peek_offset))
+ goto fail;
+
+ tcp_send_flag(c, conn, ACK);
+ tcp_data_from_sock(c, conn);
+
+ if ((rc = tcp_epoll_ctl(c, conn))) {
+ flow_dbg(conn,
+ "Failed to subscribe to epoll for migrated socket: %s",
+ strerror_(-rc));
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ if (conn->sock >= 0) {
+ tcp_flow_repair_off(c, conn);
+ repair_flush(c);
+ }
+
+ conn->flags = 0; /* Not waiting for ACK, don't schedule timer */
+ tcp_rst(c, conn);
+
+ return 0;
+}
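
Since the function above interleaves the restore steps with error handling, endianness fix-ups and logging, its overall order is easier to follow stripped down. The following is only a condensed restatement of the code above (same names, error paths and repair_flush() batching details omitted), not an alternative implementation:

	tcp_flow_repair_bind(c, conn);			/* bind to original address/port */
	tcp_flow_repair_timestamp(conn, &t);		/* RFC 7323 timestamp, if negotiated */
	tcp_flow_select_queue(conn, TCP_SEND_QUEUE);
	tcp_flow_repair_seq(conn, &t.seq_snd);
	tcp_flow_select_queue(conn, TCP_RECV_QUEUE);
	tcp_flow_repair_seq(conn, &t.seq_rcv);
	tcp_flow_repair_connect(c, conn);		/* connect() while still in repair mode */
	tcp_flow_repair_queue(conn, t.rcvq, tcp_migrate_rcv_queue);
	tcp_flow_select_queue(conn, TCP_SEND_QUEUE);
	tcp_flow_repair_queue(conn, t.sndq - t.notsent, tcp_migrate_snd_queue);
	tcp_flow_repair_opt(conn, &t);			/* MSS, window scale, SACK, timestamps */
	/* shutdown(s, SHUT_WR) in repair mode if the dumped state was TCP_FIN_WAIT2 */
	tcp_flow_repair_wnd(conn, &t);
	tcp_flow_repair_off(c, conn);
	repair_flush(c);
	/* then, out of repair mode: send the not-yet-sent tail of the send queue,
	 * shutdown(s, SHUT_WR) if the dumped state was TCP_FIN_WAIT1, restore
	 * SO_PEEK_OFF, send an ACK to tap, and register the socket with epoll
	 */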
+
+/**
+ * tcp_prepare_iov() - Prepare iov depending on SO_PEEK_OFF availability
+ * @msg: Message header to update
+ * @iov: iovec to receive TCP payload and data to discard
+ * @already_sent: Bytes sent after the last acknowledged one
+ * @payload_iov_cnt: Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt)
+{
+ /*
+ * IOV layout
+ * |- tcp_buf_discard -|---------- TCP data slots ------------|
+ *
+ * with discarded data:
+ * |------ddddddddddddd|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ *
+ * without discarded data:
+ * |-------------------|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ * d: discard data
+ * t: TCP data
+ */
+ if (peek_offset_cap) {
+ msg->msg_iov = iov + DISCARD_IOV_NUM;
+ msg->msg_iovlen = payload_iov_cnt;
+ } else {
+ int discard_cnt, discard_iov_rem;
+ struct iovec *iov_start;
+ int i;
+
+ discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+ if (discard_cnt > DISCARD_IOV_NUM) {
+ debug("Failed to discard %u already sent bytes",
+ already_sent);
+ return -1;
+ }
+
+ discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+ iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+ /* Multiple iov entries pointing to the same buffer */
+ for (i = 0; i < discard_cnt; i++) {
+ iov_start[i].iov_base = tcp_buf_discard;
+ iov_start[i].iov_len = BUF_DISCARD_SIZE;
+ }
+ if (discard_iov_rem)
+ iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+ msg->msg_iov = iov_start;
+ msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+ }
+
+ return 0;
}
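
As a usage sketch of tcp_prepare_iov() (illustrative only: the payload buffer and single payload slot are made up for the example, and the real callers differ in detail), a reader of socket data would reserve DISCARD_IOV_NUM leading slots and then peek:

	struct iovec iov[DISCARD_IOV_NUM + 1];
	struct msghdr msg = { 0 };
	char payload[16384];	/* illustrative payload buffer */
	uint32_t already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
	ssize_t n;

	iov[DISCARD_IOV_NUM].iov_base = payload;
	iov[DISCARD_IOV_NUM].iov_len = sizeof(payload);

	if (tcp_prepare_iov(&msg, iov, already_sent, 1) < 0)
		return;		/* too many already-sent bytes to discard */

	n = recvmsg(conn->sock, &msg, MSG_PEEK);
	/* With SO_PEEK_OFF support, all of 'n' is new data; without it, the
	 * first 'already_sent' bytes were re-read into tcp_buf_discard and
	 * only the remainder is new.
	 */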