about git code bugs lists chat
path: root/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'tcp.c')
-rw-r--r-- tcp.c 509
1 files changed, 252 insertions, 257 deletions
diff --git a/tcp.c b/tcp.c
index c0820ce..ec433f7 100644
--- a/tcp.c
+++ b/tcp.c
@@ -274,6 +274,7 @@
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
+#include <netinet/tcp.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
@@ -286,8 +287,6 @@
#include <time.h>
#include <arpa/inet.h>
-#include <linux/tcp.h> /* For struct tcp_info */
-
#include "checksum.h"
#include "util.h"
#include "iov.h"
@@ -300,19 +299,16 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "linux_dep.h"
#include "flow_table.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
+#include "tcp_vu.h"
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
-#endif
#define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */
@@ -323,11 +319,6 @@
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
-/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
- * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
- */
-#define SOL_TCP IPPROTO_TCP
-
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
@@ -361,8 +352,8 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
-/* Table of guest side forwarding addresses with very low RTT (assumed
- * to be local to the host), LRU
+/* Table of our guest side addresses with very low RTT (assumed to be local to
+ * the host), LRU
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
@@ -371,6 +362,20 @@ char tcp_buf_discard [MAX_WINDOW];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
+/* Size of data returned by TCP_INFO getsockopt() */
+socklen_t tcp_info_size;
+
+#define tcp_info_cap(f_) \
+ ((offsetof(struct tcp_info_linux, tcpi_##f_) + \
+ sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
+
+/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
+#define snd_wnd_cap tcp_info_cap(snd_wnd)
+/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
+#define bytes_acked_cap tcp_info_cap(bytes_acked)
+/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
+#define min_rtt_cap tcp_info_cap(min_rtt)
+
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -440,7 +445,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
if (events == TAP_SYN_RCVD)
return EPOLLOUT | EPOLLET | EPOLLRDHUP;
- return EPOLLRDHUP;
+ return EPOLLET | EPOLLRDHUP;
}
/**
@@ -511,7 +516,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
fd = timerfd_create(CLOCK_MONOTONIC, 0);
if (fd == -1 || fd > FD_REF_MAX) {
flow_dbg(conn, "failed to get timer: %s",
- strerror(errno));
+ strerror_(errno));
if (fd > -1)
close(fd);
conn->timer = -1;
@@ -521,7 +526,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
flow_dbg(conn, "failed to add timer: %s",
- strerror(errno));
+ strerror_(errno));
close(conn->timer);
conn->timer = -1;
return;
@@ -545,7 +550,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
(unsigned long long)it.it_value.tv_sec,
(unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
- timerfd_settime(conn->timer, 0, &it, NULL);
+ if (timerfd_settime(conn->timer, 0, &it, NULL))
+ flow_err(conn, "failed to set timer: %s", strerror_(errno));
}
/**
@@ -663,7 +669,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
- if (inany_equals(&tapside->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
return 1;
return 0;
@@ -675,18 +681,17 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
* @tinfo: Pointer to struct tcp_info for socket
*/
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
- const struct tcp_info *tinfo)
+ const struct tcp_info_linux *tinfo)
{
-#ifdef HAS_MIN_RTT
const struct flowside *tapside = TAPFLOW(conn);
int i, hole = -1;
- if (!tinfo->tcpi_min_rtt ||
+ if (!min_rtt_cap ||
(int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
return;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
- if (inany_equals(&tapside->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
return;
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
hole = i;
@@ -698,14 +703,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == -1)
return;
- low_rtt_dst[hole++] = tapside->faddr;
+ low_rtt_dst[hole++] = tapside->oaddr;
if (hole == LOW_RTT_TABLE_SIZE)
hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
-#else
- (void)conn;
- (void)tinfo;
-#endif /* HAS_MIN_RTT */
}
/**
@@ -752,34 +753,16 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
}
/**
- * tcp_update_check_tcp4() - Update TCP checksum from stored one
- * @iph: IPv4 header
- * @th: TCP header followed by TCP payload
- */
-static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
-{
- uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
- struct in_addr saddr = { .s_addr = iph->saddr };
- struct in_addr daddr = { .s_addr = iph->daddr };
- uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
-
- th->check = 0;
- th->check = csum(th, l4len, sum);
-}
-
-/**
- * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
- * @ip6h: IPv6 header
- * @th: TCP header followed by TCP payload
+ * tcp_update_csum() - Calculate TCP checksum
+ * @psum: Unfolded partial checksum of the IPv4 or IPv6 pseudo-header
+ * @th: TCP header, check field is zeroed, then set to the result
+ * @payload: TCP payload, summed after the TCP header
*/
-static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
+void tcp_update_csum(uint32_t psum, struct tcphdr *th, struct iov_tail *payload)
{
- uint16_t l4len = ntohs(ip6h->payload_len);
- uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
- &ip6h->saddr, &ip6h->daddr);
-
th->check = 0;
- th->check = csum(th, l4len, sum);
+ psum = csum_unfolded(th, sizeof(*th), psum);
+ th->check = csum_iov_tail(payload, psum);
}
/**
@@ -865,7 +848,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c)
{
- tcp_flags_flush(c);
tcp_payload_flush(c);
}
@@ -881,7 +863,7 @@ static void tcp_fill_header(struct tcphdr *th,
{
const struct flowside *tapside = TAPFLOW(conn);
- th->source = htons(tapside->fport);
+ th->source = htons(tapside->oport);
th->dest = htons(tapside->eport);
th->seq = htonl(seq);
th->ack_seq = htonl(conn->seq_ack_to_tap);
@@ -895,116 +877,82 @@ static void tcp_fill_header(struct tcphdr *th,
}
/**
- * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
- * @conn: Connection pointer
- * @taph: tap backend specific header
- * @iph: Pointer to IPv4 header
- * @th: Pointer to TCP header
- * @dlen: TCP payload length
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: The IPv4 payload length, host order
+ * tcp_fill_headers() - Fill 802.3, IP, TCP headers
+ * @conn: Connection pointer
+ * @taph: tap backend specific header
+ * @ip4h: Pointer to IPv4 header, or NULL
+ * @ip6h: Pointer to IPv6 header, or NULL
+ * @th: Pointer to TCP header
+ * @payload: TCP payload
+ * @ip4_check: IPv4 header checksum if already known, NULL to compute it
+ * @seq: Sequence number for this segment
+ * @no_tcp_csum: Do not compute TCP checksum, leave it as zero
*/
-static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
- struct iphdr *iph, struct tcphdr *th,
- size_t dlen, const uint16_t *check,
- uint32_t seq)
+void tcp_fill_headers(const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph,
+ struct iphdr *ip4h, struct ipv6hdr *ip6h,
+ struct tcphdr *th, struct iov_tail *payload,
+ const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
{
const struct flowside *tapside = TAPFLOW(conn);
- const struct in_addr *src4 = inany_v4(&tapside->faddr);
- const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
- size_t l4len = dlen + sizeof(*th);
- size_t l3len = l4len + sizeof(*iph);
+ size_t l4len = iov_tail_size(payload) + sizeof(*th);
+ size_t l3len = l4len;
+ uint32_t psum = 0;
- ASSERT(src4 && dst4);
+ if (ip4h) {
+ const struct in_addr *src4 = inany_v4(&tapside->oaddr);
+ const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
- iph->tot_len = htons(l3len);
- iph->saddr = src4->s_addr;
- iph->daddr = dst4->s_addr;
+ ASSERT(src4 && dst4);
- iph->check = check ? *check :
- csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
+ l3len += sizeof(*ip4h);
- tcp_fill_header(th, conn, seq);
+ ip4h->tot_len = htons(l3len);
+ ip4h->saddr = src4->s_addr;
+ ip4h->daddr = dst4->s_addr;
- tcp_update_check_tcp4(iph, th);
+ if (ip4_check)
+ ip4h->check = *ip4_check;
+ else
+ ip4h->check = csum_ip4_header(l3len, IPPROTO_TCP,
+ *src4, *dst4);
- tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
+ if (!no_tcp_csum) {
+ psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
+ *src4, *dst4);
+ }
+ }
- return l4len;
-}
+ if (ip6h) {
+ l3len += sizeof(*ip6h);
-/**
- * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
- * @conn: Connection pointer
- * @taph: tap backend specific header
- * @ip6h: Pointer to IPv6 header
- * @th: Pointer to TCP header
- * @dlen: TCP payload length
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: The IPv6 payload length, host order
- */
-static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
- struct ipv6hdr *ip6h, struct tcphdr *th,
- size_t dlen, uint32_t seq)
-{
- const struct flowside *tapside = TAPFLOW(conn);
- size_t l4len = dlen + sizeof(*th);
+ ip6h->payload_len = htons(l4len);
+ ip6h->saddr = tapside->oaddr.a6;
+ ip6h->daddr = tapside->eaddr.a6;
- ip6h->payload_len = htons(l4len);
- ip6h->saddr = tapside->faddr.a6;
- ip6h->daddr = tapside->eaddr.a6;
+ ip6h->hop_limit = 255;
+ ip6h->version = 6;
+ ip6h->nexthdr = IPPROTO_TCP;
- ip6h->hop_limit = 255;
- ip6h->version = 6;
- ip6h->nexthdr = IPPROTO_TCP;
+ ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
+ ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
+ ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
- ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf;
- ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
- ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
+ if (!no_tcp_csum) {
+ psum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
+ &ip6h->saddr,
+ &ip6h->daddr);
+ }
+ }
tcp_fill_header(th, conn, seq);
- tcp_update_check_tcp6(ip6h, th);
-
- tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
-
- return l4len;
-}
-
-/**
- * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
- * @conn: Connection pointer
- * @iov: Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen: TCP payload length
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: IP payload length, host order
- */
-size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
- struct iovec *iov, size_t dlen,
- const uint16_t *check, uint32_t seq)
-{
- const struct flowside *tapside = TAPFLOW(conn);
- const struct in_addr *a4 = inany_v4(&tapside->faddr);
-
- if (a4) {
- return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- check, seq);
- }
+ if (no_tcp_csum)
+ th->check = 0;
+ else
+ tcp_update_csum(psum, th, payload);
- return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- seq);
+ tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
}
/**
@@ -1017,42 +965,41 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
* Return: 1 if sequence or window were updated, 0 otherwise
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
- int force_seq, struct tcp_info *tinfo)
+ bool force_seq, struct tcp_info_linux *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
socklen_t sl = sizeof(*tinfo);
- struct tcp_info tinfo_new;
+ struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
int s = conn->sock;
-#ifndef HAS_BYTES_ACKED
- (void)force_seq;
-
- conn->seq_ack_to_tap = conn->seq_from_tap;
- if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
- conn->seq_ack_to_tap = prev_ack_to_tap;
-#else
- if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
- || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
+ if (!bytes_acked_cap) {
conn->seq_ack_to_tap = conn->seq_from_tap;
- } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
- if (!tinfo) {
- tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
- return 0;
- }
-
- conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
- conn->seq_init_from_tap;
-
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap;
+ } else {
+ if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
+ tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
+ (conn->flags & LOCAL) || force_seq) {
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+ } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
+ if (!tinfo) {
+ tinfo = &tinfo_new;
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ return 0;
+ }
+
+ conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+ conn->seq_init_from_tap;
+
+ if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+ conn->seq_ack_to_tap = prev_ack_to_tap;
+ }
}
-#endif /* !HAS_BYTES_ACKED */
- if (!KERNEL_REPORTS_SND_WND(c)) {
+ if (!snd_wnd_cap) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@@ -1063,14 +1010,13 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if (!tinfo) {
if (prev_wnd_to_tap > WINDOW_DEFAULT) {
goto out;
-}
+ }
tinfo = &tinfo_new;
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
goto out;
-}
+ }
}
-#ifdef HAS_SND_WND
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
@@ -1078,7 +1024,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
SNDBUF_GET(conn));
}
-#endif
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
if (!(conn->events & ESTABLISHED))
@@ -1136,44 +1081,35 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* 0 if there is no flag to send
* 1 otherwise
*/
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
- int flags, struct tcphdr *th, char *data,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
+ int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
size_t *optlen)
{
- struct tcp_info tinfo = { 0 };
+ struct tcp_info_linux tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
- !flags && conn->wnd_to_tap)
+ !flags && conn->wnd_to_tap) {
+ conn_flag(c, conn, ~ACK_TO_TAP_DUE);
return 0;
+ }
if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
conn_event(c, conn, CLOSED);
return -ECONNRESET;
}
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
- if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
+ if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
return 0;
*optlen = 0;
if (flags & SYN) {
int mss;
- /* Options: MSS, NOP and window scale (8 bytes) */
- *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
-
- *data++ = OPT_MSS;
- *data++ = OPT_MSS_LEN;
-
if (c->mtu == -1) {
mss = tinfo.tcpi_snd_mss;
} else {
@@ -1189,16 +1125,11 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
else if (mss > PAGE_SIZE)
mss = ROUND_DOWN(mss, PAGE_SIZE);
}
- *(uint16_t *)data = htons(MIN(USHRT_MAX, mss));
-
- data += OPT_MSS_LEN - 2;
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
- *data++ = OPT_NOP;
- *data++ = OPT_WS;
- *data++ = OPT_WS_LEN;
- *data++ = conn->ws_to_tap;
+ *opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
+ *optlen = sizeof(*opts);
} else if (!(flags & RST)) {
flags |= ACK;
}
@@ -1235,8 +1166,12 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: negative error code on connection reset, 0 otherwise
*/
-int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+ int flags)
{
+ if (c->mode == MODE_VU)
+ return tcp_vu_send_flag(c, conn, flags);
+
return tcp_buf_send_flag(c, conn, flags);
}
@@ -1245,7 +1180,7 @@ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @c: Execution context
* @conn: Connection pointer
*/
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1335,7 +1270,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
{
int s;
- s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+ s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
if (s > FD_REF_MAX) {
close(s);
@@ -1372,7 +1307,7 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
return s;
err("TCP: Unable to open socket for new connection: %s",
- strerror(-s));
+ strerror_(-s));
return -1;
}
@@ -1417,15 +1352,15 @@ static void tcp_bind_outbound(const struct ctx *c,
socklen_t sl;
- pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->faddr, tgt->fport);
- if (!inany_is_unspecified(&tgt->faddr) || tgt->fport) {
+ pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport);
+ if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) {
if (bind(s, &bind_sa.sa, sl)) {
char sstr[INANY_ADDRSTRLEN];
flow_dbg(conn,
"Can't bind TCP outbound socket to %s:%hu: %s",
- inany_ntop(&tgt->faddr, sstr, sizeof(sstr)),
- tgt->fport, strerror(errno));
+ inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+ tgt->oport, strerror_(errno));
}
}
@@ -1436,7 +1371,7 @@ static void tcp_bind_outbound(const struct ctx *c,
strlen(c->ip4.ifname_out))) {
flow_dbg(conn, "Can't bind IPv4 TCP socket to"
" interface %s: %s", c->ip4.ifname_out,
- strerror(errno));
+ strerror_(errno));
}
}
} else if (bind_sa.sa_family == AF_INET6) {
@@ -1446,7 +1381,7 @@ static void tcp_bind_outbound(const struct ctx *c,
strlen(c->ip6.ifname_out))) {
flow_dbg(conn, "Can't bind IPv6 TCP socket to"
" interface %s: %s", c->ip6.ifname_out,
- strerror(errno));
+ strerror_(errno));
}
}
}
@@ -1463,7 +1398,7 @@ static void tcp_bind_outbound(const struct ctx *c,
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now)
@@ -1497,12 +1432,12 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 ||
- !inany_is_unicast(&ini->faddr) || ini->fport == 0) {
+ !inany_is_unicast(&ini->oaddr) || ini->oport == 0) {
char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN];
debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport,
- inany_ntop(&ini->faddr, dstr, sizeof(dstr)), ini->fport);
+ inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport);
goto cancel;
}
@@ -1628,8 +1563,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
*
* #syscalls recvmsg
*/
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
+ if (c->mode == MODE_VU)
+ return tcp_vu_data_from_sock(c, conn);
+
return tcp_buf_data_from_sock(c, conn);
}
@@ -1644,8 +1582,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
*
* Return: count of consumed packets
*/
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
- const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+ const struct pool *p, int idx)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@@ -1686,6 +1624,20 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
continue;
seq = ntohl(th->seq);
+ if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
+ flow_trace(conn,
+ "keep-alive sequence: %u, previous: %u",
+ seq, conn->seq_from_tap);
+
+ tcp_send_flag(c, conn, ACK);
+ tcp_timer_ctl(c, conn);
+
+ if (p->count == 1)
+ return 1;
+
+ continue;
+ }
+
ack_seq = ntohl(th->ack_seq);
if (th->ack) {
@@ -1842,7 +1794,8 @@ out:
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+ struct tcp_tap_conn *conn,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
@@ -1865,11 +1818,12 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
return;
}
+ tcp_send_flag(c, conn, ACK);
+
/* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now.
*/
tcp_data_from_sock(c, conn);
- tcp_send_flag(c, conn, ACK);
}
/**
@@ -1885,7 +1839,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now)
{
@@ -2023,7 +1977,7 @@ reset:
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
{
socklen_t sl;
int so;
@@ -2049,8 +2003,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
* @sa: Peer socket address (from accept())
* @now: Current timestamp
*/
-static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
- const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+ int s, const struct timespec *now)
{
struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
uint64_t hash;
@@ -2081,7 +2035,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
* @ref: epoll reference of listening socket
* @now: Current timestamp
*/
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
const struct flowside *ini;
@@ -2100,7 +2054,8 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
goto cancel;
/* FIXME: When listening port has a specific bound address, record that
- * as the forwarding address */
+ * as our address
+ */
ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
ref.tcp_listen.port);
@@ -2143,9 +2098,9 @@ cancel:
* @c: Execution context
* @ref: epoll reference of timer (not connection)
*
- * #syscalls timerfd_gettime
+ * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
struct itimerspec check_armed = { { 0 }, { 0 } };
struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@@ -2157,7 +2112,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* timer is currently armed, this event came from a previous setting,
* and we just set the timer to a new point in the future: discard it.
*/
- timerfd_gettime(conn->timer, &check_armed);
+ if (timerfd_gettime(conn->timer, &check_armed))
+ flow_err(conn, "failed to read timer: %s", strerror_(errno));
+
if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
return;
@@ -2195,7 +2152,10 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* case. This avoids having to preemptively reset the timer on
* ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
*/
- timerfd_settime(conn->timer, 0, &new, &old);
+ if (timerfd_settime(conn->timer, 0, &new, &old))
+ flow_err(conn, "failed to set timer: %s",
+ strerror_(errno));
+
if (old.it_value.tv_sec == ACT_TIMEOUT) {
flow_dbg(conn, "activity timeout");
tcp_rst(c, conn);
@@ -2209,7 +2169,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* @ref: epoll reference
* @events: epoll events bitmap
*/
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events)
{
struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
@@ -2240,7 +2201,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
tcp_data_from_sock(c, conn);
if (events & EPOLLOUT)
- tcp_update_seqack_wnd(c, conn, 0, NULL);
+ tcp_update_seqack_wnd(c, conn, false, NULL);
return;
}
@@ -2263,17 +2224,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
}
/**
- * tcp_sock_init_af() - Initialise listening socket for a given af and port
+ * tcp_sock_init_one() - Initialise listening socket for address and port
* @c: Execution context
- * @af: Address family to listen on
- * @port: Port, host order
- * @addr: Pointer to address for binding, NULL if not configured
+ * @addr: Pointer to address for binding, NULL for dual stack any
* @ifname: Name of interface to bind to, NULL if not configured
+ * @port: Port, host order
*
* Return: fd for the new listening socket, negative error code on failure
*/
-static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
- const void *addr, const char *ifname)
+static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
+ const char *ifname, in_port_t port)
{
union tcp_listen_epoll_ref tref = {
.port = port,
@@ -2281,12 +2241,13 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
};
int s;
- s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32);
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
+ ifname, port, tref.u32);
if (c->tcp.fwd_in.mode == FWD_AUTO) {
- if (af == AF_INET || af == AF_UNSPEC)
+ if (!addr || inany_v4(addr))
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
- if (af == AF_INET6 || af == AF_UNSPEC)
+ if (!addr || !inany_v4(addr))
tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
}
@@ -2300,31 +2261,32 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
/**
* tcp_sock_init() - Create listening sockets for a given host ("inbound") port
* @c: Execution context
- * @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
* Return: 0 on (partial) success, negative error code on (complete) failure
*/
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
const char *ifname, in_port_t port)
{
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
ASSERT(!c->no_tcp);
- if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
+ if (!addr && c->ifi4 && c->ifi6)
/* Attempt to get a dual stack socket */
- if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
+ if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
return 0;
/* Otherwise create a socket per IP version */
- if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4)
- r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
+ if ((!addr || inany_v4(addr)) && c->ifi4)
+ r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
+ ifname, port);
- if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
- r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
+ if ((!addr || !inany_v4(addr)) && c->ifi6)
+ r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
+ ifname, port);
if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
return 0;
@@ -2347,8 +2309,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA);
- s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback,
- NULL, port, tref.u32);
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
+ NULL, port, tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
else
@@ -2373,8 +2335,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA);
- s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback,
- NULL, port, tref.u32);
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
+ NULL, port, tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
else
@@ -2460,13 +2422,13 @@ static void tcp_sock_refill_init(const struct ctx *c)
int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 host socket pool: %s",
- strerror(-rc));
+ strerror_(-rc));
}
if (c->ifi6) {
int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 host socket pool: %s",
- strerror(-rc));
+ strerror_(-rc));
}
}
@@ -2476,7 +2438,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
*
* Return: true if supported, false otherwise
*/
-bool tcp_probe_peek_offset_cap(sa_family_t af)
+static bool tcp_probe_peek_offset_cap(sa_family_t af)
{
bool ret = false;
int s, optv = 0;
@@ -2494,6 +2456,34 @@ bool tcp_probe_peek_offset_cap(sa_family_t af)
}
/**
+ * tcp_probe_tcp_info() - Check what data TCP_INFO reports
+ *
+ * Return: Number of bytes returned by TCP_INFO getsockopt(), 0 on failure
+ */
+static socklen_t tcp_probe_tcp_info(void)
+{
+ struct tcp_info_linux tinfo;
+ socklen_t sl = sizeof(tinfo);
+ int s;
+
+ s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
+ warn_perror("Temporary TCP socket creation failed");
+ return 0;
+ }
+
+ if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ warn_perror("Failed to get TCP_INFO on temporary socket");
+ close(s);
+ return 0;
+ }
+
+ close(s);
+
+ return sl;
+}
+
+/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
*
@@ -2503,11 +2493,7 @@ int tcp_init(struct ctx *c)
{
ASSERT(!c->no_tcp);
- if (c->ifi4)
- tcp_sock4_iov_init(c);
-
- if (c->ifi6)
- tcp_sock6_iov_init(c);
+ tcp_sock_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
@@ -2526,6 +2512,15 @@ int tcp_init(struct ctx *c)
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+ tcp_info_size = tcp_probe_tcp_info();
+
+#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
+ STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+ dbg_tcpi(snd_wnd);
+ dbg_tcpi(bytes_acked);
+ dbg_tcpi(min_rtt);
+#undef dbg_tcpi
+
return 0;
}
@@ -2567,7 +2562,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
if (outbound)
tcp_ns_sock_init(c, port);
else
- tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
+ tcp_sock_init(c, NULL, NULL, port);
}
}
}