Diffstat (limited to 'tcp.c')
-rw-r--r--  tcp.c  1203
1 file changed, 589 insertions, 614 deletions
diff --git a/tcp.c b/tcp.c
index c4b6c09..c95dcaf 100644
--- a/tcp.c
+++ b/tcp.c
@@ -274,6 +274,7 @@
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
+#include <netinet/tcp.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
@@ -286,10 +287,9 @@
#include <time.h>
#include <arpa/inet.h>
-#include <linux/tcp.h> /* For struct tcp_info */
-
#include "checksum.h"
#include "util.h"
+#include "iov.h"
#include "ip.h"
#include "passt.h"
#include "tap.h"
@@ -299,28 +299,16 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "linux_dep.h"
#include "flow_table.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
-/* Sides of a flow as we use them in "tap" connections */
-#define SOCKSIDE 0
-#define TAPSIDE 1
-
-#define TCP_HASH_TABLE_LOAD 70 /* % */
-#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
-
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
-
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
-#endif
#define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */
@@ -331,18 +319,12 @@
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
-/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
- * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
- */
-#define SOL_TCP IPPROTO_TCP
-
-#define ACK_IF_NEEDED 0 /* See tcp_buf_send_flag() */
-
+#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
#define CONN_IS_CLOSING(conn) \
- ((conn->events & ESTABLISHED) && \
- (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
-#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
+ (((conn)->events & ESTABLISHED) && \
+ ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
+#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
static const char *tcp_event_str[] __attribute((__unused__)) = {
"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
@@ -370,29 +352,75 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
-/* Table of guest side forwarding addresses with very low RTT (assumed
- * to be local to the host), LRU
+/* Table of our guest side addresses with very low RTT (assumed to be local to
+ * the host), LRU
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
char tcp_buf_discard [MAX_WINDOW];
-/* sendmsg() to socket */
-static struct iovec tcp_iov [UIO_MAXIOV];
+/* Does the kernel support TCP_PEEK_OFF? */
+bool peek_offset_cap;
-#define CONN(idx) (&(FLOW(idx)->tcp))
+/* Size of data returned by TCP_INFO getsockopt() */
+socklen_t tcp_info_size;
-/* Table for lookup from remote address, local port, remote port */
-static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE];
+#define tcp_info_cap(f_) \
+ ((offsetof(struct tcp_info_linux, tcpi_##f_) + \
+ sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
-static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
- "Safe linear probing requires hash table larger than connection table");
+/* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
+#define snd_wnd_cap tcp_info_cap(snd_wnd)
+/* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
+#define bytes_acked_cap tcp_info_cap(bytes_acked)
+/* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
+#define min_rtt_cap tcp_info_cap(min_rtt)
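
The three *_cap macros above all reduce to a bounds check: a tcpi_ field counts as supported only if its end offset fits within the number of bytes the running kernel returned for TCP_INFO. A minimal standalone sketch of the same probe, using the plain struct tcp_info from netinet/tcp.h and hypothetical names (info_len, probe_tcp_info_len, field_supported) rather than the helpers in this file:

#include <stddef.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

static socklen_t info_len;	/* bytes of TCP_INFO the kernel filled in */

static void probe_tcp_info_len(void)
{
	struct tcp_info ti;
	socklen_t sl = sizeof(ti);
	int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	if (s < 0)
		return;		/* leave info_len at 0: nothing supported */

	if (!getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &sl))
		info_len = sl;

	close(s);
}

/* True if the whole tcpi_<f_> field lies within what the kernel returned */
#define field_supported(f_)						\
	((offsetof(struct tcp_info, tcpi_##f_) +			\
	  sizeof(((struct tcp_info *)NULL)->tcpi_##f_)) <= info_len)
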
+
+/* sendmsg() to socket */
+static struct iovec tcp_iov [UIO_MAXIOV];
/* Pools for pre-opened sockets (in init) */
int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
/**
+ * conn_at_sidx() - Get TCP connection specific flow at given sidx
+ * @sidx: Flow and side to retrieve
+ *
+ * Return: TCP connection at @sidx, or NULL if @sidx is invalid. Asserts if the
+ * flow at @sidx is not FLOW_TCP.
+ */
+static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx)
+{
+ union flow *flow = flow_at_sidx(sidx);
+
+ if (!flow)
+ return NULL;
+
+ ASSERT(flow->f.type == FLOW_TCP);
+ return &flow->tcp;
+}
+
+/**
+ * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
+ * @s: Socket to update
+ * @offset: Offset in bytes
+ *
+ * Return: -1 when it fails, 0 otherwise.
+ */
+int tcp_set_peek_offset(int s, int offset)
+{
+ if (!peek_offset_cap)
+ return 0;
+
+ if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) {
+ err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
+ return -1;
+ }
+ return 0;
+}
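
tcp_set_peek_offset() relies on the kernel's SO_PEEK_OFF semantics for TCP: once the option is set, MSG_PEEK reads start at the stored offset and the offset advances with every peek, so data already queued towards the guest is not copied out of the socket again. A small self-contained sketch (peek_twice() is an illustrative name, not part of this file), assuming a kernel that accepts SO_PEEK_OFF on TCP sockets:

#include <stddef.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t peek_twice(int s, char *buf, size_t len)
{
	int off = 0;
	ssize_t n;

	/* Older kernels reject SO_PEEK_OFF on TCP: caller must fall back */
	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off)))
		return -1;

	n = recv(s, buf, len, MSG_PEEK);	/* peeks starting at offset 0 */
	if (n <= 0)
		return n;

	/* The peek offset advanced by n: this continues where the first
	 * peek stopped instead of returning the same bytes again.
	 */
	return recv(s, buf, len, MSG_PEEK);
}
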
+
+/**
* tcp_conn_epoll_events() - epoll events mask for given connection state
* @events: Current connection events
* @conn_flags: Connection flags
@@ -417,7 +445,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
if (events == TAP_SYN_RCVD)
return EPOLLOUT | EPOLLET | EPOLLRDHUP;
- return EPOLLRDHUP;
+ return EPOLLET | EPOLLRDHUP;
}
/**
@@ -431,7 +459,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
- .flowside = FLOW_SIDX(conn, SOCKSIDE) };
+ .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
struct epoll_event ev = { .data.u64 = ref.u64 };
if (conn->events == CLOSED) {
@@ -522,7 +550,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
(unsigned long long)it.it_value.tv_sec,
(unsigned long long)it.it_value.tv_nsec / 1000 / 1000);
- timerfd_settime(conn->timer, 0, &it, NULL);
+ if (timerfd_settime(conn->timer, 0, &it, NULL))
+ flow_err(conn, "failed to set timer: %s", strerror(errno));
}
/**
@@ -573,9 +602,6 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
}
-static void tcp_hash_remove(const struct ctx *c,
- const struct tcp_tap_conn *conn);
-
/**
* conn_event_do() - Set and log connection events, update epoll state
* @c: Execution context
@@ -621,7 +647,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
num == -1 ? "CLOSED" : tcp_event_str[num]);
if (event == CLOSED)
- tcp_hash_remove(c, conn);
+ flow_hash_remove(c, TAP_SIDX(conn));
else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
conn_flag(c, conn, ACTIVE_CLOSE);
else
@@ -639,10 +665,11 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
*/
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
{
+ const struct flowside *tapside = TAPFLOW(conn);
int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
- if (inany_equals(&conn->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
return 1;
return 0;
@@ -654,17 +681,17 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
* @tinfo: Pointer to struct tcp_info for socket
*/
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
- const struct tcp_info *tinfo)
+ const struct tcp_info_linux *tinfo)
{
-#ifdef HAS_MIN_RTT
+ const struct flowside *tapside = TAPFLOW(conn);
int i, hole = -1;
- if (!tinfo->tcpi_min_rtt ||
+ if (!min_rtt_cap ||
(int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
return;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
- if (inany_equals(&conn->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->oaddr, low_rtt_dst + i))
return;
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
hole = i;
@@ -676,14 +703,10 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == -1)
return;
- low_rtt_dst[hole++] = conn->faddr;
+ low_rtt_dst[hole++] = tapside->oaddr;
if (hole == LOW_RTT_TABLE_SIZE)
hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
-#else
- (void)conn;
- (void)tinfo;
-#endif /* HAS_MIN_RTT */
}
/**
@@ -730,34 +753,106 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s)
}
/**
- * tcp_update_check_tcp4() - Update TCP checksum from stored one
+ * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4
* @iph: IPv4 header
- * @th: TCP header followed by TCP payload
+ * @iov: Pointer to the array of IO vectors
+ * @iov_cnt: Length of the array
+ * @l4offset: IPv4 payload offset in the iovec array
*/
-static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th)
+void tcp_update_check_tcp4(const struct iphdr *iph,
+ const struct iovec *iov, int iov_cnt,
+ size_t l4offset)
{
- uint16_t tlen = ntohs(iph->tot_len) - sizeof(struct iphdr);
+ uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr);
struct in_addr saddr = { .s_addr = iph->saddr };
struct in_addr daddr = { .s_addr = iph->daddr };
- uint32_t sum = proto_ipv4_header_psum(tlen, IPPROTO_TCP, saddr, daddr);
+ size_t check_ofs;
+ uint16_t *check;
+ int check_idx;
+ uint32_t sum;
+ char *ptr;
+
+ sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr);
+
+ check_idx = iov_skip_bytes(iov, iov_cnt,
+ l4offset + offsetof(struct tcphdr, check),
+ &check_ofs);
+
+ if (check_idx >= iov_cnt) {
+ err("TCP4 buffer is too small, iov size %zd, check offset %zd",
+ iov_size(iov, iov_cnt),
+ l4offset + offsetof(struct tcphdr, check));
+ return;
+ }
+
+ if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
+ err("TCP4 checksum field memory is not contiguous "
+ "check_ofs %zd check_idx %d iov_len %zd",
+ check_ofs, check_idx, iov[check_idx].iov_len);
+ return;
+ }
+
+ ptr = (char *)iov[check_idx].iov_base + check_ofs;
+ if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
+ err("TCP4 checksum field is not correctly aligned in memory");
+ return;
+ }
+
+ check = (uint16_t *)ptr;
- th->check = 0;
- th->check = csum(th, tlen, sum);
+ *check = 0;
+ *check = csum_iov(iov, iov_cnt, l4offset, sum);
}
/**
* tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
* @ip6h: IPv6 header
- * @th: TCP header followed by TCP payload
+ * @iov: Pointer to the array of IO vectors
+ * @iov_cnt: Length of the array
+ * @l4offset: IPv6 payload offset in the iovec array
*/
-static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
+void tcp_update_check_tcp6(const struct ipv6hdr *ip6h,
+ const struct iovec *iov, int iov_cnt,
+ size_t l4offset)
{
- uint16_t payload_len = ntohs(ip6h->payload_len);
- uint32_t sum = proto_ipv6_header_psum(payload_len, IPPROTO_TCP,
- &ip6h->saddr, &ip6h->daddr);
+ uint16_t l4len = ntohs(ip6h->payload_len);
+ size_t check_ofs;
+ uint16_t *check;
+ int check_idx;
+ uint32_t sum;
+ char *ptr;
+
+ sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, &ip6h->saddr,
+ &ip6h->daddr);
+
+ check_idx = iov_skip_bytes(iov, iov_cnt,
+ l4offset + offsetof(struct tcphdr, check),
+ &check_ofs);
+
+ if (check_idx >= iov_cnt) {
+ err("TCP6 buffer is too small, iov size %zd, check offset %zd",
+ iov_size(iov, iov_cnt),
+ l4offset + offsetof(struct tcphdr, check));
+ return;
+ }
+
+ if (check_ofs + sizeof(*check) > iov[check_idx].iov_len) {
+ err("TCP6 checksum field memory is not contiguous "
+ "check_ofs %zd check_idx %d iov_len %zd",
+ check_ofs, check_idx, iov[check_idx].iov_len);
+ return;
+ }
+
+ ptr = (char *)iov[check_idx].iov_base + check_ofs;
+ if ((uintptr_t)ptr & (__alignof__(*check) - 1)) {
+ err("TCP6 checksum field is not correctly aligned in memory");
+ return;
+ }
+
+ check = (uint16_t *)ptr;
- th->check = 0;
- th->check = csum(th, payload_len, sum);
+ *check = 0;
+ *check = csum_iov(iov, iov_cnt, l4offset, sum);
}
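
Both checksum helpers above delegate the actual sum to csum_iov() after locating the check field with iov_skip_bytes(). As a rough illustration of what summing a TCP segment scattered across an iovec array involves (a simplified stand-in, not the implementation in checksum.c or iov.c):

#include <stddef.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <sys/uio.h>

/* One's-complement sum over an iovec array, skipping 'skip' bytes first;
 * 'init' is the pseudo-header sum. Returns the checksum in network byte
 * order, ready to store into the TCP header's check field.
 */
static uint16_t csum_iov_sketch(const struct iovec *iov, int iov_cnt,
				size_t skip, uint32_t init)
{
	uint32_t sum = init;
	int odd = 0;
	int i;

	for (i = 0; i < iov_cnt; i++) {
		const uint8_t *p = iov[i].iov_base;
		size_t len = iov[i].iov_len;
		size_t j;

		if (skip >= len) {
			skip -= len;
			continue;
		}
		p += skip;
		len -= skip;
		skip = 0;

		for (j = 0; j < len; j++) {
			/* Big-endian 16-bit words, carried across buffers */
			sum += odd ? p[j] : (uint32_t)p[j] << 8;
			odd = !odd;
		}
	}

	while (sum >> 16)			/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);

	return htons((uint16_t)~sum);
}
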
/**
@@ -819,163 +914,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
}
/**
- * tcp_hash_match() - Check if a connection entry matches address and ports
- * @conn: Connection entry to match against
- * @faddr: Guest side forwarding address
- * @eport: Guest side endpoint port
- * @fport: Guest side forwarding port
- *
- * Return: 1 on match, 0 otherwise
- */
-static int tcp_hash_match(const struct tcp_tap_conn *conn,
- const union inany_addr *faddr,
- in_port_t eport, in_port_t fport)
-{
- if (inany_equals(&conn->faddr, faddr) &&
- conn->eport == eport && conn->fport == fport)
- return 1;
-
- return 0;
-}
-
-/**
- * tcp_hash() - Calculate hash value for connection given address and ports
- * @c: Execution context
- * @faddr: Guest side forwarding address
- * @eport: Guest side endpoint port
- * @fport: Guest side forwarding port
- *
- * Return: hash value, needs to be adjusted for table size
- */
-static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr,
- in_port_t eport, in_port_t fport)
-{
- struct siphash_state state = SIPHASH_INIT(c->hash_secret);
-
- inany_siphash_feed(&state, faddr);
- return siphash_final(&state, 20, (uint64_t)eport << 16 | fport);
-}
-
-/**
- * tcp_conn_hash() - Calculate hash bucket of an existing connection
- * @c: Execution context
- * @conn: Connection
- *
- * Return: hash value, needs to be adjusted for table size
- */
-static uint64_t tcp_conn_hash(const struct ctx *c,
- const struct tcp_tap_conn *conn)
-{
- return tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
-}
-
-/**
- * tcp_hash_probe() - Find hash bucket for a connection
- * @c: Execution context
- * @conn: Connection to find bucket for
- *
- * Return: If @conn is in the table, its current bucket, otherwise a suitable
- * free bucket for it.
- */
-static inline unsigned tcp_hash_probe(const struct ctx *c,
- const struct tcp_tap_conn *conn)
-{
- flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE);
- unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE;
-
- /* Linear probing */
- while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) &&
- !flow_sidx_eq(tc_hash[b], sidx))
- b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE);
-
- return b;
-}
-
-/**
- * tcp_hash_insert() - Insert connection into hash table, chain link
- * @c: Execution context
- * @conn: Connection pointer
- */
-static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn)
-{
- unsigned b = tcp_hash_probe(c, conn);
-
- tc_hash[b] = FLOW_SIDX(conn, TAPSIDE);
- flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b);
-}
-
-/**
- * tcp_hash_remove() - Drop connection from hash table, chain unlink
- * @c: Execution context
- * @conn: Connection pointer
- */
-static void tcp_hash_remove(const struct ctx *c,
- const struct tcp_tap_conn *conn)
-{
- unsigned b = tcp_hash_probe(c, conn), s;
- union flow *flow = flow_at_sidx(tc_hash[b]);
-
- if (!flow)
- return; /* Redundant remove */
-
- flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b);
-
- /* Scan the remainder of the cluster */
- for (s = mod_sub(b, 1, TCP_HASH_TABLE_SIZE);
- (flow = flow_at_sidx(tc_hash[s]));
- s = mod_sub(s, 1, TCP_HASH_TABLE_SIZE)) {
- unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE;
-
- if (!mod_between(h, s, b, TCP_HASH_TABLE_SIZE)) {
- /* tc_hash[s] can live in tc_hash[b]'s slot */
- debug("hash table remove: shuffle %u -> %u", s, b);
- tc_hash[b] = tc_hash[s];
- b = s;
- }
- }
-
- tc_hash[b] = FLOW_SIDX_NONE;
-}
-
-/**
- * tcp_hash_lookup() - Look up connection given remote address and ports
- * @c: Execution context
- * @af: Address family, AF_INET or AF_INET6
- * @faddr: Guest side forwarding address (guest remote address)
- * @eport: Guest side endpoint port (guest local port)
- * @fport: Guest side forwarding port (guest remote port)
- *
- * Return: connection pointer, if found, -ENOENT otherwise
- */
-static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
- sa_family_t af, const void *faddr,
- in_port_t eport, in_port_t fport)
-{
- union inany_addr aany;
- union flow *flow;
- unsigned b;
-
- inany_from_af(&aany, af, faddr);
-
- b = tcp_hash(c, &aany, eport, fport) % TCP_HASH_TABLE_SIZE;
- while ((flow = flow_at_sidx(tc_hash[b])) &&
- !tcp_hash_match(&flow->tcp, &aany, eport, fport))
- b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE);
-
- return &flow->tcp;
-}
-
-/**
* tcp_flow_defer() - Deferred per-flow handling (clean up closed connections)
- * @flow: Flow table entry for this connection
+ * @conn: Connection to handle
*
- * Return: true if the flow is ready to free, false otherwise
+ * Return: true if the connection is ready to free, false otherwise
*/
-bool tcp_flow_defer(union flow *flow)
+bool tcp_flow_defer(const struct tcp_tap_conn *conn)
{
- const struct tcp_tap_conn *conn = &flow->tcp;
-
- if (flow->tcp.events != CLOSED)
+ if (conn->events != CLOSED)
return false;
close(conn->sock);
@@ -992,8 +938,7 @@ bool tcp_flow_defer(union flow *flow)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c)
{
- tcp_buf_l2_flags_flush(c);
- tcp_buf_l2_data_flush(c);
+ tcp_payload_flush(c);
}
/**
@@ -1004,10 +949,12 @@ void tcp_defer_handler(struct ctx *c)
* @seq: Sequence number
*/
static void tcp_fill_header(struct tcphdr *th,
- const struct tcp_tap_conn *conn, uint32_t seq)
+ const struct tcp_tap_conn *conn, uint32_t seq)
{
- th->source = htons(conn->fport);
- th->dest = htons(conn->eport);
+ const struct flowside *tapside = TAPFLOW(conn);
+
+ th->source = htons(tapside->oport);
+ th->dest = htons(tapside->eport);
th->seq = htonl(seq);
th->ack_seq = htonl(conn->seq_ack_to_tap);
if (conn->events & ESTABLISHED) {
@@ -1021,68 +968,73 @@ static void tcp_fill_header(struct tcphdr *th,
/**
* tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers
- * @c: Execution context
- * @conn: Connection pointer
- * @iph: Pointer to IPv4 header
- * @th: Pointer to TCP header
- * @plen: Payload length (including TCP header options)
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: The total length of the IPv4 packet, host order
- */
-size_t tcp_fill_headers4(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct iphdr *iph, struct tcphdr *th,
- size_t plen, const uint16_t *check,
- uint32_t seq)
+ * @conn: Connection pointer
+ * @taph: tap backend specific header
+ * @iph: Pointer to IPv4 header
+ * @bp: Pointer to TCP header followed by TCP payload
+ * @dlen: TCP payload length
+ * @check: Checksum, if already known
+ * @seq: Sequence number for this segment
+ * @no_tcp_csum: Do not set TCP checksum
+ */
+void tcp_fill_headers4(const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct iphdr *iph,
+ struct tcp_payload_t *bp, size_t dlen,
+ const uint16_t *check, uint32_t seq, bool no_tcp_csum)
{
- size_t ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr);
- const struct in_addr *a4 = inany_v4(&conn->faddr);
+ const struct flowside *tapside = TAPFLOW(conn);
+ const struct in_addr *src4 = inany_v4(&tapside->oaddr);
+ const struct in_addr *dst4 = inany_v4(&tapside->eaddr);
+ size_t l4len = dlen + sizeof(bp->th);
+ size_t l3len = l4len + sizeof(*iph);
- ASSERT(a4);
+ ASSERT(src4 && dst4);
- iph->tot_len = htons(ip_len);
- iph->saddr = a4->s_addr;
- iph->daddr = c->ip4.addr_seen.s_addr;
+ iph->tot_len = htons(l3len);
+ iph->saddr = src4->s_addr;
+ iph->daddr = dst4->s_addr;
iph->check = check ? *check :
- csum_ip4_header(iph->tot_len, IPPROTO_TCP,
- *a4, c->ip4.addr_seen);
+ csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4);
+
+ tcp_fill_header(&bp->th, conn, seq);
- tcp_fill_header(th, conn, seq);
+ if (no_tcp_csum) {
+ bp->th.check = 0;
+ } else {
+ const struct iovec iov = {
+ .iov_base = bp,
+ .iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr),
+ };
- if (c->mode != MODE_VU)
- tcp_update_check_tcp4(iph, th);
+ tcp_update_check_tcp4(iph, &iov, 1, 0);
+ }
- return ip_len;
+ tap_hdr_update(taph, l3len + sizeof(struct ethhdr));
}
/**
* tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers
- * @c: Execution context
- * @conn: Connection pointer
- * @ip6h: Pointer to IPv6 header
- * @th: Pointer to TCP header
- * @plen: Payload length (including TCP header options)
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: The total length of the IPv6 packet, host order
- */
-size_t tcp_fill_headers6(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct ipv6hdr *ip6h, struct tcphdr *th,
- size_t plen, uint32_t seq)
+ * @conn: Connection pointer
+ * @taph: tap backend specific header
+ * @ip6h: Pointer to IPv6 header
+ * @bp: Pointer to TCP header followed by TCP payload
+ * @dlen: TCP payload length
+ * @check: Checksum, if already known
+ * @seq: Sequence number for this segment
+ * @no_tcp_csum: Do not set TCP checksum
+ */
+void tcp_fill_headers6(const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ipv6hdr *ip6h,
+ struct tcp_payload_t *bp, size_t dlen,
+ uint32_t seq, bool no_tcp_csum)
{
- size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
+ const struct flowside *tapside = TAPFLOW(conn);
+ size_t l4len = dlen + sizeof(bp->th);
- ip6h->payload_len = htons(plen + sizeof(struct tcphdr));
- ip6h->saddr = conn->faddr.a6;
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
- ip6h->daddr = c->ip6.addr_ll_seen;
- else
- ip6h->daddr = c->ip6.addr_seen;
+ ip6h->payload_len = htons(l4len);
+ ip6h->saddr = tapside->oaddr.a6;
+ ip6h->daddr = tapside->eaddr.a6;
ip6h->hop_limit = 255;
ip6h->version = 6;
@@ -1092,12 +1044,20 @@ size_t tcp_fill_headers6(const struct ctx *c,
ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff;
ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff;
- tcp_fill_header(th, conn, seq);
+ tcp_fill_header(&bp->th, conn, seq);
+
+ if (no_tcp_csum) {
+ bp->th.check = 0;
+ } else {
+ const struct iovec iov = {
+ .iov_base = bp,
+ .iov_len = ntohs(ip6h->payload_len)
+ };
- if (c->mode != MODE_VU)
- tcp_update_check_tcp6(ip6h, th);
+ tcp_update_check_tcp6(ip6h, &iov, 1, 0);
+ }
- return ip_len;
+ tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));
}
/**
@@ -1110,42 +1070,41 @@ size_t tcp_fill_headers6(const struct ctx *c,
* Return: 1 if sequence or window were updated, 0 otherwise
*/
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
- int force_seq, struct tcp_info *tinfo)
+ bool force_seq, struct tcp_info_linux *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
/* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */
socklen_t sl = sizeof(*tinfo);
- struct tcp_info tinfo_new;
+ struct tcp_info_linux tinfo_new;
uint32_t new_wnd_to_tap = prev_wnd_to_tap;
int s = conn->sock;
-#ifndef HAS_BYTES_ACKED
- (void)force_seq;
-
- conn->seq_ack_to_tap = conn->seq_from_tap;
- if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
- conn->seq_ack_to_tap = prev_ack_to_tap;
-#else
- if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn)
- || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) {
+ if (!bytes_acked_cap) {
conn->seq_ack_to_tap = conn->seq_from_tap;
- } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
- if (!tinfo) {
- tinfo = &tinfo_new;
- if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
- return 0;
- }
-
- conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
- conn->seq_init_from_tap;
-
if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
conn->seq_ack_to_tap = prev_ack_to_tap;
+ } else {
+ if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL ||
+ tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) ||
+ (conn->flags & LOCAL) || force_seq) {
+ conn->seq_ack_to_tap = conn->seq_from_tap;
+ } else if (conn->seq_ack_to_tap != conn->seq_from_tap) {
+ if (!tinfo) {
+ tinfo = &tinfo_new;
+ if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
+ return 0;
+ }
+
+ conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
+ conn->seq_init_from_tap;
+
+ if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap))
+ conn->seq_ack_to_tap = prev_ack_to_tap;
+ }
}
-#endif /* !HAS_BYTES_ACKED */
- if (!KERNEL_REPORTS_SND_WND(c)) {
+ if (!snd_wnd_cap) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@@ -1156,14 +1115,13 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if (!tinfo) {
if (prev_wnd_to_tap > WINDOW_DEFAULT) {
goto out;
-}
+ }
tinfo = &tinfo_new;
if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) {
goto out;
-}
+ }
}
-#ifdef HAS_SND_WND
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
@@ -1171,7 +1129,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd,
SNDBUF_GET(conn));
}
-#endif
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
if (!(conn->events & ESTABLISHED))
@@ -1217,25 +1174,23 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
/**
- * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload)
+ * tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
* @th: TCP header to update
- * @opts: buffer to store TCP option
- * @optlen: size of the TCP option buffer
+ * @data: buffer to store TCP option
+ * @optlen: size of the TCP option buffer (output parameter)
*
* Return: < 0 error code on connection reset,
- * 0 if there is no flag to send
- * 1 otherwise
+ * 0 if there is no flag to send
+ * 1 otherwise
*/
-int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
- int flags, struct tcphdr *th, char *opts,
- size_t *optlen)
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
+ int flags, struct tcphdr *th, struct tcp_syn_opts *opts,
+ size_t *optlen)
{
- uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
- uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
- struct tcp_info tinfo = { 0 };
+ struct tcp_info_linux tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
@@ -1248,30 +1203,24 @@ int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
return -ECONNRESET;
}
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
- if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
+ if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags)
return 0;
+ *optlen = 0;
if (flags & SYN) {
int mss;
- /* Options: MSS, NOP and window scale (8 bytes) */
- *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
-
- *opts++ = OPT_MSS;
- *opts++ = OPT_MSS_LEN;
-
if (c->mtu == -1) {
mss = tinfo.tcpi_snd_mss;
} else {
mss = c->mtu - sizeof(struct tcphdr);
+ if (CONN_V4(conn))
+ mss -= sizeof(struct iphdr);
+ else
+ mss -= sizeof(struct ipv6hdr);
if (c->low_wmem &&
!(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
@@ -1279,26 +1228,18 @@ int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
else if (mss > PAGE_SIZE)
mss = ROUND_DOWN(mss, PAGE_SIZE);
}
- *(uint16_t *)opts = htons(MIN(USHRT_MAX, mss));
-
- opts += OPT_MSS_LEN - 2;
conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale);
- *opts++ = OPT_NOP;
- *opts++ = OPT_WS;
- *opts++ = OPT_WS_LEN;
- *opts++ = conn->ws_to_tap;
-
- th->ack = !!(flags & ACK);
- } else {
- th->ack = !!(flags & (ACK | DUP_ACK)) ||
- conn->seq_ack_to_tap != prev_ack_to_tap ||
- !prev_wnd_to_tap;
+ *opts = TCP_SYN_OPTS(mss, conn->ws_to_tap);
+ *optlen = sizeof(*opts);
+ } else if (!(flags & RST)) {
+ flags |= ACK;
}
th->doff = (sizeof(*th) + *optlen) / 4;
+ th->ack = !!(flags & ACK);
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
@@ -1320,10 +1261,20 @@ int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
return 1;
}
-int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+/**
+ * tcp_send_flag() - Send segment with flags to tap (no payload)
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @flags: TCP flags: if not set, send segment only if ACK is due
+ *
+ * Return: negative error code on connection reset, 0 otherwise
+ */
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+ int flags)
{
if (c->mode == MODE_VU)
return tcp_vu_send_flag(c, conn, flags);
+
return tcp_buf_send_flag(c, conn, flags);
}
@@ -1332,7 +1283,7 @@ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @c: Execution context
* @conn: Connection pointer
*/
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1366,6 +1317,14 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
{
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
+
+ /* Work-around for bug introduced in peer kernel code, commit
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
+ * We don't update if window shrank to zero.
+ */
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
+ return;
+
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
/* FIXME: reflect the tap-side receiver's window back to the sock-side
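
The new zero-window check relies on SEQ_LT() to ask "is there still unacknowledged data in flight?" despite 32-bit sequence wrap-around. SEQ_LT() is defined in tcp_internal.h; a sketch of the usual serial-number comparison it stands for (not necessarily its exact definition):

#include <stdbool.h>
#include <stdint.h>

/* a is "before" b in modulo-2^32 sequence space */
static bool seq_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* e.g. seq_lt(0xfffffff0, 0x10) is true: the window check above therefore
 * still sees 0xfffffff0 as older than 0x10 across the wrap.
 */
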
@@ -1373,33 +1332,16 @@ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
}
/**
- * tcp_seq_init() - Calculate initial sequence number according to RFC 6528
- * @c: Execution context
- * @conn: TCP connection, with faddr, fport and eport populated
+ * tcp_init_seq() - Calculate initial sequence number according to RFC 6528
+ * @hash: Hash of connection details
* @now: Current timestamp
*/
-static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
- const struct timespec *now)
+static uint32_t tcp_init_seq(uint64_t hash, const struct timespec *now)
{
- struct siphash_state state = SIPHASH_INIT(c->hash_secret);
- union inany_addr aany;
- uint64_t hash;
- uint32_t ns;
-
- if (CONN_V4(conn))
- inany_from_af(&aany, AF_INET, &c->ip4.addr);
- else
- inany_from_af(&aany, AF_INET6, &c->ip6.addr);
-
- inany_siphash_feed(&state, &conn->faddr);
- inany_siphash_feed(&state, &aany);
- hash = siphash_final(&state, 36,
- (uint64_t)conn->fport << 16 | conn->eport);
-
/* 32ns ticks, overflows 32 bits every 137s */
- ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
+ uint32_t ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
- conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
+ return ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns;
}
/**
@@ -1431,7 +1373,7 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
{
int s;
- s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+ s = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
if (s > FD_REF_MAX) {
close(s);
@@ -1480,22 +1422,21 @@ int tcp_conn_sock(const struct ctx *c, sa_family_t af)
*
* Return: clamped MSS value
*/
-static uint16_t tcp_conn_tap_mss(const struct ctx *c,
- const struct tcp_tap_conn *conn,
+static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
const char *opts, size_t optlen)
{
unsigned int mss;
int ret;
- (void)c; /* unused */
- (void)conn; /* unused */
-
if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
mss = MSS_DEFAULT;
else
mss = ret;
- mss = MIN(MSS, mss);
+ if (CONN_V4(conn))
+ mss = MIN(MSS4, mss);
+ else
+ mss = MIN(MSS6, mss);
return MIN(mss, USHRT_MAX);
}
@@ -1503,53 +1444,47 @@ static uint16_t tcp_conn_tap_mss(const struct ctx *c,
/**
* tcp_bind_outbound() - Bind socket to outbound address and interface if given
* @c: Execution context
+ * @conn: Connection entry for socket to bind
* @s: Outbound TCP socket
- * @af: Address family
*/
-static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af)
+static void tcp_bind_outbound(const struct ctx *c,
+ const struct tcp_tap_conn *conn, int s)
{
- if (af == AF_INET) {
- if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.addr_out)) {
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = 0,
- .sin_addr = c->ip4.addr_out,
- };
-
- if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4))) {
- debug("Can't bind IPv4 TCP socket address: %s",
- strerror(errno));
- }
+ const struct flowside *tgt = &conn->f.side[TGTSIDE];
+ union sockaddr_inany bind_sa;
+ socklen_t sl;
+
+
+ pif_sockaddr(c, &bind_sa, &sl, PIF_HOST, &tgt->oaddr, tgt->oport);
+ if (!inany_is_unspecified(&tgt->oaddr) || tgt->oport) {
+ if (bind(s, &bind_sa.sa, sl)) {
+ char sstr[INANY_ADDRSTRLEN];
+
+ flow_dbg(conn,
+ "Can't bind TCP outbound socket to %s:%hu: %s",
+ inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)),
+ tgt->oport, strerror(errno));
}
+ }
+ if (bind_sa.sa_family == AF_INET) {
if (*c->ip4.ifname_out) {
if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
c->ip4.ifname_out,
strlen(c->ip4.ifname_out))) {
- debug("Can't bind IPv4 TCP socket to interface:"
- " %s", strerror(errno));
- }
- }
- } else if (af == AF_INET6) {
- if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr_out)) {
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = 0,
- .sin6_addr = c->ip6.addr_out,
- };
-
- if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6))) {
- debug("Can't bind IPv6 TCP socket address: %s",
- strerror(errno));
+ flow_dbg(conn, "Can't bind IPv4 TCP socket to"
+ " interface %s: %s", c->ip4.ifname_out,
+ strerror(errno));
}
}
-
+ } else if (bind_sa.sa_family == AF_INET6) {
if (*c->ip6.ifname_out) {
if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
c->ip6.ifname_out,
strlen(c->ip6.ifname_out))) {
- debug("Can't bind IPv6 TCP socket to interface:"
- " %s", strerror(errno));
+ flow_dbg(conn, "Can't bind IPv6 TCP socket to"
+ " interface %s: %s", c->ip6.ifname_out,
+ strerror(errno));
}
}
}
@@ -1566,92 +1501,81 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af)
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now)
{
in_port_t srcport = ntohs(th->source);
in_port_t dstport = ntohs(th->dest);
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = htons(dstport),
- .sin_addr = *(struct in_addr *)daddr,
- };
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(dstport),
- .sin6_addr = *(struct in6_addr *)daddr,
- };
- const struct sockaddr *sa;
+ const struct flowside *ini, *tgt;
struct tcp_tap_conn *conn;
+ union sockaddr_inany sa;
union flow *flow;
int s = -1, mss;
+ uint64_t hash;
socklen_t sl;
if (!(flow = flow_alloc()))
return;
- if (af == AF_INET) {
- if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
- IN4_IS_ADDR_BROADCAST(saddr) ||
- IN4_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
- IN4_IS_ADDR_UNSPECIFIED(daddr) ||
- IN4_IS_ADDR_BROADCAST(daddr) ||
- IN4_IS_ADDR_MULTICAST(daddr) || dstport == 0) {
- char sstr[INET_ADDRSTRLEN], dstr[INET_ADDRSTRLEN];
-
- debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
- inet_ntop(AF_INET, saddr, sstr, sizeof(sstr)),
- srcport,
- inet_ntop(AF_INET, daddr, dstr, sizeof(dstr)),
- dstport);
- goto cancel;
- }
- } else if (af == AF_INET6) {
- if (IN6_IS_ADDR_UNSPECIFIED(saddr) ||
- IN6_IS_ADDR_MULTICAST(saddr) || srcport == 0 ||
- IN6_IS_ADDR_UNSPECIFIED(daddr) ||
- IN6_IS_ADDR_MULTICAST(daddr) || dstport == 0) {
- char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
-
- debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
- inet_ntop(AF_INET6, saddr, sstr, sizeof(sstr)),
- srcport,
- inet_ntop(AF_INET6, daddr, dstr, sizeof(dstr)),
- dstport);
- goto cancel;
- }
- }
+ ini = flow_initiate_af(flow, PIF_TAP,
+ af, saddr, srcport, daddr, dstport);
- if ((s = tcp_conn_sock(c, af)) < 0)
+ if (!(tgt = flow_target(c, flow, IPPROTO_TCP)))
goto cancel;
- if (!c->no_map_gw) {
- if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw))
- addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw))
- addr6.sin6_addr = in6addr_loopback;
+ if (flow->f.pif[TGTSIDE] != PIF_HOST) {
+ flow_err(flow, "No support for forwarding TCP from %s to %s",
+ pif_name(flow->f.pif[INISIDE]),
+ pif_name(flow->f.pif[TGTSIDE]));
+ goto cancel;
}
- if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
- struct sockaddr_in6 addr6_ll = {
- .sin6_family = AF_INET6,
- .sin6_addr = c->ip6.addr_ll,
- .sin6_scope_id = c->ifi6,
- };
- if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+ if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 ||
+ !inany_is_unicast(&ini->oaddr) || ini->oport == 0) {
+ char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN];
+
+ debug("Invalid endpoint in TCP SYN: %s:%hu -> %s:%hu",
+ inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport,
+ inany_ntop(&ini->oaddr, dstr, sizeof(dstr)), ini->oport);
+ goto cancel;
+ }
+
+ if ((s = tcp_conn_sock(c, af)) < 0)
+ goto cancel;
+
+ pif_sockaddr(c, &sa, &sl, PIF_HOST, &tgt->eaddr, tgt->eport);
+
+ /* Use bind() to check if the target address is local (EADDRINUSE or
+ * similar) and already bound, and set the LOCAL flag in that case.
+ *
+ * If bind() succeeds, in general, we could infer that nobody (else) is
+ * listening on that address and port and reset the connection attempt
+ * early, but we can't rely on that if non-local binds are enabled,
+ * because bind() would succeed for any non-local address we can reach.
+ *
+ * So, if bind() succeeds, close the socket, get a new one, and proceed.
+ */
+ if (bind(s, &sa.sa, sl)) {
+ if (errno != EADDRNOTAVAIL && errno != EACCES)
+ conn_flag(c, conn, LOCAL);
+ } else {
+ /* Not a local, bound destination, inconclusive test */
+ close(s);
+ if ((s = tcp_conn_sock(c, af)) < 0)
goto cancel;
}
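
The bind()-as-probe logic above can be read in isolation: a failure other than EADDRNOTAVAIL or EACCES (typically EADDRINUSE) means the target address is local and already bound, while success proves nothing once non-local binds are enabled, so the now-bound socket has to be thrown away. A sketch of the same test, with a hypothetical helper name:

#include <errno.h>
#include <stdbool.h>
#include <sys/socket.h>

/* Returns true if @sa looks like a local, bound destination. On a
 * successful bind() the result is inconclusive and the caller must
 * discard @s, which is now bound to @sa.
 */
static bool target_looks_local(int s, const struct sockaddr *sa, socklen_t sl)
{
	if (bind(s, sa, sl))
		return errno != EADDRNOTAVAIL && errno != EACCES;

	return false;
}
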
- conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE);
conn->sock = s;
conn->timer = -1;
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
- mss = tcp_conn_tap_mss(c, conn, opts, optlen);
+ mss = tcp_conn_tap_mss(conn, opts, optlen);
if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
MSS_SET(conn, mss);
@@ -1664,44 +1588,20 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
conn->wnd_from_tap = 1;
- inany_from_af(&conn->faddr, af, daddr);
-
- if (af == AF_INET) {
- sa = (struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- } else {
- sa = (struct sockaddr *)&addr6;
- sl = sizeof(addr6);
- }
-
- conn->fport = dstport;
- conn->eport = srcport;
-
conn->seq_init_from_tap = ntohl(th->seq);
conn->seq_from_tap = conn->seq_init_from_tap + 1;
conn->seq_ack_to_tap = conn->seq_from_tap;
- tcp_seq_init(c, conn, now);
+ hash = flow_hash_insert(c, TAP_SIDX(conn));
+ conn->seq_to_tap = tcp_init_seq(hash, now);
conn->seq_ack_from_tap = conn->seq_to_tap;
- tcp_hash_insert(c, conn);
-
- if (!bind(s, sa, sl)) {
- tcp_rst(c, conn); /* Nobody is listening then */
- return;
- }
- if (errno != EADDRNOTAVAIL && errno != EACCES)
- conn_flag(c, conn, LOCAL);
-
- if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) ||
- (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) &&
- !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)))
- tcp_bind_outbound(c, s, af);
+ tcp_bind_outbound(c, conn, s);
- if (connect(s, sa, sl)) {
+ if (connect(s, &sa.sa, sl)) {
if (errno != EINPROGRESS) {
tcp_rst(c, conn);
- return;
+ goto cancel;
}
tcp_get_sndbuf(conn);
@@ -1709,12 +1609,13 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
tcp_get_sndbuf(conn);
if (tcp_send_flag(c, conn, SYN | ACK))
- return;
+ goto cancel;
conn_event(c, conn, TAP_SYN_ACK_SENT);
}
tcp_epoll_ctl(c, conn);
+ FLOW_ACTIVATE(conn);
return;
cancel:
@@ -1756,7 +1657,16 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
return 0;
}
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+/**
+ * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: negative on connection reset, 0 otherwise
+ *
+ * #syscalls recvmsg
+ */
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
if (c->mode == MODE_VU)
return tcp_vu_data_from_sock(c, conn);
@@ -1775,8 +1685,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
*
* Return: count of consumed packets
*/
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
- const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+ const struct pool *p, int idx)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@@ -1895,6 +1805,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
"fast re-transmit, ACK: %u, previous sequence: %u",
max_ack_seq, conn->seq_to_tap);
conn->seq_to_tap = max_ack_seq;
+ if (tcp_set_peek_offset(conn->sock, 0)) {
+ tcp_rst(c, conn);
+ return -1;
+ }
tcp_data_from_sock(c, conn);
}
@@ -1941,7 +1855,7 @@ out:
*/
if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
- tcp_send_flag(c, conn, DUP_ACK);
+ tcp_send_flag(c, conn, ACK | DUP_ACK);
}
return p->count - idx;
}
@@ -1969,7 +1883,8 @@ out:
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+ struct tcp_tap_conn *conn,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
@@ -1980,19 +1895,24 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
conn->wnd_from_tap = 1;
- MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
+ MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap;
conn->seq_ack_to_tap = conn->seq_from_tap;
conn_event(c, conn, ESTABLISHED);
+ if (tcp_set_peek_offset(conn->sock, 0)) {
+ tcp_rst(c, conn);
+ return;
+ }
+
+ tcp_send_flag(c, conn, ACK);
/* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now.
*/
tcp_data_from_sock(c, conn);
- tcp_send_flag(c, conn, ACK);
}
/**
@@ -2008,7 +1928,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now)
{
@@ -2016,6 +1936,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const struct tcphdr *th;
size_t optlen, len;
const char *opts;
+ union flow *flow;
+ flow_sidx_t sidx;
int ack_due = 0;
int count;
@@ -2031,16 +1953,22 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
opts = packet_get(p, idx, sizeof(*th), optlen, NULL);
- conn = tcp_hash_lookup(c, af, daddr, ntohs(th->source), ntohs(th->dest));
+ sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr,
+ ntohs(th->source), ntohs(th->dest));
+ flow = flow_at_sidx(sidx);
/* New connection from tap */
- if (!conn) {
+ if (!flow) {
if (opts && th->syn && !th->ack)
tcp_conn_from_tap(c, af, saddr, daddr, th,
opts, optlen, now);
return 1;
}
+ ASSERT(flow->f.type == FLOW_TCP);
+ ASSERT(pif_at_sidx(sidx) == PIF_TAP);
+ conn = &flow->tcp;
+
flow_trace(conn, "packet length %zu from tap", len);
if (th->rst) {
@@ -2067,6 +1995,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
goto reset;
conn_event(c, conn, ESTABLISHED);
+ if (tcp_set_peek_offset(conn->sock, 0))
+ goto reset;
if (th->fin) {
conn->seq_from_tap++;
@@ -2136,7 +2066,7 @@ reset:
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
{
socklen_t sl;
int so;
@@ -2155,61 +2085,26 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
}
/**
- * tcp_snat_inbound() - Translate source address for inbound data if needed
- * @c: Execution context
- * @addr: Source address of inbound packet/connection
- */
-static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr)
-{
- struct in_addr *addr4 = inany_v4(addr);
-
- if (addr4) {
- if (IN4_IS_ADDR_LOOPBACK(addr4) ||
- IN4_IS_ADDR_UNSPECIFIED(addr4) ||
- IN4_ARE_ADDR_EQUAL(addr4, &c->ip4.addr_seen))
- *addr4 = c->ip4.gw;
- } else {
- struct in6_addr *addr6 = &addr->a6;
-
- if (IN6_IS_ADDR_LOOPBACK(addr6) ||
- IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr_seen) ||
- IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr)) {
- if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw))
- *addr6 = c->ip6.gw;
- else
- *addr6 = c->ip6.addr_ll;
- }
- }
-}
-
-/**
* tcp_tap_conn_from_sock() - Initialize state for non-spliced connection
* @c: Execution context
- * @dstport: Destination port for connection (host side)
* @flow: flow to initialise
* @s: Accepted socket
* @sa: Peer socket address (from accept())
* @now: Current timestamp
*/
-static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
- union flow *flow, int s,
- const union sockaddr_inany *sa,
- const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+ int s, const struct timespec *now)
{
- struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE);
+ struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+ uint64_t hash;
conn->sock = s;
conn->timer = -1;
conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, SOCK_ACCEPTED);
- inany_from_sockaddr(&conn->faddr, &conn->fport, sa);
- conn->eport = dstport + c->tcp.fwd_in.delta[dstport];
-
- tcp_snat_inbound(c, &conn->faddr);
-
- tcp_seq_init(c, conn, now);
- tcp_hash_insert(c, conn);
+ hash = flow_hash_insert(c, TAP_SIDX(conn));
+ conn->seq_to_tap = tcp_init_seq(hash, now);
conn->seq_ack_from_tap = conn->seq_to_tap;
@@ -2219,6 +2114,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
conn_flag(c, conn, ACK_FROM_TAP_DUE);
tcp_get_sndbuf(conn);
+
+ FLOW_ACTIVATE(conn);
}
/**
@@ -2227,53 +2124,58 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
* @ref: epoll reference of listening socket
* @now: Current timestamp
*/
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
+ const struct flowside *ini;
union sockaddr_inany sa;
socklen_t sl = sizeof(sa);
union flow *flow;
int s;
- if (c->no_tcp || !(flow = flow_alloc()))
+ ASSERT(!c->no_tcp);
+
+ if (!(flow = flow_alloc()))
return;
s = accept4(ref.fd, &sa.sa, &sl, SOCK_NONBLOCK);
if (s < 0)
goto cancel;
- if (sa.sa_family == AF_INET) {
- const struct in_addr *addr = &sa.sa4.sin_addr;
- in_port_t port = sa.sa4.sin_port;
+ /* FIXME: When listening port has a specific bound address, record that
+ * as our address
+ */
+ ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa,
+ ref.tcp_listen.port);
- if (IN4_IS_ADDR_UNSPECIFIED(addr) ||
- IN4_IS_ADDR_BROADCAST(addr) ||
- IN4_IS_ADDR_MULTICAST(addr) || port == 0) {
- char str[INET_ADDRSTRLEN];
+ if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
+ char sastr[SOCKADDR_STRLEN];
- err("Invalid endpoint from TCP accept(): %s:%hu",
- inet_ntop(AF_INET, addr, str, sizeof(str)), port);
- goto cancel;
- }
- } else if (sa.sa_family == AF_INET6) {
- const struct in6_addr *addr = &sa.sa6.sin6_addr;
- in_port_t port = sa.sa6.sin6_port;
+ err("Invalid endpoint from TCP accept(): %s",
+ sockaddr_ntop(&sa, sastr, sizeof(sastr)));
+ goto cancel;
+ }
- if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
- IN6_IS_ADDR_MULTICAST(addr) || port == 0) {
- char str[INET6_ADDRSTRLEN];
+ if (!flow_target(c, flow, IPPROTO_TCP))
+ goto cancel;
- err("Invalid endpoint from TCP accept(): %s:%hu",
- inet_ntop(AF_INET6, addr, str, sizeof(str)), port);
- goto cancel;
- }
- }
+ switch (flow->f.pif[TGTSIDE]) {
+ case PIF_SPLICE:
+ case PIF_HOST:
+ tcp_splice_conn_from_sock(c, flow, s);
+ break;
- if (tcp_splice_conn_from_sock(c, ref.tcp_listen.pif,
- ref.tcp_listen.port, flow, s, &sa))
- return;
+ case PIF_TAP:
+ tcp_tap_conn_from_sock(c, flow, s, now);
+ break;
+
+ default:
+ flow_err(flow, "No support for forwarding TCP from %s to %s",
+ pif_name(flow->f.pif[INISIDE]),
+ pif_name(flow->f.pif[TGTSIDE]));
+ goto cancel;
+ }
- tcp_tap_conn_from_sock(c, ref.tcp_listen.port, flow, s, &sa, now);
return;
cancel:
@@ -2285,21 +2187,23 @@ cancel:
* @c: Execution context
* @ref: epoll reference of timer (not connection)
*
- * #syscalls timerfd_gettime
+ * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
struct itimerspec check_armed = { { 0 }, { 0 } };
- struct tcp_tap_conn *conn = CONN(ref.flow);
+ struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
- if (c->no_tcp)
- return;
+ ASSERT(!c->no_tcp);
+ ASSERT(conn->f.type == FLOW_TCP);
/* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the
* timer is currently armed, this event came from a previous setting,
* and we just set the timer to a new point in the future: discard it.
*/
- timerfd_gettime(conn->timer, &check_armed);
+ if (timerfd_gettime(conn->timer, &check_armed))
+ flow_err(conn, "failed to read timer: %s", strerror(errno));
+
if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec)
return;
@@ -2320,8 +2224,12 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
flow_dbg(conn, "ACK timeout, retry");
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
- tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
+ if (tcp_set_peek_offset(conn->sock, 0)) {
+ tcp_rst(c, conn);
+ } else {
+ tcp_data_from_sock(c, conn);
+ tcp_timer_ctl(c, conn);
+ }
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -2333,7 +2241,10 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* case. This avoids having to preemptively reset the timer on
* ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
*/
- timerfd_settime(conn->timer, 0, &new, &old);
+ if (timerfd_settime(conn->timer, 0, &new, &old))
+ flow_err(conn, "failed to set timer: %s",
+ strerror(errno));
+
if (old.it_value.tv_sec == ACT_TIMEOUT) {
flow_dbg(conn, "activity timeout");
tcp_rst(c, conn);
@@ -2347,12 +2258,13 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* @ref: epoll reference
* @events: epoll events bitmap
*/
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events)
{
- struct tcp_tap_conn *conn = CONN(ref.flowside.flow);
+ struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
- ASSERT(conn->f.type == FLOW_TCP);
- ASSERT(ref.flowside.side == SOCKSIDE);
+ ASSERT(!c->no_tcp);
+ ASSERT(pif_at_sidx(ref.flowside) != PIF_TAP);
if (conn->events == CLOSED)
return;
@@ -2378,7 +2290,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
tcp_data_from_sock(c, conn);
if (events & EPOLLOUT)
- tcp_update_seqack_wnd(c, conn, 0, NULL);
+ tcp_update_seqack_wnd(c, conn, false, NULL);
return;
}
@@ -2401,17 +2313,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
}
/**
- * tcp_sock_init_af() - Initialise listening socket for a given af and port
+ * tcp_sock_init_one() - Initialise listening socket for address and port
* @c: Execution context
- * @af: Address family to listen on
- * @port: Port, host order
- * @addr: Pointer to address for binding, NULL if not configured
+ * @addr: Pointer to address for binding, NULL for dual stack any
* @ifname: Name of interface to bind to, NULL if not configured
+ * @port: Port, host order
*
* Return: fd for the new listening socket, negative error code on failure
*/
-static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
- const void *addr, const char *ifname)
+static int tcp_sock_init_one(const struct ctx *c, const union inany_addr *addr,
+ const char *ifname, in_port_t port)
{
union tcp_listen_epoll_ref tref = {
.port = port,
@@ -2419,12 +2330,13 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
};
int s;
- s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32);
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_HOST, addr,
+ ifname, port, tref.u32);
if (c->tcp.fwd_in.mode == FWD_AUTO) {
- if (af == AF_INET || af == AF_UNSPEC)
+ if (!addr || inany_v4(addr))
tcp_sock_init_ext[port][V4] = s < 0 ? -1 : s;
- if (af == AF_INET6 || af == AF_UNSPEC)
+ if (!addr || !inany_v4(addr))
tcp_sock_init_ext[port][V6] = s < 0 ? -1 : s;
}
@@ -2438,29 +2350,32 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port,
/**
* tcp_sock_init() - Create listening sockets for a given host ("inbound") port
* @c: Execution context
- * @af: Address family to select a specific IP version, or AF_UNSPEC
* @addr: Pointer to address for binding, NULL if not configured
* @ifname: Name of interface to bind to, NULL if not configured
* @port: Port, host order
*
* Return: 0 on (partial) success, negative error code on (complete) failure
*/
-int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+int tcp_sock_init(const struct ctx *c, const union inany_addr *addr,
const char *ifname, in_port_t port)
{
int r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1;
- if (af == AF_UNSPEC && c->ifi4 && c->ifi6)
+ ASSERT(!c->no_tcp);
+
+ if (!addr && c->ifi4 && c->ifi6)
/* Attempt to get a dual stack socket */
- if (tcp_sock_init_af(c, AF_UNSPEC, port, addr, ifname) >= 0)
+ if (tcp_sock_init_one(c, NULL, ifname, port) >= 0)
return 0;
/* Otherwise create a socket per IP version */
- if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4)
- r4 = tcp_sock_init_af(c, AF_INET, port, addr, ifname);
+ if ((!addr || inany_v4(addr)) && c->ifi4)
+ r4 = tcp_sock_init_one(c, addr ? addr : &inany_any4,
+ ifname, port);
- if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
- r6 = tcp_sock_init_af(c, AF_INET6, port, addr, ifname);
+ if ((!addr || !inany_v4(addr)) && c->ifi6)
+ r6 = tcp_sock_init_one(c, addr ? addr : &inany_any6,
+ ifname, port);
if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6))
return 0;
@@ -2483,8 +2398,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA);
- s = sock_l4(c, AF_INET, IPPROTO_TCP, &in4addr_loopback, NULL, port,
- tref.u32);
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback4,
+ NULL, port, tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
else
@@ -2509,8 +2424,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
ASSERT(c->mode == MODE_PASTA);
- s = sock_l4(c, AF_INET6, IPPROTO_TCP, &in6addr_loopback, NULL, port,
- tref.u32);
+ s = pif_sock_l4(c, EPOLL_TYPE_TCP_LISTEN, PIF_SPLICE, &inany_loopback6,
+ NULL, port, tref.u32);
if (s >= 0)
tcp_sock_set_bufsize(c, s);
else
@@ -2527,6 +2442,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
*/
void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
{
+ ASSERT(!c->no_tcp);
+
if (c->ifi4)
tcp_ns_sock_init4(c, port);
if (c->ifi6)
@@ -2539,6 +2456,7 @@ void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
*
* Return: 0
*/
+/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
static int tcp_ns_socks_init(void *arg)
{
const struct ctx *c = (const struct ctx *)arg;
@@ -2604,6 +2522,57 @@ static void tcp_sock_refill_init(const struct ctx *c)
}
/**
+ * tcp_probe_peek_offset_cap() - Check if SO_PEEK_OFF is supported by kernel
+ * @af: Address family, IPv4 or IPv6
+ *
+ * Return: true if supported, false otherwise
+ */
+static bool tcp_probe_peek_offset_cap(sa_family_t af)
+{
+ bool ret = false;
+ int s, optv = 0;
+
+ s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
+ warn_perror("Temporary TCP socket creation failed");
+ } else {
+ if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+ ret = true;
+ close(s);
+ }
+
+ return ret;
+}
+
+/**
+ * tcp_probe_tcp_info() - Check what data TCP_INFO reports
+ *
+ * Return: Number of bytes returned by TCP_INFO getsockopt()
+ */
+static socklen_t tcp_probe_tcp_info(void)
+{
+ struct tcp_info_linux tinfo;
+ socklen_t sl = sizeof(tinfo);
+ int s;
+
+ s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
+ warn_perror("Temporary TCP socket creation failed");
+ return false;
+ }
+
+ if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ warn_perror("Failed to get TCP_INFO on temporary socket");
+ close(s);
+ return false;
+ }
+
+ close(s);
+
+ return sl;
+}
+
+/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
*
@@ -2611,16 +2580,9 @@ static void tcp_sock_refill_init(const struct ctx *c)
*/
int tcp_init(struct ctx *c)
{
- unsigned b;
-
- for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
- tc_hash[b] = FLOW_SIDX_NONE;
-
- if (c->ifi4)
- tcp_buf_sock4_iov_init(c);
+ ASSERT(!c->no_tcp);
- if (c->ifi6)
- tcp_buf_sock6_iov_init(c);
+ tcp_sock_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
@@ -2635,6 +2597,19 @@ int tcp_init(struct ctx *c)
NS_CALL(tcp_ns_socks_init, c);
}
+ peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) &&
+ (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
+ debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+
+ tcp_info_size = tcp_probe_tcp_info();
+
+#define dbg_tcpi(f_) debug("TCP_INFO tcpi_%s field%s supported", \
+ STRINGIFY(f_), tcp_info_cap(f_) ? " " : " not ")
+ dbg_tcpi(snd_wnd);
+ dbg_tcpi(bytes_acked);
+ dbg_tcpi(min_rtt);
+#undef dbg_tcpi
+
return 0;
}
@@ -2676,7 +2651,7 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
if (outbound)
tcp_ns_sock_init(c, port);
else
- tcp_sock_init(c, AF_UNSPEC, NULL, NULL, port);
+ tcp_sock_init(c, NULL, NULL, port);
}
}
}