aboutgitcodebugslistschat
path: root/tcp_buf.c
blob: c31e9f31b4386de6f8a248ee4dcd7f9f1850b3db (plain) (tree)




































                                                                            
                                                    






































































































































































































                                                                                  



                                                                            
   

                                                                                






                                                                              
                                     




                                                  


                                                                        






                                                                
                                     





                                                                
                                                                        






                                                                
                                                                        







































                                                                          
                                                                      



































                                                                                
                                                                     


















                                                                            
                                                                             






                                                           
                                                                            

























                                                                                  
                                                                  







                                                                          



                                                
















                                                                            









                                                        







































                                                                                       



                                        





































                                                                           
// SPDX-License-Identifier: GPL-2.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp_buf.c - TCP L2 buffer management functions
 *
 * Copyright Red Hat
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <errno.h>

#include <netinet/ip.h>

#include <linux/tcp.h>

#include "util.h"
#include "ip.h"
#include "iov.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "inany.h"
#include "tcp_conn.h"
#include "tcp_internal.h"
#include "tcp_buf.h"

#define TCP_FRAMES_MEM			128
#define TCP_FRAMES							   \
	(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)

/* Static buffers */
/**
 * struct tcp_payload_t - TCP header and data to send segments with payload
 * @th:		TCP header
 * @data:	TCP data
 */
struct tcp_payload_t {
	struct tcphdr th;
	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));    /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/**
 * struct tcp_flags_t - TCP header and data to send zero-length
 *                      segments (flags)
 * @th:		TCP header
 * @opts	TCP options
 */
struct tcp_flags_t {
	struct tcphdr th;
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/* Ethernet header for IPv4 frames */
static struct ethhdr		tcp4_eth_src;

static struct tap_hdr		tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers */
static struct iphdr		tcp4_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 frames */
static struct tcp_payload_t	tcp4_payload[TCP_FRAMES_MEM];

static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");

/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp4_payload_used;

static struct tap_hdr		tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv4 headers for TCP segment without payload */
static struct iphdr		tcp4_flags_ip[TCP_FRAMES_MEM];
/* TCP segments without payload for IPv4 frames */
static struct tcp_flags_t	tcp4_flags[TCP_FRAMES_MEM];

static unsigned int tcp4_flags_used;

/* Ethernet header for IPv6 frames */
static struct ethhdr		tcp6_eth_src;

static struct tap_hdr		tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers */
static struct ipv6hdr		tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_payload_t	tcp6_payload[TCP_FRAMES_MEM];

static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");

/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp6_payload_used;

static struct tap_hdr		tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
/* IPv6 headers for TCP segment without payload */
static struct ipv6hdr		tcp6_flags_ip[TCP_FRAMES_MEM];
/* TCP segment without payload for IPv6 frames */
static struct tcp_flags_t	tcp6_flags[TCP_FRAMES_MEM];

static unsigned int tcp6_flags_used;

/* recvmsg()/sendmsg() data for tap */
static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
/**
 * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
 * @eth_d:	Ethernet destination address, NULL if unchanged
 * @eth_s:	Ethernet source address, NULL if unchanged
 */
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
}

/**
 * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
 * @c:		Execution context
 */
void tcp_sock4_iov_init(const struct ctx *c)
{
	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
	struct iovec *iov;
	int i;

	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);

	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
		tcp4_payload_ip[i] = iph;
		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
		tcp4_payload[i].th.ack = 1;
	}

	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
		tcp4_flags_ip[i] = iph;
		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
		tcp4_flags[i].th.ack = 1;
	}

	for (i = 0; i < TCP_FRAMES_MEM; i++) {
		iov = tcp4_l2_iov[i];

		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
	}

	for (i = 0; i < TCP_FRAMES_MEM; i++) {
		iov = tcp4_l2_flags_iov[i];

		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
	}
}

/**
 * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
 * @c:		Execution context
 */
void tcp_sock6_iov_init(const struct ctx *c)
{
	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
	struct iovec *iov;
	int i;

	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);

	for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
		tcp6_payload_ip[i] = ip6;
		tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
		tcp6_payload[i].th.ack = 1;
	}

	for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
		tcp6_flags_ip[i] = ip6;
		tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
		tcp6_flags[i].th .ack = 1;
	}

	for (i = 0; i < TCP_FRAMES_MEM; i++) {
		iov = tcp6_l2_iov[i];

		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
	}

	for (i = 0; i < TCP_FRAMES_MEM; i++) {
		iov = tcp6_l2_flags_iov[i];

		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
	}
}

/**
 * tcp_flags_flush() - Send out buffers for segments with no data (flags)
 * @c:		Execution context
 */
void tcp_flags_flush(const struct ctx *c)
{
	tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
			tcp6_flags_used);
	tcp6_flags_used = 0;

	tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
			tcp4_flags_used);
	tcp4_flags_used = 0;
}

/**
 * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
 * @ctx:	Execution context
 * @conns:	Array of connection pointers corresponding to queued frames
 * @frames:	Two-dimensional array containing queued frames with sub-iovs
 * @num_frames:	Number of entries in the two arrays to be compared
 */
static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
			   struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
{
	int i;

	for (i = 0; i < num_frames; i++) {
		const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
		struct tcp_tap_conn *conn = conns[i];
		uint32_t seq = ntohl(th->seq);
		uint32_t peek_offset;

		if (SEQ_LE(conn->seq_to_tap, seq))
			continue;

		conn->seq_to_tap = seq;
		peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
		if (tcp_set_peek_offset(conn->sock, peek_offset))
			tcp_rst(c, conn);
	}
}

/**
 * tcp_payload_flush() - Send out buffers for segments with data
 * @c:		Execution context
 */
void tcp_payload_flush(struct ctx *c)
{
	size_t m;

	m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
			    tcp6_payload_used);
	if (m != tcp6_payload_used) {
		tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m],
			       tcp6_payload_used - m);
	}
	tcp6_payload_used = 0;

	m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
			    tcp4_payload_used);
	if (m != tcp4_payload_used) {
		tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m],
			       tcp4_payload_used - m);
	}
	tcp4_payload_used = 0;
}

/**
 * tcp_buf_send_flag() - Send segment with flags to tap (no payload)
 * @c:         Execution context
 * @conn:      Connection pointer
 * @flags:     TCP flags: if not set, send segment only if ACK is due
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	struct tcp_flags_t *payload;
	struct iovec *iov;
	size_t optlen;
	size_t l4len;
	uint32_t seq;
	int ret;

	if (CONN_V4(conn))
		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
	else
		iov = tcp6_l2_flags_iov[tcp6_flags_used++];

	payload = iov[TCP_IOV_PAYLOAD].iov_base;

	seq = conn->seq_to_tap;
	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
				payload->opts, &optlen);
	if (ret <= 0) {
		if (CONN_V4(conn))
			tcp4_flags_used--;
		else
			tcp6_flags_used--;
		return ret;
	}

	l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq);
	iov[TCP_IOV_PAYLOAD].iov_len = l4len;

	if (flags & DUP_ACK) {
		struct iovec *dup_iov;
		int i;

		if (CONN_V4(conn))
			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
		else
			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];

		for (i = 0; i < TCP_NUM_IOVS; i++)
			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
			       iov[i].iov_len);
		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
	}

	if (CONN_V4(conn)) {
		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
			tcp_flags_flush(c);
	} else {
		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
			tcp_flags_flush(c);
	}

	return 0;
}

/**
 * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
 * @c:		Execution context
 * @conn:	Connection pointer
 * @dlen:	TCP payload length
 * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
 * @seq:	Sequence number to be sent
 */
static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
			    ssize_t dlen, int no_csum, uint32_t seq)
{
	struct iovec *iov;
	size_t l4len;

	conn->seq_to_tap = seq + dlen;

	if (CONN_V4(conn)) {
		struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
		const uint16_t *check = NULL;

		if (no_csum) {
			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
			check = &iph->check;
		}

		tcp4_frame_conns[tcp4_payload_used] = conn;

		iov = tcp4_l2_iov[tcp4_payload_used++];
		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq);
		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
			tcp_payload_flush(c);
	} else if (CONN_V6(conn)) {
		tcp6_frame_conns[tcp6_payload_used] = conn;

		iov = tcp6_l2_iov[tcp6_payload_used++];
		l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq);
		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
			tcp_payload_flush(c);
	}
}

/**
 * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: negative on connection reset, 0 otherwise
 *
 * #syscalls recvmsg
 */
int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
	int sendlen, len, dlen, v4 = CONN_V4(conn);
	int s = conn->sock, i, ret = 0;
	struct msghdr mh_sock = { 0 };
	uint16_t mss = MSS_GET(conn);
	uint32_t already_sent, seq;
	struct iovec *iov;

	/* How much have we read/sent since last received ack ? */
	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
		if (tcp_set_peek_offset(s, 0)) {
			tcp_rst(c, conn);
			return -1;
		}
	}

	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */
	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
	if (fill_bufs > TCP_FRAMES) {
		fill_bufs = TCP_FRAMES;
		iov_rem = 0;
	} else {
		iov_rem = (wnd_scaled - already_sent) % mss;
	}

	/* Prepare iov according to kernel capability */
	if (!peek_offset_cap) {
		mh_sock.msg_iov = iov_sock;
		iov_sock[0].iov_base = tcp_buf_discard;
		iov_sock[0].iov_len = already_sent;
		mh_sock.msg_iovlen = fill_bufs + 1;
	} else {
		mh_sock.msg_iov = &iov_sock[1];
		mh_sock.msg_iovlen = fill_bufs;
	}

	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
		tcp_payload_flush(c);

		/* Silence Coverity CWE-125 false positive */
		tcp4_payload_used = tcp6_payload_used = 0;
	}

	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
		if (v4)
			iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
		else
			iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
		iov->iov_len = mss;
	}
	if (iov_rem)
		iov_sock[fill_bufs].iov_len = iov_rem;

	/* Receive into buffers, don't dequeue until acknowledged by guest. */
	do
		len = recvmsg(s, &mh_sock, MSG_PEEK);
	while (len < 0 && errno == EINTR);

	if (len < 0)
		goto err;

	if (!len) {
		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
				tcp_rst(c, conn);
				return ret;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}

		return 0;
	}

	sendlen = len;
	if (!peek_offset_cap)
		sendlen -= already_sent;

	if (sendlen <= 0) {
		conn_flag(c, conn, STALLED);
		return 0;
	}

	conn_flag(c, conn, ~STALLED);

	send_bufs = DIV_ROUND_UP(sendlen, mss);
	last_len = sendlen - (send_bufs - 1) * mss;

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, 0, NULL);

	/* Finally, queue to tap */
	dlen = mss;
	seq = conn->seq_to_tap;
	for (i = 0; i < send_bufs; i++) {
		int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;

		if (i == send_bufs - 1)
			dlen = last_len;

		tcp_data_to_tap(c, conn, dlen, no_csum, seq);
		seq += dlen;
	}

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;

err:
	if (errno != EAGAIN && errno != EWOULDBLOCK) {
		ret = -errno;
		tcp_rst(c, conn);
	}

	return ret;
}