aboutgitcodebugslistschat
path: root/tcp_splice.c
blob: 0f4f4850dd1c84a53b6ce742ac384ee87039e0d5 (plain) (tree)




















































                                                                                
                                                                            
                                                                    





























                                                                                








                                              

                      






                                              






                           
                                                            








                                                                         

                                                               



                                                                      
                                                                  













                                                                             
                            

                       





                                                  
                              
         
 

                                                   

 
                                                                            

                                                              

   
                                                                           



                                              
                                                                           

















                                                                         
                            
















                                                                                

                                                             
 
                                                                         











                                                                         

                                  






                                                                         
                            
 
                                                                         


                 
       










                                                                      
                                                                            

                                              
















                                                                
                                            




























                                                                          
                                                                            


                                                   

                                                                           
                                      
                                          
                                            


   
                                                                   




                                                                           
                                         












                                                                       
                                     



                               

                       

                                                                            



                                                            









                                                                           
                                                         







                                                                  
                                                     











                                                                           
                                                    






                                                                         

                                                 












                                                                              
                                                                                


















                                                                          







                                     





















                                                                               
                                             
                
                                                 













                                                                           
                            




























                                                                          
                                                                            


                                                                       
                          

                                                   
                                                                  
            
                                                                      
 


                                                       
                              






























































                                                                              
                                                               

                                                                   
                                                    





                                           



                                   

                           
                                      







                                                       
                                                         
                    
                                                         







                                                                     







                                                                                
                    
                                                        








                                             

                                                


                                             

                                                








                                                                              
                                                                         























                                                                           

                                                                             




































                                                                         
                                                                
                            
                                                                














                                                                      

                                                                          
                                                   
                                                        


                 

                                                                          
                                                   
                                                        


                 
                                                    













                                                                      


                              


               
                                    










































                                                                                
                                                       


































                                                                 
   
                                    
 
                                     
 
                                                                             
                                            
                                                    
                               

                 

                                                      

                                                                    
                                                            

                 

                                                      

                                                                    
                                                            

                 

                                                    

         
                                  
 






                                                                                
                                                                           


                                                                   
                                                                     


                                                                             
                                          


                                                    
// SPDX-License-Identifier: AGPL-3.0-or-later

/* PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp_splice.c - direct namespace forwarding for local connections
 *
 * Copyright (c) 2020-2022 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

/**
 * DOC: Theory of Operation
 *
 *
 * For local traffic directed to TCP ports configured for direct mapping between
 * namespaces, packets are directly translated between L4 sockets using a pair
 * of splice() syscalls. These connections are tracked in the @tc array of
 * struct tcp_splice_conn, using these events:
 *
 * - SPLICE_CONNECT:		connection accepted, connecting to target
 * - SPLICE_ESTABLISHED:	connection to target established
 * - SPLICE_A_OUT_WAIT:		pipe to accepted socket full, wait for EPOLLOUT
 * - SPLICE_B_OUT_WAIT:		pipe to target socket full, wait for EPOLLOUT
 * - SPLICE_A_FIN_RCVD:		FIN (EPOLLRDHUP) seen from accepted socket
 * - SPLICE_B_FIN_RCVD:		FIN (EPOLLRDHUP) seen from target socket
 * - SPLICE_A_FIN_RCVD:		FIN (write shutdown) sent to accepted socket
 * - SPLICE_B_FIN_RCVD:		FIN (write shutdown) sent to target socket
 *
 * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
 */

#include <sched.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>

#include "util.h"
#include "passt.h"

#define MAX_PIPE_SIZE			(2UL * 1024 * 1024)
#define TCP_SPLICE_MAX_CONNS		(128 * 1024)
#define TCP_SPLICE_PIPE_POOL_SIZE	16
#define TCP_SPLICE_CONN_PRESSURE	30	/* % of splice_conn_count */
#define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */

/* From tcp.c */
extern int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
extern int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
extern int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];

/* Pool of pre-opened pipes */
static int splice_pipe_pool		[TCP_SPLICE_PIPE_POOL_SIZE][2][2];

/**
 * struct tcp_splice_conn - Descriptor for a spliced TCP connection
 * @a:			File descriptor number of socket for accepted connection
 * @pipe_a_b:		Pipe ends for splice() from @a to @b
 * @b:			File descriptor number of peer connected socket
 * @pipe_b_a:		Pipe ends for splice() from @b to @a
 * @events:		Events observed/actions performed on connection
 * @flags:		Connection flags (attributes, not events)
 * @a_read:		Bytes read from @a (not fully written to @b in one shot)
 * @a_written:		Bytes written to @a (not fully written from one @b read)
 * @b_read:		Bytes read from @b (not fully written to @a in one shot)
 * @b_written:		Bytes written to @b (not fully written from one @a read)
*/
struct tcp_splice_conn {
	int a;
	int pipe_a_b[2];
	int b;
	int pipe_b_a[2];

	uint8_t events;
#define CLOSED				0
#define CONNECT				BIT(0)
#define ESTABLISHED			BIT(1)
#define A_OUT_WAIT			BIT(2)
#define B_OUT_WAIT			BIT(3)
#define A_FIN_RCVD			BIT(4)
#define B_FIN_RCVD			BIT(5)
#define A_FIN_SENT			BIT(6)
#define B_FIN_SENT			BIT(7)

	uint8_t flags;
#define SOCK_V6				BIT(0)
#define IN_EPOLL			BIT(1)
#define RCVLOWAT_SET_A			BIT(2)
#define RCVLOWAT_SET_B			BIT(3)
#define RCVLOWAT_ACT_A			BIT(4)
#define RCVLOWAT_ACT_B			BIT(5)
#define CLOSING				BIT(6)

	uint64_t a_read;
	uint64_t a_written;
	uint64_t b_read;
	uint64_t b_written;
};

#define CONN_V6(x)			(x->flags & SOCK_V6)
#define CONN_V4(x)			(!CONN_V6(x))
#define CONN_HAS(conn, set)		((conn->events & (set)) == (set))
#define CONN(index)			(tc + (index))

/* Spliced connections */
static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS];

/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
	"CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT",
	"A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT",
};

/* Display strings for connection flags */
static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
	"SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B",
	"RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING",
};

/**
 * tcp_splice_conn_epoll_events() - epoll events masks for given state
 * @events:	Connection event flags
 * @a:		Event mask for socket with accepted connection, set on return
 * @b:		Event mask for connection target socket, set on return
 */
static void tcp_splice_conn_epoll_events(uint16_t events,
					 uint32_t *a, uint32_t *b)
{
	*a = *b = 0;

	if (events & CLOSED)
		return;

	if (events & ESTABLISHED) {
		if (!(events & B_FIN_SENT))
			*a = EPOLLIN | EPOLLRDHUP;
		if (!(events & A_FIN_SENT))
			*b = EPOLLIN | EPOLLRDHUP;
	} else if (events & CONNECT) {
		*b = EPOLLOUT;
	}

	*a |= (events & A_OUT_WAIT) ? EPOLLOUT : 0;
	*b |= (events & B_OUT_WAIT) ? EPOLLOUT : 0;
}

static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
static int tcp_splice_epoll_ctl(const struct ctx *c,
				struct tcp_splice_conn *conn);

/**
 * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 */
static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
			 unsigned long flag)
{
	if (flag & (flag - 1)) {
		if (!(conn->flags & ~flag))
			return;

		conn->flags &= flag;
		debug("TCP (spliced): index %i: %s dropped", (conn) - tc,
		      tcp_splice_flag_str[fls(~flag)]);
	} else {
		if (conn->flags & flag)
			return;

		conn->flags |= flag;
		debug("TCP (spliced): index %i: %s", (conn) - tc,
		      tcp_splice_flag_str[fls(flag)]);
	}

	if (flag == CLOSING)
		tcp_splice_epoll_ctl(c, conn);
}

#define conn_flag(c, conn, flag)					\
	do {								\
		trace("TCP (spliced): flag at %s:%i",			\
		      __func__, __LINE__);				\
		conn_flag_do(c, conn, flag);				\
	} while (0)

/**
 * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: 0 on success, negative error code on failure (not on deletion)
 */
static int tcp_splice_epoll_ctl(const struct ctx *c,
				struct tcp_splice_conn *conn)
{
	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
	union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
				  .r.p.tcp.tcp.splice = 1,
				  .r.p.tcp.tcp.index = conn - tc,
				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
	union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
				  .r.p.tcp.tcp.splice = 1,
				  .r.p.tcp.tcp.index = conn - tc,
				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
	uint32_t events_a, events_b;

	if (conn->flags & CLOSING)
		goto delete;

	tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b);
	ev_a.events = events_a;
	ev_b.events = events_b;

	if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) ||
	    epoll_ctl(c->epollfd, m, conn->b, &ev_b))
		goto delete;

	conn->flags |= IN_EPOLL;		/* No need to log this */

	return 0;

delete:
	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a);
	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b);
	return -errno;
}

/**
 * conn_event_do() - Set and log connection events, update epoll state
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event
 */
static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
			  unsigned long event)
{
	if (event & (event - 1)) {
		if (!(conn->events & ~event))
			return;

		conn->events &= event;
		debug("TCP (spliced): index %i, ~%s", conn - tc,
		      tcp_splice_event_str[fls(~event)]);
	} else {
		if (conn->events & event)
			return;

		conn->events |= event;
		debug("TCP (spliced): index %i, %s", conn - tc,
		      tcp_splice_event_str[fls(event)]);
	}

	if (tcp_splice_epoll_ctl(c, conn))
		conn_flag(c, conn, CLOSING);
}

#define conn_event(c, conn, event)					\
	do {								\
		trace("TCP (spliced): event at %s:%i",			\
		      __func__, __LINE__);				\
		conn_event_do(c, conn, event);				\
	} while (0)

/**
 * tcp_table_splice_compact - Compact spliced connection table
 * @c:		Execution context
 * @hole:	Pointer to recently closed connection
 */
static void tcp_table_splice_compact(struct ctx *c,
				     struct tcp_splice_conn *hole)
{
	struct tcp_splice_conn *move;

	if ((hole - tc) == --c->tcp.splice_conn_count) {
		debug("TCP (spliced): index %i (max) removed", hole - tc);
		return;
	}

	move = CONN(c->tcp.splice_conn_count);

	memcpy(hole, move, sizeof(*hole));

	move->a = move->b = -1;
	move->a_read = move->a_written = move->b_read = move->b_written = 0;
	move->pipe_a_b[0] = move->pipe_a_b[1] = -1;
	move->pipe_b_a[0] = move->pipe_b_a[1] = -1;
	move->flags = move->events = 0;

	debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc);
	tcp_splice_epoll_ctl(c, hole);
	if (tcp_splice_epoll_ctl(c, hole))
		conn_flag(c, hole, CLOSING);
}

/**
 * tcp_splice_destroy() - Close spliced connection and pipes, clear
 * @c:		Execution context
 * @conn:	Connection pointer
 */
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
{
	if (conn->events & ESTABLISHED) {
		/* Flushing might need to block: don't recycle them. */
		if (conn->pipe_a_b[0] != -1) {
			close(conn->pipe_a_b[0]);
			close(conn->pipe_a_b[1]);
			conn->pipe_a_b[0] = conn->pipe_a_b[1] = -1;
		}
		if (conn->pipe_b_a[0] != -1) {
			close(conn->pipe_b_a[0]);
			close(conn->pipe_b_a[1]);
			conn->pipe_b_a[0] = conn->pipe_b_a[1] = -1;
		}
	}

	if (conn->events & CONNECT) {
		close(conn->b);
		conn->b = -1;
	}

	close(conn->a);
	conn->a = -1;
	conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0;

	conn->events = CLOSED;
	conn->flags = 0;
	debug("TCP (spliced): index %i, CLOSED", conn - tc);

	tcp_table_splice_compact(c, conn);
}

/**
 * tcp_splice_connect_finish() - Completion of connect() or call on success
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: 0 on success, -EIO on failure
 */
static int tcp_splice_connect_finish(const struct ctx *c,
				     struct tcp_splice_conn *conn)
{
	int i;

	conn->pipe_a_b[0] = conn->pipe_b_a[0] = -1;
	conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1;

	for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
		if (splice_pipe_pool[i][0][0] >= 0) {
			SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]);
			SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]);

			SWAP(conn->pipe_b_a[0], splice_pipe_pool[i][1][0]);
			SWAP(conn->pipe_b_a[1], splice_pipe_pool[i][1][1]);
			break;
		}
	}

	if (conn->pipe_a_b[0] < 0) {
		if (pipe2(conn->pipe_a_b, O_NONBLOCK) ||
		    pipe2(conn->pipe_b_a, O_NONBLOCK)) {
			conn_flag(c, conn, CLOSING);
			return -EIO;
		}

		fcntl(conn->pipe_a_b[0], F_SETPIPE_SZ, c->tcp.pipe_size);
		fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size);
	}

	if (!(conn->events & ESTABLISHED))
		conn_event(c, conn, ESTABLISHED);

	return 0;
}

/**
 * tcp_splice_connect() - Create and connect socket for new spliced connection
 * @c:		Execution context
 * @conn:	Connection pointer
 * @s:		Accepted socket
 * @port:	Destination port, host order
 *
 * Return: 0 for connect() succeeded or in progress, negative value on error
 */
static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
			      int s, in_port_t port)
{
	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
							      AF_INET,
					      SOCK_STREAM | SOCK_NONBLOCK,
					      IPPROTO_TCP);
	struct sockaddr_in6 addr6 = {
		.sin6_family = AF_INET6,
		.sin6_port = htons(port),
		.sin6_addr = IN6ADDR_LOOPBACK_INIT,
	};
	struct sockaddr_in addr4 = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
	};
	const struct sockaddr *sa;
	socklen_t sl;

	if (sock_conn < 0)
		return -errno;

	if (sock_conn > SOCKET_MAX) {
		close(sock_conn);
		return -EIO;
	}

	conn->b = sock_conn;

	if (s < 0)
		tcp_sock_set_bufsize(c, conn->b);

	setsockopt(conn->b, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int));

	if (CONN_V6(conn)) {
		sa = (struct sockaddr *)&addr6;
		sl = sizeof(addr6);
	} else {
		sa = (struct sockaddr *)&addr4;
		sl = sizeof(addr4);
	}

	if (connect(conn->b, sa, sl)) {
		if (errno != EINPROGRESS) {
			int ret = -errno;

			close(sock_conn);
			return ret;
		}
		conn_event(c, conn, CONNECT);
	} else {
		conn_event(c, conn, ESTABLISHED);
		return tcp_splice_connect_finish(c, conn);
	}

	return 0;
}

/**
 * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
 * @c:		Execution context
 * @conn:	Accepted inbound connection
 * @port:	Destination port, host order
 * @ret:	Return value of tcp_splice_connect_ns()
 */
struct tcp_splice_connect_ns_arg {
	const struct ctx *c;
	struct tcp_splice_conn *conn;
	in_port_t port;
	int ret;
};

/**
 * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
 * @arg:	See struct tcp_splice_connect_ns_arg
 *
 * Return: 0
 */
static int tcp_splice_connect_ns(void *arg)
{
	struct tcp_splice_connect_ns_arg *a;

	a = (struct tcp_splice_connect_ns_arg *)arg;
	ns_enter(a->c);
	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
	return 0;
}

/**
 * tcp_splice_new() - Handle new inbound, spliced connection
 * @c:		Execution context
 * @conn:	Connection pointer
 * @port:	Destination port, host order
 *
 * Return: return code from connect()
 */
static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
			  in_port_t port)
{
	struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 };
	int *p, i, s = -1;

	if (bitmap_isset(c->tcp.port_to_tap, port))
		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
	else
		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;

	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) {
		SWAP(s, *p);
		if (s >= 0)
			break;
	}

	if (s < 0 && bitmap_isset(c->tcp.port_to_tap, port)) {
		NS_CALL(tcp_splice_connect_ns, &ns_arg);
		return ns_arg.ret;
	}

	return tcp_splice_connect(c, conn, s, port);
}

/**
 * tcp_splice_dir() - Set sockets/pipe pointers reflecting flow direction
 * @conn:	Connection pointers
 * @ref_sock:	Socket returned as reference from epoll
 * @reverse:	Reverse direction: @ref_sock is used as destination
 * @from:	Destination socket pointer to set
 * @to:		Source socket pointer to set
 * @pipes:	Pipe set, assigned on return
 */
static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock,
			   int reverse, int *from, int *to, int **pipes)
{
	if (!reverse) {
		*from = ref_sock;
		*to   = (*from == conn->a) ? conn->b : conn->a;
	} else {
		*to   = ref_sock;
		*from = (*to   == conn->a) ? conn->b : conn->a;
	}

	*pipes = *from == conn->a ? conn->pipe_a_b : conn->pipe_b_a;
}

/**
 * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 *
 * #syscalls:pasta splice
 */
void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
			     uint32_t events)
{
	uint8_t lowat_set_flag, lowat_act_flag;
	int from, to, *pipes, eof, never_read;
	uint64_t *seq_read, *seq_write;
	struct tcp_splice_conn *conn;

	if (ref.r.p.tcp.tcp.listen) {
		int s;

		if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
			return;

		if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
			return;

		setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
			   sizeof(int));

		conn = CONN(c->tcp.splice_conn_count++);
		conn->a = s;
		conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;

		if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index))
			conn_flag(c, conn, CLOSING);

		return;
	}

	conn = CONN(ref.r.p.tcp.tcp.index);

	if (conn->events == CLOSED)
		return;

	if (events & EPOLLERR)
		goto close;

	if (conn->events == CONNECT) {
		if (!(events & EPOLLOUT))
			goto close;
		if (tcp_splice_connect_finish(c, conn))
			goto close;
	}

	if (events & EPOLLOUT) {
		if (ref.r.s == conn->a)
			conn_event(c, conn, ~A_OUT_WAIT);
		else
			conn_event(c, conn, ~B_OUT_WAIT);

		tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes);
	} else {
		tcp_splice_dir(conn, ref.r.s, 0, &from, &to, &pipes);
	}

	if (events & EPOLLRDHUP) {
		if (ref.r.s == conn->a)
			conn_event(c, conn, A_FIN_RCVD);
		else
			conn_event(c, conn, B_FIN_RCVD);
	}

	if (events & EPOLLHUP) {
		if (ref.r.s == conn->a)
			conn_event(c, conn, A_FIN_SENT); /* Fake, but implied */
		else
			conn_event(c, conn, B_FIN_SENT);
	}

swap:
	eof = 0;
	never_read = 1;

	if (from == conn->a) {
		seq_read = &conn->a_read;
		seq_write = &conn->a_written;
		lowat_set_flag = RCVLOWAT_SET_A;
		lowat_act_flag = RCVLOWAT_ACT_A;
	} else {
		seq_read = &conn->b_read;
		seq_write = &conn->b_written;
		lowat_set_flag = RCVLOWAT_SET_B;
		lowat_act_flag = RCVLOWAT_ACT_B;
	}

	while (1) {
		int retry_write = 0, more = 0;
		ssize_t readlen, to_write = 0, written;

retry:
		readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size,
				 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
		trace("TCP (spliced): %li from read-side call", readlen);
		if (readlen < 0) {
			if (errno == EINTR)
				goto retry;

			if (errno != EAGAIN)
				goto close;

			to_write = c->tcp.pipe_size;
		} else if (!readlen) {
			eof = 1;
			to_write = c->tcp.pipe_size;
		} else {
			never_read = 0;
			to_write += readlen;
			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
				more = SPLICE_F_MORE;

			if (conn->flags & lowat_set_flag)
				conn_flag(c, conn, lowat_act_flag);
		}

eintr:
		written = splice(pipes[0], NULL, to, NULL, to_write,
				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
		trace("TCP (spliced): %li from write-side call (passed %lu)",
		      written, to_write);

		/* Most common case: skip updating counters. */
		if (readlen > 0 && readlen == written) {
			if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
				continue;

			if (conn->flags & lowat_set_flag &&
			    readlen > (long)c->tcp.pipe_size / 10) {
				int lowat = c->tcp.pipe_size / 4;

				setsockopt(from, SOL_SOCKET, SO_RCVLOWAT,
					   &lowat, sizeof(lowat));

				conn_flag(c, conn, lowat_set_flag);
				conn_flag(c, conn, lowat_act_flag);
			}

			break;
		}

		*seq_read  += readlen > 0 ? readlen : 0;
		*seq_write += written > 0 ? written : 0;

		if (written < 0) {
			if (errno == EINTR)
				goto eintr;

			if (errno != EAGAIN)
				goto close;

			if (never_read)
				break;

			if (retry_write--)
				goto retry;

			if (to == conn->a)
				conn_event(c, conn, A_OUT_WAIT);
			else
				conn_event(c, conn, B_OUT_WAIT);
			break;
		}

		if (never_read && written == (long)(c->tcp.pipe_size))
			goto retry;

		if (!never_read && written < to_write) {
			to_write -= written;
			goto retry;
		}

		if (eof)
			break;
	}

	if ((conn->events & A_FIN_RCVD) && !(conn->events & B_FIN_SENT)) {
		if (*seq_read == *seq_write && eof) {
			shutdown(conn->b, SHUT_WR);
			conn_event(c, conn, B_FIN_SENT);
		}
	}

	if ((conn->events & B_FIN_RCVD) && !(conn->events & A_FIN_SENT)) {
		if (*seq_read == *seq_write && eof) {
			shutdown(conn->a, SHUT_WR);
			conn_event(c, conn, A_FIN_SENT);
		}
	}

	if (CONN_HAS(conn, A_FIN_SENT | B_FIN_SENT))
		goto close;

	if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
		events = EPOLLIN;

		SWAP(from, to);
		if (pipes == conn->pipe_a_b)
			pipes = conn->pipe_b_a;
		else
			pipes = conn->pipe_a_b;

		goto swap;
	}

	if (events & EPOLLHUP)
		goto close;

	return;

close:
	conn_flag(c, conn, CLOSING);
}

/**
 * tcp_set_pipe_size() - Set usable pipe size, probe starting from MAX_PIPE_SIZE
 * @c:		Execution context
 */
static void tcp_set_pipe_size(struct ctx *c)
{
	int probe_pipe[TCP_SPLICE_PIPE_POOL_SIZE * 2][2], i, j;

	c->tcp.pipe_size = MAX_PIPE_SIZE;

smaller:
	for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) {
		if (pipe2(probe_pipe[i], 0)) {
			i++;
			break;
		}

		if (fcntl(probe_pipe[i][0], F_SETPIPE_SZ, c->tcp.pipe_size) < 0)
			break;
	}

	for (j = i - 1; j >= 0; j--) {
		close(probe_pipe[j][0]);
		close(probe_pipe[j][1]);
	}

	if (i == TCP_SPLICE_PIPE_POOL_SIZE * 2)
		return;

	if (!(c->tcp.pipe_size /= 2)) {
		c->tcp.pipe_size = MAX_PIPE_SIZE;
		return;
	}

	goto smaller;
}

/**
 * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
 * @c:		Execution context
 */
static void tcp_splice_pipe_refill(const struct ctx *c)
{
	int i;

	for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
		if (splice_pipe_pool[i][0][0] >= 0)
			break;
		if (pipe2(splice_pipe_pool[i][0], O_NONBLOCK))
			continue;
		if (pipe2(splice_pipe_pool[i][1], O_NONBLOCK)) {
			close(splice_pipe_pool[i][1][0]);
			close(splice_pipe_pool[i][1][1]);
			continue;
		}

		fcntl(splice_pipe_pool[i][0][0], F_SETPIPE_SZ,
		      c->tcp.pipe_size);
		fcntl(splice_pipe_pool[i][1][0], F_SETPIPE_SZ,
		      c->tcp.pipe_size);
	}
}

/**
 * tcp_splice_init() - Initialise pipe pool and size
 * @c:		Execution context
 */
void tcp_splice_init(struct ctx *c)
{
	memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
	tcp_set_pipe_size(c);
	tcp_splice_pipe_refill(c);
}

/**
 * tcp_splice_timer() - Timer for spliced connections
 * @c:		Execution context
 */
void tcp_splice_timer(struct ctx *c)
{
	struct tcp_splice_conn *conn;

	for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) {
		if (conn->flags & CLOSING) {
			tcp_splice_destroy(c, conn);
			return;
		}

		if ( (conn->flags & RCVLOWAT_SET_A) &&
		    !(conn->flags & RCVLOWAT_ACT_A)) {
			setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT,
				   &((int){ 1 }), sizeof(int));
			conn_flag(c, conn, ~RCVLOWAT_SET_A);
		}

		if ( (conn->flags & RCVLOWAT_SET_B) &&
		    !(conn->flags & RCVLOWAT_ACT_B)) {
			setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT,
				   &((int){ 1 }), sizeof(int));
			conn_flag(c, conn, ~RCVLOWAT_SET_B);
		}

		conn_flag(c, conn, ~RCVLOWAT_ACT_A);
		conn_flag(c, conn, ~RCVLOWAT_ACT_B);
	}

	tcp_splice_pipe_refill(c);
}

/**
 * tcp_splice_defer_handler() - Close connections without timer on file pressure
 * @c:		Execution context
 */
void tcp_splice_defer_handler(struct ctx *c)
{
	int max_conns = c->tcp.conn_count / 100 * TCP_SPLICE_CONN_PRESSURE;
	int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE;
	struct tcp_splice_conn *conn;

	if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns))
		return;

	for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) {
		if (conn->flags & CLOSING)
			tcp_splice_destroy(c, conn);
	}
}