aboutgitcodebugslistschat
path: root/tcp_splice.c
blob: a202715c7102d2ad1db3f0b370bcbdddd2f30b26 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
                                            













                                                                   


                                                                      
                                                                    
                


                                                                         





                                                                               




                                                                               
                   
                   



                   
                    

                   








                         
                
                       
                    
                  
                 
 
                       
 
                                                           
                                          
                                                                     
                                                                    
 




                                                                         

                              
                                                                       
 
                                                              

                                                                         
                                                                


                                                                       

                                                                           



                                                                      

                                                                          

  

                                         
                                                                 
 


                                                                      
                                                                               

                                                         
                                                                 
 
                                        
 
                                          



                                                            
                                             
                                        
         
 

                                                             

 






                                                                                
                                                    


                                                               
                                            
                                                                  
                                                   
                                                                  
                                                  


                                                                      
 
                                                       
 

                                                           
                                 
                                                                            






                              

   
                                                                           



                                              
                                                                           


                                            

                                            



                                           


                                                                  
                

                                           



                                       

                                                                              

         
                              

                                                                       
         



                                                                         
                                                                         



                                                                         




                                                                      
                                                                            

                                              
                                  

                                             



                                             

                                                                                
                

                                            



                                         

                                                                               


                                          
                                            



                                                                         
                                                                         


                                                                         

   
                                                                         
                                                    

                                                             
   
                                            
 
                                                         
                      
 
                                                
                             
 
                                              




                                                                       
                 
 
                                         

                                             
                 
 
                                                           

         
                                     
                        
                                 
 
                    








                                                                           
                                                         

                                                                  
                      
                  

                                              







                                                             
                 
 

                                                                              

                                                                               





                                                                    


                                                                                
                         
                 

         

                                                        







                                                                              

                                             



                                                                            
                                                                                
                                                                          
 







                                                   
                                                  



                                  








                                                     
 
                                                         
                                                     

                                                                           
         








                                               
                                          


                                                                                  
                                      
                 
 
                                                    
                
                                                        






                                                          



























                                                                                
                                                                               

                                                   
                                  
                                        
                                          
  
                                                                       

                             
                                                   
                                                                                
                                                                      
 
                                              
                                     
                             
                       
                     
 
                                      
 

                                                
                             
 

                                                 

                                                                
                                                    
                        


                                                 
 

                                                                              
 
                                    
                                
                                                         


                                            
                                  
                                                        

         
                                                           
                                            

                    


   
                                                                              
                                 
                               



                                   

                                                                
 

                                                               
                                               
                            

                                                
 
                                          

                       











                                                                         
                           
         
 
                                             






                                                       

                                                                           
                
                                

         


                                                                         




                       

                                                                         

                   
                                                       
                             

      

                                                                                 
                                                                    
                                                                     





















                                                                         

                                                                    
                                                                           

                                                                         









                                                                         

                                                                               







                                                                   

                                                                     










                                            

                                                                            














                                                                      
                                                                          
                                                                             

                                                        


                 
                                                                          
                                                                             

                                                        


                 
                                                    




                                                                      
                                     


                          


                              


               
                                    







                                                                                
                                                           



                                         
                                                         
                                                      












                                                                                
                                           













                                                             
                                                       



                                                         
                                                
                              
                                                                       
                                 
 
                                                               
                                              
                                                                                

                                                



         










                                                                         











                                                                            

















                                                                      






                                                                 



                                                                       




                                                     
                                
   
                                                            
 
                                                         
                 
 
                                         
 


                                                                          
 


                                                                              

                                                                               

                                                 
                 

         

                                            
 
// SPDX-License-Identifier: GPL-2.0-or-later

/* PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp_splice.c - direct namespace forwarding for local connections
 *
 * Copyright (c) 2020-2022 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

/**
 * DOC: Theory of Operation
 *
 *
 * For local traffic directed to TCP ports configured for direct
 * mapping between namespaces, packets are directly translated between
 * L4 sockets using a pair of splice() syscalls. These connections are
 * tracked by struct tcp_splice_conn entries in the @tc array, using
 * these events:
 *
 * - SPLICE_CONNECT:		connection accepted, connecting to target
 * - SPLICE_ESTABLISHED:	connection to target established
 * - OUT_WAIT_0:		pipe to accepted socket full, wait for EPOLLOUT
 * - OUT_WAIT_1:		pipe to target socket full, wait for EPOLLOUT
 * - FIN_RCVD_0:		FIN (EPOLLRDHUP) seen from accepted socket
 * - FIN_RCVD_1:		FIN (EPOLLRDHUP) seen from target socket
 * - FIN_SENT_0:		FIN (write shutdown) sent to accepted socket
 * - FIN_SENT_1:		FIN (write shutdown) sent to target socket
 *
 * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
 */

#include <sched.h>
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <time.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>

#include "util.h"
#include "passt.h"
#include "log.h"
#include "tcp_splice.h"
#include "siphash.h"
#include "inany.h"
#include "flow.h"

#include "flow_table.h"

/* Largest pipe size probed for, and fallback value, see tcp_set_pipe_size() */
#define MAX_PIPE_SIZE			(8UL * 1024 * 1024)
/* Number of pre-opened pipes in the pool, also used as pipe size probe count */
#define TCP_SPLICE_PIPE_POOL_SIZE	32
/* NOTE(review): the two thresholds below are not referenced in this file */
#define TCP_SPLICE_CONN_PRESSURE	30	/* % of conn_count */
#define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */

/* Pools for pre-opened sockets (in namespace) */
/* Pool slot checked by tcp_splice_refill() to decide if a refill is needed */
#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */

/* One pool per IP version, unused slots are negative (see tcp_splice_init()) */
static int ns_sock_pool4	[TCP_SOCK_POOL_SIZE];
static int ns_sock_pool6	[TCP_SOCK_POOL_SIZE];

/* Pool of pre-opened pipes */
/* Each entry is a pipe2() descriptor pair, entry [0] is negative if unused */
static int splice_pipe_pool		[TCP_SPLICE_PIPE_POOL_SIZE][2];

/* Helpers taking a struct tcp_splice_conn pointer (CONN() takes a flow index).
 * Pointer arguments are parenthesised so that expressions such as 'p + 1' or
 * '&array[i]' expand as intended.
 *
 * CONN_V6():	non-zero if the connection is flagged as IPv6
 * CONN_V4():	non-zero if the connection is not flagged as IPv6
 * CONN_HAS():	true if all the event bits in @set are set for the connection
 * CONN():	spliced connection entry for the flow at the given index
 */
#define CONN_V6(x)			((x)->flags & SPLICE_V6)
#define CONN_V4(x)			(!CONN_V6(x))
#define CONN_HAS(conn, set)		(((conn)->events & (set)) == (set))
#define CONN(idx)			(&FLOW(idx)->tcp_splice)

/* Display strings for connection events, indexed by the value fls() returns for
 * the event bit (see conn_event_do())
 */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
	"SPLICE_CONNECT", "SPLICE_ESTABLISHED", "OUT_WAIT_0", "OUT_WAIT_1",
	"FIN_RCVD_0", "FIN_RCVD_1", "FIN_SENT_0", "FIN_SENT_1",
};

/* Display strings for connection flags, indexed by the value fls() returns for
 * the flag bit (see conn_flag_do())
 */
static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
	"SPLICE_V6", "RCVLOWAT_SET_0", "RCVLOWAT_SET_1", "RCVLOWAT_ACT_0",
	"RCVLOWAT_ACT_1", "CLOSING",
};

/* Forward declarations: both functions are used before their definitions */
static int tcp_sock_refill_ns(void *arg);
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);

/**
 * tcp_splice_conn_epoll_events() - Compute epoll masks from connection events
 * @events:	Connection event flags
 * @ev:		Events to fill in: [0] for accepted socket, [1] for connecting one
 *
 * Only the .events member of both entries is written. Input is requested on a
 * side of an established connection as long as no FIN was sent to the other
 * side. While connecting, only writability of the connecting socket is
 * watched. EPOLLOUT is added for any side flagged as waiting for output.
 */
static void tcp_splice_conn_epoll_events(uint16_t events,
					 struct epoll_event ev[])
{
	uint32_t accepted = 0, target = 0;

	if (events & SPLICE_ESTABLISHED) {
		if (!(events & FIN_SENT_1))
			accepted = EPOLLIN | EPOLLRDHUP;
		if (!(events & FIN_SENT_0))
			target = EPOLLIN | EPOLLRDHUP;
	} else if (events & SPLICE_CONNECT) {
		target = EPOLLOUT;
	}

	if (events & OUT_WAIT_0)
		accepted |= EPOLLOUT;
	if (events & OUT_WAIT_1)
		target |= EPOLLOUT;

	ev[0].events = accepted;
	ev[1].events = target;
}

/**
 * tcp_splice_epoll_ctl() - Register or update both sockets in the epoll set
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Sockets are added if the connection isn't marked as being in the epoll set
 * yet, modified otherwise. The event masks are derived from the current
 * connection events.
 *
 * Return: 0 on success, negative error code on failure
 */
static int tcp_splice_epoll_ctl(const struct ctx *c,
				struct tcp_splice_conn *conn)
{
	const int op = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
	const union epoll_ref ref[SIDES] = {
		[0] = { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
			.flowside = FLOW_SIDX(conn, 0) },
		[1] = { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1],
			.flowside = FLOW_SIDX(conn, 1) },
	};
	struct epoll_event ev[SIDES] = {
		[0] = { .data.u64 = ref[0].u64 },
		[1] = { .data.u64 = ref[1].u64 },
	};
	unsigned side;

	tcp_splice_conn_epoll_events(conn->events, ev);

	/* Stop at the first failure: the other side is not attempted */
	for (side = 0; side < SIDES; side++) {
		int ret;

		if (!epoll_ctl(c->epollfd, op, conn->s[side], &ev[side]))
			continue;

		/* Save errno before logging */
		ret = -errno;
		flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno));
		return ret;
	}

	conn->in_epoll = true;

	return 0;
}

/**
 * conn_flag_do() - Set/unset given flag, log, drop sockets from epoll on CLOSING
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 *
 * A value with more than one bit set is taken as the complement of a flag to
 * be dropped. Nothing is done, and nothing is logged, if the operation
 * wouldn't change the connection flags.
 */
static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
			 unsigned long flag)
{
	const bool dropping = flag & (flag - 1);
	const int idx = fls(dropping ? ~flag : flag);

	if (dropping) {
		/* None of the bits to clear is currently set */
		if (!(conn->flags & ~flag))
			return;

		conn->flags &= flag;
		if (idx >= 0)
			flow_dbg(conn, "%s dropped", tcp_splice_flag_str[idx]);
	} else {
		/* Already set */
		if (conn->flags & flag)
			return;

		conn->flags |= flag;
		if (idx >= 0)
			flow_dbg(conn, "%s", tcp_splice_flag_str[idx]);
	}

	if (flag != CLOSING)
		return;

	/* Newly flagged as closing: stop watching both sockets right away */
	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
}

/* conn_flag() - Trace the calling function and line, then set or unset a
 * connection flag by means of conn_flag_do(), same arguments
 */
#define conn_flag(c, conn, flag)					\
	do {								\
		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
		conn_flag_do(c, conn, flag);				\
	} while (0)

/**
 * conn_event_do() - Set/unset connection event, log, update epoll state
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event to set, or ~event to unset
 *
 * A value with more than one bit set is taken as the complement of an event
 * to be cleared. If the operation actually changes the connection events, the
 * epoll registration is refreshed, and the connection is flagged as CLOSING
 * should that fail.
 */
static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
			  unsigned long event)
{
	const bool clearing = event & (event - 1);
	const int idx = fls(clearing ? ~event : event);

	if (clearing) {
		/* None of the bits to clear is currently set */
		if (!(conn->events & ~event))
			return;

		conn->events &= event;
		if (idx >= 0)
			flow_dbg(conn, "~%s", tcp_splice_event_str[idx]);
	} else {
		/* Already set */
		if (conn->events & event)
			return;

		conn->events |= event;
		if (idx >= 0)
			flow_dbg(conn, "%s", tcp_splice_event_str[idx]);
	}

	if (tcp_splice_epoll_ctl(c, conn))
		conn_flag(c, conn, CLOSING);
}

/* conn_event() - Trace the calling function and line, then set or unset a
 * connection event by means of conn_event_do(), same arguments
 */
#define conn_event(c, conn, event)					\
	do {								\
		flow_trace(conn, "event at %s:%i",__func__, __LINE__);	\
		conn_event_do(c, conn, event);				\
	} while (0)


/**
 * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed)
 * @flow:	Flow table entry for this connection
 *
 * Connections not flagged as CLOSING are left alone. For the others, pipes
 * and sockets of both sides are closed, counters are reset, and the entry is
 * marked as SPLICE_CLOSED.
 *
 * Return: true if the flow is ready to free, false otherwise
 */
bool tcp_splice_flow_defer(union flow *flow)
{
	struct tcp_splice_conn *conn = &flow->tcp_splice;
	unsigned side;

	if (!(conn->flags & CLOSING))
		return false;

	for (side = 0; side < SIDES; side++) {
		int *pipefd = conn->pipe[side];

		/* Flushing might need to block: pipes are closed rather than
		 * being returned to the pool.
		 */
		if (pipefd[0] >= 0) {
			close(pipefd[0]);
			close(pipefd[1]);
			pipefd[0] = -1;
			pipefd[1] = -1;
		}

		if (conn->s[side] >= 0) {
			close(conn->s[side]);
			conn->s[side] = -1;
		}

		conn->read[side] = 0;
		conn->written[side] = 0;
	}

	conn->events = SPLICE_CLOSED;
	conn->flags = 0;
	flow_dbg(conn, "CLOSED");

	return true;
}

/**
 * tcp_splice_connect_finish() - Completion of connect() or call on success
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Gives a pipe to each side of the connection, taken from the pool of
 * pre-opened pipes if one is available there, created on the spot otherwise,
 * then marks the connection as established if it wasn't already.
 *
 * Return: 0 on success, -EIO on failure
 */
static int tcp_splice_connect_finish(const struct ctx *c,
				     struct tcp_splice_conn *conn)
{
	unsigned side;
	int i = 0;

	for (side = 0; side < SIDES; side++) {
		/* The pool index is kept across sides: the slot used for the
		 * first side holds the descriptors swapped out of the
		 * connection, so the scan for the second side moves past it.
		 */
		for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
			if (splice_pipe_pool[i][0] >= 0) {
				SWAP(conn->pipe[side][0],
				     splice_pipe_pool[i][0]);
				SWAP(conn->pipe[side][1],
				     splice_pipe_pool[i][1]);
				break;
			}
		}

		/* Got a pipe from the pool, nothing else to do for this side */
		if (conn->pipe[side][0] >= 0)
			continue;

		if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) {
			flow_err(conn, "cannot create %d->%d pipe: %s",
				 side, !side, strerror(errno));
			conn_flag(c, conn, CLOSING);
			return -EIO;
		}

		/* F_SETPIPE_SZ returns the resulting capacity on success and
		 * -1 on failure: only negative values indicate an error. This
		 * is not fatal, the pipe is used with the size it has.
		 */
		if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ,
			  c->tcp.pipe_size) < 0) {
			flow_trace(conn,
				   "cannot set %d->%d pipe size to %zu",
				   side, !side, c->tcp.pipe_size);
		}
	}

	if (!(conn->events & SPLICE_ESTABLISHED))
		conn_event(c, conn, SPLICE_ESTABLISHED);

	return 0;
}

/**
 * tcp_splice_connect() - Create and connect socket for new spliced connection
 * @c:		Execution context
 * @conn:	Connection pointer
 * @af:		Address family
 * @pif:	pif on which to create socket
 * @port:	Destination port, host order
 *
 * The socket for side 1 is obtained for the given pif and connected to the
 * loopback address, IPv6 or IPv4 depending on the connection flags, at @port.
 * If connect() completes right away, pipes are set up as well, otherwise the
 * connection is left in SPLICE_CONNECT state.
 *
 * Return: 0 for connect() succeeded or in progress, negative value on error
 */
static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
			      sa_family_t af, uint8_t pif, in_port_t port)
{
	struct sockaddr_in6 addr6 = {
		.sin6_family = AF_INET6,
		.sin6_port = htons(port),
		.sin6_addr = IN6ADDR_LOOPBACK_INIT,
	};
	struct sockaddr_in addr4 = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr = IN4ADDR_LOOPBACK_INIT,
	};
	const struct sockaddr *sa;
	socklen_t sl;

	if (pif == PIF_HOST)
		conn->s[1] = tcp_conn_sock(c, af);
	else if (pif == PIF_SPLICE)
		conn->s[1] = tcp_conn_sock_ns(c, af);
	else
		ASSERT(0);

	if (conn->s[1] < 0)
		return -1;

	/* Not fatal: failure is only traced */
	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
		       &((int){ 1 }), sizeof(int))) {
		flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
			   conn->s[1]);
	}

	if (CONN_V6(conn)) {
		sa = (struct sockaddr *)&addr6;
		sl = sizeof(addr6);
	} else {
		sa = (struct sockaddr *)&addr4;
		sl = sizeof(addr4);
	}

	if (connect(conn->s[1], sa, sl)) {
		/* Save the error right away: logging might change errno, and
		 * a clobbered value (possibly zero) must not be reported to
		 * the caller.
		 */
		int ret = -errno;

		if (ret != -EINPROGRESS) {
			flow_trace(conn, "Couldn't connect socket for splice: %s",
				   strerror(-ret));
			return ret;
		}

		conn_event(c, conn, SPLICE_CONNECT);
		return 0;
	}

	/* Connected right away: set up pipes now */
	conn_event(c, conn, SPLICE_ESTABLISHED);
	return tcp_splice_connect_finish(c, conn);
}

/**
 * tcp_conn_sock_ns() - Obtain a connectable socket in the namespace
 * @c:		Execution context
 * @af:		Address family (AF_INET or AF_INET6)
 *
 * Return: socket fd in the namespace on success, -1 if no socket is available
 */
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
{
	int *pool = (af == AF_INET6) ? ns_sock_pool6 : ns_sock_pool4;
	int s = tcp_conn_pool_sock(pool);

	if (s < 0) {
		/* The pool is empty, so the latency of entering the namespace
		 * can't be avoided: use the chance to refill the whole pool,
		 * then try again. This differs from tcp_conn_sock().
		 */
		NS_CALL(tcp_sock_refill_ns, c);

		s = tcp_conn_pool_sock(pool);
	}

	if (s < 0) {
		err("TCP: No available ns sockets for new connection");
		return -1;
	}

	return s;
}

/**
 * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
 * @c:		Execution context
 * @ref:	epoll reference of listening socket
 * @flow:	flow to initialise
 * @s0:		Accepted (side 0) socket
 * @sa:		Peer address of connection
 *
 * Connections from non-loopback peers are not spliced and @flow is left
 * untouched for them. Otherwise the flow is started as a spliced connection
 * and a connection towards the other pif is attempted: if that fails, the
 * connection is flagged as CLOSING, but true is returned nevertheless.
 *
 * Return: true if able to create a spliced connection, false otherwise
 * #syscalls:pasta setsockopt
 */
bool tcp_splice_conn_from_sock(const struct ctx *c,
			       union tcp_listen_epoll_ref ref, union flow *flow,
			       int s0, const union sockaddr_inany *sa)
{
	in_port_t srcport, dstport = ref.port;
	struct tcp_splice_conn *conn;
	union inany_addr src;
	sa_family_t af;
	uint8_t pif1;
	int one = 1;

	ASSERT(c->mode == MODE_PASTA);

	inany_from_sockaddr(&src, &srcport, sa);
	if (!inany_is_loopback(&src))
		return false;

	if (inany_v4(&src))
		af = AF_INET;
	else
		af = AF_INET6;

	conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);

	/* No target socket and no pipes yet */
	conn->flags = (af == AF_INET6) ? SPLICE_V6 : 0;
	conn->s[0] = s0;
	conn->s[1] = -1;
	conn->pipe[0][0] = -1;
	conn->pipe[0][1] = -1;
	conn->pipe[1][0] = -1;
	conn->pipe[1][1] = -1;

	if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &one, sizeof(one)))
		flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);

	/* Connect on the pif opposite to the one the connection came from,
	 * with the port delta configured for that direction.
	 */
	if (ref.pif != PIF_SPLICE) {
		ASSERT(ref.pif == PIF_HOST);

		pif1 = PIF_SPLICE;
		dstport += c->tcp.fwd_in.delta[dstport];
	} else {
		pif1 = PIF_HOST;
		dstport += c->tcp.fwd_out.delta[dstport];
	}

	if (tcp_splice_connect(c, conn, af, pif1, dstport))
		conn_flag(c, conn, CLOSING);

	return true;
}

/**
 * tcp_splice_sock_handler() - Handler for socket mapped to spliced connection
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 *
 * Data is moved between the two sockets of the connection with a pair of
 * splice() calls per iteration: from the source socket into the pipe for that
 * direction, then from that pipe into the other socket. On EPOLLOUT, data is
 * moved towards the socket the event was reported for, otherwise away from
 * it. Any unrecoverable error flags the connection as CLOSING.
 *
 * #syscalls:pasta splice
 */
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
			     uint32_t events)
{
	struct tcp_splice_conn *conn = CONN(ref.flowside.flow);
	unsigned side = ref.flowside.side, fromside;
	uint8_t lowat_set_flag, lowat_act_flag;
	int eof, never_read;

	ASSERT(conn->f.type == FLOW_TCP_SPLICE);

	/* Already cleaned up by tcp_splice_flow_defer(): ignore stale events */
	if (conn->events == SPLICE_CLOSED)
		return;

	if (events & EPOLLERR) {
		int err, rc;
		socklen_t sl = sizeof(err);

		/* Fetch the pending socket error for logging only: the
		 * connection is closed in any case.
		 */
		rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
		if (rc)
			flow_err(conn, "Error retrieving SO_ERROR: %s",
				 strerror(errno));
		else
			flow_trace(conn, "Error event on socket: %s",
				   strerror(err));

		goto close;
	}

	/* Connection attempt still pending: EPOLLOUT is the only event
	 * requested in this state, anything else means failure.
	 */
	if (conn->events == SPLICE_CONNECT) {
		if (!(events & EPOLLOUT))
			goto close;
		if (tcp_splice_connect_finish(c, conn))
			goto close;
	}

	if (events & EPOLLOUT) {
		/* This socket can be written to again: stop waiting for
		 * output on it, and move data coming from the other side.
		 */
		fromside = !side;
		conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1);
	} else {
		fromside = side;
	}

	if (events & EPOLLRDHUP)
		/* For side 0 this is fake, but implied */
		conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1);

	/* Jumped to from below to handle the opposite direction as well, if
	 * both EPOLLIN and EPOLLOUT were reported.
	 */
swap:
	eof = 0;
	never_read = 1;

	lowat_set_flag = fromside == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
	lowat_act_flag = fromside == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;

	while (1) {
		ssize_t readlen, to_write = 0, written;
		int more = 0;

		/* Read side: from source socket to write end of the pipe */
retry:
		readlen = splice(conn->s[fromside], NULL,
				 conn->pipe[fromside][1], NULL, c->tcp.pipe_size,
				 SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
		flow_trace(conn, "%zi from read-side call", readlen);
		if (readlen < 0) {
			if (errno == EINTR)
				goto retry;

			if (errno != EAGAIN)
				goto close;

			/* Nothing could be read now: still try to write out up
			 * to a full pipe.
			 */
			to_write = c->tcp.pipe_size;
		} else if (!readlen) {
			/* Zero bytes read: end of file on source socket */
			eof = 1;
			to_write = c->tcp.pipe_size;
		} else {
			never_read = 0;
			to_write += readlen;
			/* Read at least 90% of the pipe size: hint on the write
			 * side that more data will follow.
			 */
			if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
				more = SPLICE_F_MORE;

			if (conn->flags & lowat_set_flag)
				conn_flag(c, conn, lowat_act_flag);
		}

		/* Write side: from read end of the pipe to the other socket */
eintr:
		written = splice(conn->pipe[fromside][0], NULL,
				 conn->s[!fromside], NULL, to_write,
				 SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
		flow_trace(conn, "%zi from write-side call (passed %zi)",
			   written, to_write);

		/* Most common case: skip updating counters. */
		if (readlen > 0 && readlen == written) {
			/* Read at least 10% of the pipe size: go on reading */
			if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
				continue;

			/* NOTE(review): this point is only reached with readlen
			 * below pipe_size * 10 / 100, and conn_flag() does
			 * nothing for a flag that's already set, so the
			 * condition below looks hard to satisfy and the first
			 * conn_flag() call redundant: confirm intent.
			 */
			if (conn->flags & lowat_set_flag &&
			    readlen > (long)c->tcp.pipe_size / 10) {
				int lowat = c->tcp.pipe_size / 4;

				setsockopt(conn->s[fromside], SOL_SOCKET,
					   SO_RCVLOWAT, &lowat, sizeof(lowat));

				conn_flag(c, conn, lowat_set_flag);
				conn_flag(c, conn, lowat_act_flag);
			}

			break;
		}

		/* Byte counters for this direction: only successful transfers
		 * are accounted for.
		 */
		conn->read[fromside]    += readlen > 0 ? readlen : 0;
		conn->written[fromside] += written > 0 ? written : 0;

		if (written < 0) {
			if (errno == EINTR)
				goto eintr;

			if (errno != EAGAIN)
				goto close;

			/* Nothing was read in this call, so there's no reason to
			 * wait for the destination to become writable.
			 */
			if (never_read)
				break;

			/* Destination can't take data now: wait for EPOLLOUT on
			 * the side being written to.
			 */
			conn_event(c, conn,
				   fromside == 0 ? OUT_WAIT_1 : OUT_WAIT_0);
			break;
		}

		/* Nothing read so far, but a full pipe was written out: try
		 * reading again.
		 */
		if (never_read && written == (long)(c->tcp.pipe_size))
			goto retry;

		/* Partial write of what was passed: adjust and try again */
		if (!never_read && written < to_write) {
			to_write -= written;
			goto retry;
		}

		if (eof)
			break;
	}

	/* FIN seen from side 0, not sent to side 1 yet: shut down writes to
	 * side 1 once end of file was reached and the counters for the
	 * direction just handled match.
	 */
	if ((conn->events & FIN_RCVD_0) && !(conn->events & FIN_SENT_1)) {
		if (conn->read[fromside] == conn->written[fromside] && eof) {
			shutdown(conn->s[1], SHUT_WR);
			conn_event(c, conn, FIN_SENT_1);
		}
	}

	/* Same as above, for a FIN seen from side 1 */
	if ((conn->events & FIN_RCVD_1) && !(conn->events & FIN_SENT_0)) {
		if (conn->read[fromside] == conn->written[fromside] && eof) {
			shutdown(conn->s[0], SHUT_WR);
			conn_event(c, conn, FIN_SENT_0);
		}
	}

	/* FIN sent in both directions: the connection is done */
	if (CONN_HAS(conn, FIN_SENT_0 | FIN_SENT_1))
		goto close;

	/* Both input and output were reported: output was handled above, now
	 * handle input as well, that is, the opposite direction.
	 */
	if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
		events = EPOLLIN;

		fromside = !fromside;
		goto swap;
	}

	if (events & EPOLLHUP)
		goto close;

	return;

	/* Only flag the connection here: sockets and pipes are closed by
	 * tcp_splice_flow_defer() for connections flagged as CLOSING.
	 */
close:
	conn_flag(c, conn, CLOSING);
}

/**
 * tcp_set_pipe_size() - Set usable pipe size, probe starting from MAX_PIPE_SIZE
 * @c:		Execution context
 *
 * Opens up to TCP_SPLICE_PIPE_POOL_SIZE pipes and tries to set all of them to
 * the candidate size. If that fails for any of them, the candidate size is
 * halved and the probe repeated. All the pipes opened for probing are closed
 * again. If no size works at all, the size is left at MAX_PIPE_SIZE.
 */
static void tcp_set_pipe_size(struct ctx *c)
{
	int probe_pipe[TCP_SPLICE_PIPE_POOL_SIZE][2];

	c->tcp.pipe_size = MAX_PIPE_SIZE;

	do {
		bool fits = true;
		int opened = 0, j;

		while (opened < TCP_SPLICE_PIPE_POOL_SIZE) {
			/* On failure, this slot holds no valid descriptors:
			 * it must not be counted as opened.
			 */
			if (pipe2(probe_pipe[opened], O_CLOEXEC)) {
				fits = false;
				break;
			}

			/* Count the pipe before trying to resize it, so that
			 * it's closed below even if F_SETPIPE_SZ fails.
			 */
			opened++;

			if (fcntl(probe_pipe[opened - 1][0], F_SETPIPE_SZ,
				  c->tcp.pipe_size) < 0) {
				fits = false;
				break;
			}
		}

		for (j = opened - 1; j >= 0; j--) {
			close(probe_pipe[j][0]);
			close(probe_pipe[j][1]);
		}

		if (fits)
			return;
	} while (c->tcp.pipe_size /= 2);

	/* Not even the smallest size could be set: fall back to the maximum,
	 * failures to set it on actual pipes are not fatal.
	 */
	c->tcp.pipe_size = MAX_PIPE_SIZE;
}

/**
 * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
 * @c:		Execution context
 *
 * Slots are filled starting from the first one, up to the first slot that
 * still holds a pipe. Slots for which a pipe can't be created are skipped.
 */
static void tcp_splice_pipe_refill(const struct ctx *c)
{
	int i;

	for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
		/* First slot still in use: stop here */
		if (splice_pipe_pool[i][0] >= 0)
			break;

		if (pipe2(splice_pipe_pool[i], O_NONBLOCK | O_CLOEXEC))
			continue;

		/* F_SETPIPE_SZ returns the resulting capacity on success and
		 * -1 on failure: only negative values indicate an error. This
		 * is not fatal, the pipe stays in the pool with the size it has.
		 */
		if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
			  c->tcp.pipe_size) < 0) {
			trace("TCP (spliced): cannot set pool pipe size to %zu",
			      c->tcp.pipe_size);
		}
	}
}

/**
 * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
 * @arg:	Execution context cast to void *
 *
 * Enters the namespace, then refills the socket pool of each IP version for
 * which an interface index is set in the context. Failures are only reported
 * as warnings.
 *
 * Return: 0
 */
static int tcp_sock_refill_ns(void *arg)
{
	const struct ctx *c = arg;
	int err4 = 0, err6 = 0;

	ns_enter(c);

	if (c->ifi4)
		err4 = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
	if (err4 < 0) {
		warn("TCP: Error refilling IPv4 ns socket pool: %s",
		     strerror(-err4));
	}

	if (c->ifi6)
		err6 = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
	if (err6 < 0) {
		warn("TCP: Error refilling IPv6 ns socket pool: %s",
		     strerror(-err6));
	}

	return 0;
}

/**
 * tcp_splice_refill() - Refill pools of resources needed for splicing
 * @c:		Execution context
 *
 * The namespace socket pools are refilled only if, for an IP version with an
 * interface index set, the slot at TCP_SOCK_POOL_TSH is empty. The pipe pool
 * is refilled unconditionally.
 */
void tcp_splice_refill(const struct ctx *c)
{
	const bool low4 = c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0;
	const bool low6 = c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0;

	if (low4 || low6)
		NS_CALL(tcp_sock_refill_ns, c);

	tcp_splice_pipe_refill(c);
}

/**
 * tcp_splice_init() - Initialise pipe pool and size
 * @c:		Execution context
 *
 * All pool slots, for pipes and for namespace sockets, are set to all-ones
 * bytes, that is, to negative descriptors marking them as empty. Then the
 * usable pipe size is probed and the namespace socket pools are filled.
 */
void tcp_splice_init(struct ctx *c)
{
	/* 0xff in every byte: each int slot reads as -1 */
	memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
	memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
	memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));

	tcp_set_pipe_size(c);

	NS_CALL(tcp_sock_refill_ns, c);
}

/**
 * tcp_splice_timer() - Timer for spliced connections
 * @c:		Execution context
 * @flow:	Flow table entry
 *
 * For each side with the receive low-water mark flagged as set, but without
 * the matching activity flag, SO_RCVLOWAT is set back to 1 and the "set" flag
 * is dropped. Activity flags for both sides are dropped on every run.
 *
 * Must not be called for connections flagged as CLOSING.
 */
void tcp_splice_timer(const struct ctx *c, union flow *flow)
{
	struct tcp_splice_conn *conn = &flow->tcp_splice;
	int side;

	ASSERT(!(conn->flags & CLOSING));

	for (side = 0; side < SIDES; side++) {
		uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
		uint8_t act = side == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
		int lowat = 1;

		/* Low-water mark not raised, or raised and recently active */
		if (!(conn->flags & set) || (conn->flags & act))
			continue;

		if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT,
			       &lowat, sizeof(lowat))) {
			flow_trace(conn, "can't set SO_RCVLOWAT on %d",
				   conn->s[side]);
		}
		conn_flag(c, conn, ~set);
	}

	conn_flag(c, conn, ~RCVLOWAT_ACT_0);
	conn_flag(c, conn, ~RCVLOWAT_ACT_1);
}