aboutgitcodebugslistschat
path: root/tcp.c
blob: 80458e3a9d678c8c170c7d1b849ba4ef1552b095 (plain) (tree)
1
2
3
4
5
6
7


                                             



                                         




                                              





                           

             






                                                                                
                                                                              








































                                                                                
                                                                           






                                                                              

                                                                          









                                                                                
                                                                               
                                                                               

                                                                              












                                                                               
                                                                              
                                                                                





                                                                                















                                                                                

                                                                               







                                                                               
                                                                                








                                                                         


                                                                     
                                                                 

                                                                        
                                                           
                                             
                                                                 

                                                                      
                                                                    
                                                           


                                                                 
  
                                                                              




                                                                          
  

                                                                       
                                                                 

                                                                      
                                                                 

                                                                 
                                                                          

                                                                              
  

                                                                                


                                                                 
                                                     
                                                                 















                                                                                


























                                                                                


























                                                                              
                        









                                                                               
                                                                      



                                                
                    








                                                                              
                    
                                                                            


                                                                      



















                                                                                


                   

                  









                         
                       
                       
                      





                       
                     
                 

                  
                    
                 
 


                                                    
                                          
 
                                                     
 
                                                               

                                                                         
 
                                         

                                                              
                                                                      

                                                                
                                            
                                          



                                               





                                                                     






                             
                                  


                   
                            


                           


                        
  
                                                     
 
                                                                            
                                                                  

                                                                           
                                                                  
                                                              





                                

                                



                         
                         
                         
                         



                         
                    
 
   
                                                                              

                                                                  
                                                                    





                                                                           
                                            





                                                                              


                                                                  
                                                                  
                                                                                
                                                                          


                                                                                

                                                                

                                  


                        









                                         
                             





                                   
                                 
                                 


                       

                            
                       
                              
 


                                   

                      


                        

  













                                                                                

                          
               



                              

  























                                                                            




                                                                        

                                                                                








































                                                                              
                                                                              














































                                                                              





                                                                    
 


                                                           

                                                               
 


                                                   
 

                                                                   

   


                                                                             
   
                                                                          
 


                                                                            


   

                                                                          
                                       
   
                                                                                
 


                                                                           


   













































































































































                                                                               


                                                                


                                                                


                                                                      

                                                                     
 

                             
 

                                   
 

                             
 
                                                      



                                  
                                   

                              

                                            

                                 















                                                             

                                                             






                  
                                                                           







                                                                

                                                                        














                                                                       
                                                                           
                                 






                                                                

                                                                     
 
                       











                                                        
                                                                   










                                                         
                                                                    





                                                       
                                                                    
                                 
                                  

                                                                
   

                                                                     


              






                                                                                


   

                                                                    
   
                                                      
 

                                                 
 

                                                                            


                                                        

























                                                                               

                 



                                                                             


   
                                                                        
                                 




                                                                
                                                          
   


                                                                              
 

                                                           
 


                                                                        

         
                    


   





















                                                                               
                                 
                                                     
   
                                                                           
 
                                       
                        














                                                                                


                                            



                                                                        


   
                                                                           
                                 
                                  
   
                                                                     
 
                                  

                       




                                                               

 

                                                              


                                                                               
                                  
                                

                              
  
                                                               
   

                                                                    
 
                                                                        

                                           
                                     
                                    

                          

                                                                  


                                                              
                                                

                                                        






                                                                            



                                                                 
 

                                                                 
                                                       



                                  
                                   
 
                                    
                                                                               

                                      
                                                             

                                            
 


                                                                     
                                                      
 

                                                                      
                                          
 

                                             
                                     
 
                                                         

                 
                                                                              
                                                    
                

                                                  

         

                                


                                                                           





                                                                            
                            
 


                                                                  





                                                                               

                                                                      
 











                                                                               










                                          

                                            
 


                                               

                                                                
                                                                         


                                                   
 


                                       




                              

                                                                         
 


                                   



                 













                                                                            
                                                                           
                                  


                                                                          
                                                              
   
                                                                          
                                                                    
 
                   


                                                                       



                                                                              
                   

                                                     
                



                                                               
 












                                                                            






                                                                 



         





                                                                           
                                 



                                                               

                                                                  
 
                             
 








                                                       
                                            


                                           
                                                                      












                                                        
                                                                      

         

                                                                               




                        





                                                                         
                                 

                                                                

                                                            










                                                      
                                                       
                                  
                                  
                              


                     
                                                   

                       

                                                                         
                       
 




                                                                          








                                                                        


                                            



                                                                     


                                                                    
 








                                                              
 

                                                                 
 
                                              

                            
                                               

                                   


                                                                        
                
                                               

                                   
                                                                   

         

                                           
 
                                                               
 
                                       
 


                                                         
 
                                                                                
                                                 
                                                      
 
                                           
 
                                 

                                                  
                                           
                                         


                               
                                                  
                
                                                  
 

                                                                 

                                                 

         
                                 

                                  
                                                     


   
                                                              
                                 
                                                     
   

                                                                  
 
                                                                           
                                                              
                                                                         





                                                            

                       















                                                               

                                                                 

         

                                        
 


                                                                   
 
   
                                                                             





                                                                           


                             


                                                     
                                                   
                                                     
                                                   


                                                     
                                                   
                                                     
                                                   










                                                                       
                                                            

                                                         


                       
         


   
                                                                               
                                  
                                        
   
                                                                         
 
                        

                                                  
                                                  
 



                                                                           
 
                                
                       
 

                                                                         
 
                                         




                                                                              
                                  
                                 
  
                                                    
   

                                                                       
 
                                                                         
                                                                     
                                                                    
                                               
                                               



                                       
                           


                                                                 







                                                                       
                         
         
 

                                                                 
                                         
                                           



                                                                              
 









                                                                                





                                                     
 

                                                                                
                    




                                                           
 
                                                        
         
                                                                             


                                      
                         
         
 



                                  

                                                                    
                                     


                                                             





















                                                                              





                                                         
 
                                                                              
 
                                                  
                                                                                





                                                                            
 


                                                                      
 

                                                                         
 





                                                                               
                                                                      
                 




                                                       
                               

                                                                   



















                                                                        
                                                      





















                                                                            

                                                                  
















                                                                           
                                                      


























                                                                            
 
                                                                  
                 
 
                                   




                                  
         

                                                                    
                                      

                              


























                                                                                
 





                                                      
         


                             

                                                  
 





                                                                       

    
                             
 






                                                             


   










                                                                              
                                                            


                                                      
                                                  




























                                                                





                                                                        



                                                                            


                                                                 

                                                                


                                                                            
                                                      
                         




                                


                                 






                                                                     
                                                             





                                                                        
                                                                                

                                                                             




                                                    




                                                                         
                                       








                                                                
                              
                                  


                                     

         

                                                            




                                                    


                                                 
                       

         
                   
                         
 
                              
      

                                                                    


                                   










                                                                       

                                          

         



                                                       
                       
         







                                                               
                                                     

                       
 
                                         
                                              
                                                
                                                       
                                               
                                              

                                                       



         


                                                                    
                                   

                              
                                 

                                    
   
                                                      
                                                                         
 
                                                        
                                        
                                  
               

                              
                         


                                           
                         
 

                                                                                
                                        
                                                                     
                         


                      
                                         
                         

         
                            
 
                              
                           
                                           
                                         
                                 
                 
 

                                                      


                                                                            
 








                                                                      
 

                                                              
                                                                   
                                         
 


                                                               
                                 

                 


                                                                               


                                                             
 
                                                 



                                                                           
                                                 
                                                     
 
                                                                  
                      

                              
                                                      
                                                               
                                                        



                               
                                         
                                 

                 

                                                      
                                                 

                         
                                  
                                       
                        


                                                                     

                                                            
                          
                      


                                


                             


                                

                 






                                                                            
                                                                        
 



                        

                                                                           


                       
                                                         

                       
                                                                        
                                                          
 





























                                                                           










                                                              



                                                           
                                                                     





                                                                           





















                                                                              
                                                           
                                                                    
                                                         




































































































                                                                           
 

                                                        



















                                                                             
                     




                                                   

                                                                       






                                            



                                                                     


                                                                      
                                                            

                                                                               










                                                                             
                                                         



                                                                    
                                                               



                                                                       
                                                                   
                                                                           










                                                                            
                                                         












                                                                  
                                              















                                                                              
                                            
                                     
                              





















                                                                        




                                                                 


                                         
                                                       


                                            
                                                  
                                                                       
 

                                                                         



















                                                   















                                                                        
     

                
                   

                                                    

      



















                                                                         

                 


                                                                         
 









                                                              
 

                                            
 
                                       
                                      



                                           
                                                                    






















                                                                           
                         











                                                               













                                                                      





                                    




                                                   
                               
                                   
                                 
   
                                                                          
                                           
 
                                  
 



                                                        
 

                                                


                       

                                  


                                          


                       





                                                    
                       


                                       
                                                 




                                                                          
                                                         




                                                                   
                                                 



















                                                                               
                 










                                
         


                                 


   












                                                                         


                                                             
 



                                                                              

                                                                             
                 
 

                                    

                                                                              






                 
                                                                               





                                      
                                                   

                       
                                                                                
 











                                                                             
 
                                                    
                                                

                                                                      
                         
                 
 

                                    
 



                                                                            
 
                                                    
                                                

                                                                       
                         
                 
         
 



                                     
                                     
 

                                             
 



                 
                                                           
                                 
                                  
                                     
   

                                                                   
 
                                                                 
                                                           
                                                         
 
                              
                           
                          
                                             
                                         
 
                      
                                       
                                               
                                         



                                   
                                  



                                                                    
 


                                                                 
                              

                 
                                             
                                                                      
                                                                     

                 
                                                         
                                                                        
                                                                              




                                                                          

                                                                        


                         

                                                               
 


                                                             

                        
                                 
                                                       
                                         
                      



                                          
                      
                                               
                                         
                      
                          


                                


                             





                      
                                                                          
                                 
                                     
   
                                                  
 
                                          


                       
                                                                      

                                         

                                                                 
                                                 
                                                         


                 
// SPDX-License-Identifier: AGPL-3.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp.c - TCP L2-L4 translation state machine
 *
 * Copyright (c) 2020-2021 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

/**
 * DOC: Theory of Operation
 *
 *
 * PASST mode
 * ==========
 *
 * This implementation maps TCP traffic between a single L2 interface (tap) and
 * native TCP (L4) sockets, mimicking and reproducing as closely as possible the
 * inferred behaviour of applications running on a guest, connected via said L2
 * interface. Four connection flows are supported:
 * - from the local host to the guest behind the tap interface:
 *   - this is the main use case for proxies in service meshes
 *   - we bind to configured local ports, and relay traffic between L4 sockets
 *     with local endpoints and the L2 interface
 * - from remote hosts to the guest behind the tap interface:
 *   - this might be needed for services that need to be addressed directly,
 *     and typically configured with special port forwarding rules (which are
 *     not needed here)
 *   - we also relay traffic between L4 sockets with remote endpoints and the L2
 *     interface
 * - from the guest to the local host:
 *   - this is not observed in practice, but implemented for completeness and
 *     transparency
 * - from the guest to external hosts:
 *   - this might be needed for applications running on the guest that need to
 *     directly access internet services (e.g. NTP)
 *
 * Relevant goals are:
 * - transparency: sockets need to behave as if guest applications were running
 *   directly on the host. This is achieved by:
 *   - avoiding port and address translations whenever possible
 *   - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
 *     socket option) and TCP headers of packets coming from the tap interface,
 *     reapplying those parameters in both flow directions (including TCP_MSS,
 *     TCP_WINDOW_CLAMP socket options)
 * - simplicity: only a small subset of TCP logic is implemented here and
 *   delegated as much as possible to the TCP implementations of guest and host
 *   kernel. This is achieved by:
 *   - avoiding a complete TCP stack reimplementation, with a modified TCP state
 *     machine focused on the translation of observed states instead
 *   - mirroring TCP dynamics as described above and hence avoiding the need for
 *     segmentation, explicit queueing, and reassembly of segments
 * - security:
 *   - no dynamic memory allocation is performed
 *   - TODO: synflood protection
 *   - TODO: sequence collision attacks
 *
 * Portability is limited by usage of Linux-specific socket options.
 *
 *
 * Limits
 * ------
 *
 * To avoid the need for dynamic memory allocation, a maximum, reasonable amount
 * of connections is defined by MAX_TAP_CONNS below (currently 1M, close to
 * the maximum amount of file descriptors typically available to a process on
 * Linux).
 *
 * While fragmentation and reassembly are not implemented, tracking of missing
 * segments and retransmissions needs to be, thus data needs to linger on
 * sockets as long as it's not acknowledged by the guest, and read using
 * MSG_PEEK into a single, preallocated static buffer sized to the maximum
 * supported window, 16MiB. This imposes a practical limitation on window
 * scaling, that is, the maximum factor is 512. If a bigger window scaling
 * factor is observed during connection establishment, connection is reset and
 * reestablished by omitting the scaling factor in the SYN segment. This
 * limitation only applies to the window scaling advertised by the guest, but
 * if exceeded, no window scaling will be allowed at all toward either endpoint.
 *
 *
 * Ports
 * -----
 *
 * To avoid the need for ad-hoc configuration of port forwarding or allowed
 * ports, listening sockets can be opened and bound to all unbound ports on the
 * host, as far as process capabilities allow. This service needs to be started
 * after any application proxy that needs to bind to local ports. Mapped ports
 * can also be configured explicitly.
 *
 * No port translation is needed for connections initiated remotely or by the
 * local host: source port from socket is reused while establishing connections
 * to the guest.
 * 
 * For connections initiated by the guest, it's not possible to force the same
 * source port as connections are established by the host kernel: that's the
 * only port translation needed.
 *
 *
 * Connection tracking and storage
 * -------------------------------
 *
 * Connections are tracked by the @tt array of struct tcp_tap_conn, containing
 * addresses, ports, TCP states and parameters. This is statically allocated and
 * indexed by an arbitrary connection number. The array is compacted whenever a
 * connection is closed, by remapping the highest connection index in use to the
 * one freed up.
 *
 * References used for the epoll interface report the connection index used for
 * the @tt array.
 *
 * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
 * separate data structures depending on the protocol version.
 *
 * - Inbound connection requests (to the guest) are mapped using the triple
 *   < source IP address, source port, destination port >
 * - Outbound connection requests (from the guest) are mapped using the triple
 *   < destination IP address, destination port, source port >
 *   where the source port is the one used by the guest, not the one used by the
 *   corresponding host socket
 *
 *
 * Initialisation
 * --------------
 *
 * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
 * IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
 * to bind (for low ports, or ports already bound, e.g. by a proxy). These are
 * added to the epoll list, with no separate storage.
 *
 *
 * States and events
 * -----------------
 *
 * These states apply to connected sockets only, listening sockets are always
 * open after initialisation, in LISTEN state. A single state is maintained for
 * both sides of the connection, and some states are omitted as they are already
 * handled by host kernel and guest.
 *
 * - CLOSED			no connection
 *   No associated events: this is always a final state, new connections
 *   directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below.
 *
 * - TAP_SYN_SENT		connect() in progress, triggered from tap
 *   - connect() completes	SYN,ACK to tap > TAP_SYN_RCVD
 *   - connect() aborts		RST to tap, close socket > CLOSED
 *
 * - SOCK_SYN_SENT		new connected socket, SYN sent to tap
 *   - SYN,ACK from tap		ACK to tap > ESTABLISHED
 *   - SYN,ACK timeout		RST to tap, close socket > CLOSED
 *
 * - TAP_SYN_RCVD		connect() completed, SYN,ACK sent to tap
 *   - FIN from tap		write shutdown > FIN_WAIT_1
 *   - ACK from tap		> ESTABLISHED
 *   - ACK timeout		RST to tap, close socket > CLOSED
 *
 * - ESTABLISHED		connection established, ready for data
 *   - EPOLLRDHUP		read shutdown > ESTABLISHED_SOCK_FIN
 *   - FIN from tap		write shutdown > FIN_WAIT_1
 *   - EPOLLHUP			RST to tap, close socket > CLOSED
 *   - data timeout		read shutdown, FIN to tap >
 * 				ESTABLISHED_SOCK_FIN_SENT
 *
 * - ESTABLISHED_SOCK_FIN	socket closing connection, reading half closed
 *   - zero-sized socket read	FIN,ACK to tap > ESTABLISHED_SOCK_FIN_SENT
 *
 * - ESTABLISHED_SOCK_FIN_SENT	socket closing connection, FIN sent to tap
 *   - ACK (for FIN) from tap	> CLOSE_WAIT
 *   - tap ACK timeout		RST to tap, close socket > CLOSED
 *
 * - CLOSE_WAIT			socket closing connection, ACK from tap
 *   - FIN from tap		write shutdown > LAST_ACK
 *   - data timeout		RST to tap, close socket > CLOSED
 * 
 * - LAST_ACK			socket started close, tap completed it
 *   - any event from socket	ACK to tap, close socket > CLOSED
 *   - ACK timeout		RST to tap, close socket > CLOSED
 *
 * - FIN_WAIT_1			tap closing connection, FIN sent to socket
 *   - EPOLLRDHUP		FIN,ACK to tap, shutdown > FIN_WAIT_1_SOCK_FIN
 *   - socket timeout		RST to tap, close socket > CLOSED
 *
 * - FIN_WAIT_1_SOCK_FIN	tap closing connection, FIN received from socket
 *   - ACK from tap		close socket > CLOSED
 *   - tap ACK timeout		RST to tap, close socket > CLOSED
 *
 * - from any state
 *   - RST from tap		close socket > CLOSED
 *   - socket error		RST to tap, close socket > CLOSED
 *
 * Connection setup
 * ----------------
 *
 * - inbound connection (from socket to guest): on accept() from listening
 *   socket, the new socket is mapped in connection tracking table, and
 *   three-way handshake initiated towards the guest, advertising MSS and window
 *   size and scaling from socket parameters
 * - outbound connection (from guest to socket): on SYN segment from guest, a
 *   new socket is created and mapped in connection tracking table, setting
 *   MSS and window clamping from header and option of the observed SYN segment
 *
 * 
 * Aging and timeout
 * -----------------
 *
 * A bitmap of TCP_MAX_CONNS bits indicate the connections subject to timed
 * events based on states:
 * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment
 *   from tap expires, connection is reset (RST to tap, socket closed)
 * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from
 *   tap expires, connection is reset (RST to tap, socket closed)
 * - TAP_SYN_SENT: connect() is pending, timeout is handled implicitly by
 *   connect() timeout, connection will be reset in case
 * - ESTABLISHED, ESTABLISHED_SOCK_FIN: if an ACK segment to tap is pending,
 *   bytes acknowledged by socket endpoint are checked every 50ms (one quarter
 *   of current TCP_DELACK_MAX on Linux)
 * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a timeout of 3s (TODO: implement
 *   requirements from RFC 6298) waiting for an ACK segment from tap expires,
 *   data from socket queue is retransmitted starting from the last ACK sequence
 * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a two hours (current
 *   TCP_KEEPALIVE_TIME on Linux) timeout waiting for any activity expires,
 *   connection is reset (RST to tap, socket closed)
 * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK
 *   segment from tap expires, connection is reset (RST to tap, socket closed)
 * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from tap
 *   expires, connection is reset (RST to tap, socket closed)
 * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from
 *   socet expires, connection is reset (RST to tap, socket closed)
 * - FIN_WAIT_1_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK segment
 *   from tap expires, connection is reset (RST to tap, socket closed)
 * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from
 *   socket expires, connection is reset (RST to tap, socket closed)
 *
 *
 * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states)
 * ----------------------------------------------------------
 *
 * @seq_to_tap:		next sequence for packets to tap
 * @seq_ack_from_tap:	last ACK number received from tap
 * @seq_from_tap:	next sequence for packets from tap (not actually sent)
 * @seq_ack_to_tap:	last ACK number sent to tap
 *
 * @seq_init_from_tap:	initial sequence number from tap
 *
 * @tap_window:		last window size received from tap, scaled
 * @tcpi_acked_last:	most recent value of tcpi_bytes_acked (TCP_INFO)
 * 
 * - from socket to tap:
 *   - on new data from socket:
 *     - peek into buffer
 *     - send data to tap:
 *       - starting at offset (@seq_to_tap - @seq_ack_from_tap)
 *       - in MSS-sized segments
 *       - increasing @seq_to_tap at each segment
 *       - up to window (until @seq_to_tap - @seq_ack_from_tap <= @tap_window)
 *       - mark socket in bitmap for periodic ACK check, set @last_ts_to_tap
 *     - on read error, send RST to tap, close socket
 *     - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN
 *   - on ACK from tap:
 *     - set @ts_ack_tap
 *     - check if it's the second duplicated ACK
 *     - consume buffer by difference between new ack_seq and @seq_ack_from_tap
 *     - update @seq_ack_from_tap from ack_seq in header
 *     - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
 *       resend with steps listed above
 *     - set TCP_WINDOW_CLAMP from TCP header from tap
 *     - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, umark otherwise
 *   - periodically:
 *     - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer
 *       (TODO: implement requirements from RFC 6298, currently 3s fixed) from
 *       @ts_sock elapsed, reset @seq_to_tap to @seq_ack_from_tap, and
 *       resend data with the steps listed above
 *
 * - from tap to socket:
 *   - on packet from tap:
 *     - set @ts_tap
 *     - set TCP_WINDOW_CLAMP from TCP header from tap
 *     - check seq from header against @seq_from_tap, if data is missing, send
 *       two ACKs with number @seq_ack_to_tap, discard packet
 *     - otherwise queue data to socket, set @seq_from_tap to seq from header
 *       plus payload length
 *     - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last,
 *       set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
 *       to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
 *       send ACK to tap
 *   - periodically:
 *     - query socket for TCP_INFO, on tcpi_bytes_acked > @tcpi_acked_last, 
 *       set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap
 *       to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
 *       send ACK to tap
 *
 *
 * PASTA mode
 * ==========
 *
 * For traffic directed to TCP ports configured for mapping to the tuntap device
 * in the namespace, and for non-local traffic coming from the tuntap device,
 * the implementation is identical as the PASST mode described in the previous
 * section.
 *
 * For local traffic directed to TCP ports configured for direct mapping between
 * namespaces, the implementation is substantially simpler: packets are directly
 * translated between L4 sockets using a pair of splice() syscalls. These
 * connections are tracked in the @ts array of struct tcp_splice_conn, using
 * just four states:
 *
 * - CLOSED:			no connection
 * - SPLICE_ACCEPTED:		accept() on the listening socket succeeded
 * - SPLICE_CONNECT:		connect() issued in the destination namespace
 * - SPLICE_ESTABLISHED:	connect() succeeded, packets are transferred
 */

#define _GNU_SOURCE
#include <sched.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <time.h>

#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "pcap.h"

#define MAX_TAP_CONNS			(128 * 1024)
#define MAX_SPLICE_CONNS		(128 * 1024)

#define TCP_TAP_FRAMES			32

#define PIPE_SIZE			(1024 * 1024)

#define TCP_HASH_TABLE_LOAD		70		/* % */
#define TCP_HASH_TABLE_SIZE		(MAX_TAP_CONNS * 100 /		\
					 TCP_HASH_TABLE_LOAD)

#define MAX_WS				9
#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
#define MSS_DEFAULT			536
#define WINDOW_DEFAULT			14600		/* RFC 6928 */

#define SYN_TIMEOUT			240000		/* ms */
#define ACK_TIMEOUT			2000
#define ACK_INTERVAL			50
#define ACT_TIMEOUT			7200000
#define FIN_TIMEOUT			240000
#define LAST_ACK_TIMEOUT		240000


/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
 * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
 */
#define SOL_TCP				IPPROTO_TCP

enum tcp_state {
	CLOSED = 0,
	TAP_SYN_SENT,
	SOCK_SYN_SENT,
	TAP_SYN_RCVD,
	ESTABLISHED,
	ESTABLISHED_SOCK_FIN,
	ESTABLISHED_SOCK_FIN_SENT,
	CLOSE_WAIT,
	LAST_ACK,
	FIN_WAIT_1,
	FIN_WAIT_1_SOCK_FIN,
	SPLICE_ACCEPTED,
	SPLICE_CONNECT,
	SPLICE_ESTABLISHED,
	SPLICE_FIN_FROM,
	SPLICE_FIN_TO,
	SPLICE_FIN_BOTH,
};
#define TCP_STATE_STR_SIZE	(SPLICE_FIN_BOTH + 1)

static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = {
	"CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD",
	"ESTABLISHED", "ESTABLISHED_SOCK_FIN", "ESTABLISHED_SOCK_FIN_SENT",
	"CLOSE_WAIT", "LAST_ACK", "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN",
	"SPLICE_ACCEPTED", "SPLICE_CONNECT", "SPLICE_ESTABLISHED",
	"SPLICE_FIN_FROM", "SPLICE_FIN_TO", "SPLICE_FIN_BOTH",
};

#define FIN		(1 << 0)
#define SYN		(1 << 1)
#define RST		(1 << 2)
#define ACK		(1 << 4)
/* Flags for internal usage */
#define ZERO_WINDOW	(1 << 5)

#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPT_MSS_LEN	4
#define OPT_WS		3
#define OPT_WS_LEN	3
#define OPT_SACKP	4
#define OPT_SACK	5
#define OPT_TS		8

struct tcp_tap_conn;

/**
 * struct tcp_tap_conn - Descriptor for a TCP connection via tap (not spliced)
 * @next:		Pointer to next item in hash chain, if any
 * @sock:		Socket descriptor number
 * @hash_bucket:	Bucket index in connection lookup hash table
 * @a.a6:		IPv6 remote address, can be IPv4-mapped
 * @a.a4.zero:		Zero prefix for IPv4-mapped, see RFC 6890, Table 20
 * @a.a4.one:		Ones prefix for IPv4-mapped
 * @a.a4.a:		IPv4 address
 * @tap_port:		Guest-facing tap port
 * @sock_port:		Remote, socket-facing port
 * @state:		TCP connection state
 * @seq_to_tap:		Next sequence for packets to tap
 * @seq_ack_from_tap:	Last ACK number received from tap
 * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
 * @seq_ack_to_tap:	Last ACK number sent to tap
 * @seq_init_from_tap:	Initial sequence number from tap
 * @tcpi_acked_last:	Most recent value of tcpi_bytes_acked (TCP_INFO query)
 * @ws_allowed:		Window scaling allowed
 * @ws:			Window scaling factor
 * @tap_window:		Last window size received from tap, scaled
 * @window_clamped:	Window was clamped on socket at least once
 * @no_snd_wnd:		Kernel won't report window (without commit 8f7baad7f035)
 * @tcpi_acked_last:	Most recent value of tcpi_snd_wnd (TCP_INFO query)
 * @ts_sock:		Last activity timestamp from socket for timeout purposes
 * @ts_tap:		Last activity timestamp from tap for timeout purposes
 * @ts_ack_tap:		Last ACK segment timestamp from tap for timeout purposes
 * @mss_guest:		Maximum segment size advertised by guest
 */
struct tcp_tap_conn {
	struct tcp_tap_conn *next;
	int sock;
	int hash_bucket;

	union {
		struct in6_addr a6;
		struct {
			uint8_t zero[10];
			uint8_t one[2];
			struct in_addr a;
		} a4;
	} a;
	in_port_t tap_port;
	in_port_t sock_port;
	enum tcp_state state;

	uint32_t seq_to_tap;
	uint32_t seq_ack_from_tap;
	uint32_t seq_from_tap;
	uint32_t seq_ack_to_tap;
	uint32_t seq_init_from_tap;
	uint32_t seq_init_to_tap;
	uint64_t tcpi_acked_last;

	int ws_allowed;
	int ws;
	uint32_t tap_window;
	int window_clamped;
	int no_snd_wnd;
	uint32_t tcpi_snd_wnd;

	struct timespec ts_sock;
	struct timespec ts_tap;
	struct timespec ts_ack_tap;

	int mss_guest;
	uint32_t sndbuf;

	uint32_t events;
};

/**
 * struct tcp_splice_conn - Descriptor for a spliced TCP connection
 * @from:		File descriptor number of socket for accepted connection
 * @pipe_from_to:	Pipe ends for splice() from @from to @to
 * @to:			File descriptor number of peer connected socket
 * @pipe_to_from:	Pipe ends for splice() from @to to @from
 * @state:		TCP connection state
*/
struct tcp_splice_conn {
	int from;
	int pipe_from_to[2];
	int to;
	int pipe_to_from[2];
	enum tcp_state state;
	int from_fin_sent;
	int to_fin_sent;
	int v6;
	uint64_t from_read;
	uint64_t from_written;
	uint64_t to_read;
	uint64_t to_written;
};

/* Port re-mappings as delta, indexed by original destination port */
static in_port_t		tcp_port_delta_to_tap	[USHRT_MAX];
static in_port_t		tcp_port_delta_to_init	[USHRT_MAX];

/**
 * tcp_remap_to_tap() - Set delta for port translation toward guest/tap
 * @port:	Original destination port, host order
 * @delta:	Delta to be added to original destination port
 */
void tcp_remap_to_tap(in_port_t port, in_port_t delta)
{
	tcp_port_delta_to_tap[port] = delta;
}

/**
 * tcp_remap_to_tap() - Set delta for port translation toward init namespace
 * @port:	Original destination port, host order
 * @delta:	Delta to be added to original destination port
 */
void tcp_remap_to_init(in_port_t port, in_port_t delta)
{
	tcp_port_delta_to_init[port] = delta;
}

/* Static buffers */

/**
 * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
 * @psum:	Partial IP header checksum (excluding tot_len and saddr)
 * @tsum:	Partial TCP header checksum (excluding length and saddr)
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @vnet_len:	4-byte qemu vnet buffer length descriptor, only for passt mode
 * @eh:		Pre-filled Ethernet header
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @uh:		Headroom for TCP header
 * @data:	Storage for TCP payload
 */
__extension__  static struct tcp4_l2_buf_t {
	uint32_t psum;		/* 0 */
	uint32_t tsum;		/* 4 */
#ifdef __AVX2__
	uint8_t pad[18];	/* 8, align th to 32 bytes */
#endif

	uint32_t vnet_len;	/* 26 */
	struct ethhdr eh;	/* 30 */
	struct iphdr iph;	/* 44 */
	struct tcphdr th;	/* 64 */
	uint8_t data[USHRT_MAX - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_TAP_FRAMES] = {
	[ 0 ... TCP_TAP_FRAMES - 1 ] = {
		0, 0,
#ifdef __AVX2__
		{ 0 },
#endif
		0, L2_BUF_ETH_IP4_INIT, L2_BUF_IP4_INIT(IPPROTO_TCP),
		{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
	},
};

static int tcp4_l2_buf_mss;
static int tcp4_l2_buf_mss_nr_set;
static int tcp4_l2_buf_mss_tap;
static int tcp4_l2_buf_mss_tap_nr_set;

/**
 * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @vnet_len:	4-byte qemu vnet buffer length descriptor, only for passt mode
 * @eh:		Pre-filled Ethernet header
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 */
__extension__ struct tcp6_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
#endif
	uint32_t vnet_len;	/* 14				2 */
	struct ethhdr eh;	/* 18				6 */
	struct ipv6hdr ip6h;	/* 32				20 */
	struct tcphdr th;	/* 72				60 */
	uint8_t data[USHRT_MAX -
		     (sizeof(struct ipv6hdr) + sizeof(struct tcphdr))];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_TAP_FRAMES] = {
	[ 0 ... TCP_TAP_FRAMES - 1 ] = {
		{ 0 },
		0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_TCP),
		{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
	},
};

static int tcp6_l2_buf_mss;
static int tcp6_l2_buf_mss_nr_set;
static int tcp6_l2_buf_mss_tap;
static int tcp6_l2_buf_mss_tap_nr_set;

/* recvmmsg()/sendmmsg() data for tap */
static struct iovec	tcp4_l2_iov_sock	[TCP_TAP_FRAMES + 1];
static struct iovec	tcp6_l2_iov_sock	[TCP_TAP_FRAMES + 1];
static char 		tcp_buf_discard		[MAX_WINDOW];

static struct iovec	tcp4_l2_iov_tap		[TCP_TAP_FRAMES];
static struct iovec	tcp6_l2_iov_tap		[TCP_TAP_FRAMES];

static struct msghdr	tcp4_l2_mh_sock;
static struct msghdr	tcp6_l2_mh_sock;

__extension__
static struct mmsghdr	tcp_l2_mh_tap		[TCP_TAP_FRAMES] = {
	[ 0 ... TCP_TAP_FRAMES - 1 ] = {
		.msg_hdr.msg_iovlen = 1,
	},
};

/* sendmsg() to socket */
static struct iovec	tcp_tap_iov		[TAP_MSGS];

/* Bitmap, activity monitoring needed for connection via tap */
static uint8_t tcp_act[MAX_TAP_CONNS / 8] = { 0 };

/* TCP connections */
static struct tcp_tap_conn tt[MAX_TAP_CONNS];
static struct tcp_splice_conn ts[MAX_SPLICE_CONNS];

/* Table for lookup from remote address, local port, remote port */
static struct tcp_tap_conn *tt_hash[TCP_HASH_TABLE_SIZE];

/**
 * tcp_tap_state() - Set given TCP state for tap connection, report to stderr
 * @conn:	Connection pointer
 * @state:	New TCP state to be set
 */
static void tcp_tap_state(struct tcp_tap_conn *conn, enum tcp_state state)
{
	debug("TCP: socket %i: %s -> %s",
	      conn->sock, tcp_state_str[conn->state], tcp_state_str[state]);
	conn->state = state;
}

/**
 * tcp_splice_state() - Set state for spliced connection, report to stderr
 * @conn:	Connection pointer
 * @state:	New TCP state to be set
 */
static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state)
{
	debug("TCP: index %i: %s -> %s",
	      conn - ts, tcp_state_str[conn->state], tcp_state_str[state]);
	conn->state = state;
}

/**
 * tcp_update_check_ip4() - Update IPv4 with variable parts from stored one
 * @buf:	L2 packet buffer with final IPv4 header
 */
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
{
	uint32_t sum = buf->psum;

	sum += buf->iph.tot_len;
	sum += (buf->iph.saddr >> 16) & 0xffff;
	sum += buf->iph.saddr & 0xffff;

	buf->iph.check = (uint16_t)~csum_fold(sum);
}

/**
 * tcp_update_check_tcp4() - Update TCP checksum from stored one
 * @buf:	L2 packet buffer with final IPv4 header
 */
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
{
	uint16_t tlen = ntohs(buf->iph.tot_len) - 20;
	uint32_t sum = buf->tsum;

	sum += (buf->iph.saddr >> 16) & 0xffff;
	sum += buf->iph.saddr & 0xffff;
	sum += htons(ntohs(buf->iph.tot_len) - 20);

	buf->th.check = 0;
	buf->th.check = csum(&buf->th, tlen, sum);
}

/**
 * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
 * @buf:	L2 packet buffer with final IPv6 header
 */
static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
{
	int len = ntohs(buf->ip6h.payload_len) + sizeof(struct ipv6hdr);

	buf->ip6h.hop_limit = IPPROTO_TCP;
	buf->ip6h.version = 0;
	buf->ip6h.nexthdr = 0;

	buf->th.check = 0;
	buf->th.check = csum(&buf->ip6h, len, 0);

	buf->ip6h.hop_limit = 255;
	buf->ip6h.version = 6;
	buf->ip6h.nexthdr = IPPROTO_TCP;
}

/**
 * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
 * @eth_d:	Ethernet destination address, NULL if unchanged
 * @eth_s:	Ethernet source address, NULL if unchanged
 * @ip_da:	Pointer to IPv4 destination address, NULL if unchanged
 */
void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s,
		       uint32_t *ip_da)
{
	int i;

	for (i = 0; i < TCP_TAP_FRAMES; i++) {
		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];

		if (eth_d) {
			memcpy(b4->eh.h_dest, eth_d, ETH_ALEN);
			memcpy(b6->eh.h_dest, eth_d, ETH_ALEN);
		}

		if (eth_s) {
			memcpy(b4->eh.h_source, eth_s, ETH_ALEN);
			memcpy(b6->eh.h_source, eth_s, ETH_ALEN);
		}

		if (ip_da) {
			b4->iph.daddr = *ip_da;
			if (!i) {
				b4->iph.saddr = 0;
				b4->iph.tot_len = 0;
				b4->iph.check = 0;
				b4->psum = sum_16b(&b4->iph, 20);

				b4->tsum = ((*ip_da >> 16) & 0xffff) +
					   (*ip_da & 0xffff) +
					   htons(IPPROTO_TCP);
			} else {
				b4->psum = tcp4_l2_buf[0].psum;
				b4->tsum = tcp4_l2_buf[0].tsum;
			}
		}
	}
}

/**
 * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
 */
static void tcp_sock4_iov_init(void)
{
	struct iovec *iov;
	int i;

	tcp4_l2_iov_sock[0].iov_base = tcp_buf_discard;
	for (i = 0, iov = tcp4_l2_iov_sock + 1; i < TCP_TAP_FRAMES;
	     i++, iov++) {
		iov->iov_base = &tcp4_l2_buf[i].data;
		iov->iov_len = MSS_DEFAULT;
	}

	tcp4_l2_mh_sock.msg_iov = tcp4_l2_iov_sock;

	for (i = 0, iov = tcp4_l2_iov_tap; i < TCP_TAP_FRAMES; i++, iov++) {
		iov->iov_base = &tcp4_l2_buf[i].vnet_len;
		iov->iov_len = MSS_DEFAULT;
	}
}

/**
 * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
 */
static void tcp_sock6_iov_init(void)
{
	struct iovec *iov;
	int i;

	tcp6_l2_iov_sock[0].iov_base = tcp_buf_discard;
	for (i = 0, iov = tcp6_l2_iov_sock + 1; i < TCP_TAP_FRAMES;
	     i++, iov++) {
		iov->iov_base = &tcp6_l2_buf[i].data;
		iov->iov_len = MSS_DEFAULT;
	}

	tcp6_l2_mh_sock.msg_iov = tcp6_l2_iov_sock;

	for (i = 0, iov = tcp6_l2_iov_tap; i < TCP_TAP_FRAMES; i++, iov++) {
		iov->iov_base = &tcp6_l2_buf[i].vnet_len;
		iov->iov_len = MSS_DEFAULT;
	}
}

/**
 * tcp_opt_get() - Get option, and value if any, from TCP header
 * @th:		Pointer to TCP header
 * @len:	Length of buffer, including TCP header
 * @__type:	Option type to look for
 * @__optlen:	Optional, filled with option length if passed
 * @__value:	Optional, set to start of option value if passed
 *
 * Return: Option value, meaningful for up to 4 bytes, -1 if not found
 */
static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type,
		       uint8_t *__optlen, char **__value)
{
	uint8_t type, optlen;
	char *p;

	if (len > th->doff * 4)
		len = th->doff * 4;

	len -= sizeof(*th);
	p = (char *)(th + 1);

	for (; len >= 2; p += optlen, len -= optlen) {
		switch (*p) {
		case OPT_EOL:
			return -1;
		case OPT_NOP:
			optlen = 1;
			break;
		default:
			type = *(p++);
			optlen = *(p++) - 2;
			len -= 2;

			if (type != __type)
				break;

			if (__optlen)
				*__optlen = optlen;
			if (__value)
				*__value = p;

			switch (optlen) {
			case 0:
				return 0;
			case 1:
				return *p;
			case 2:
				return ntohs(*(uint16_t *)p);
			default:
				return ntohl(*(uint32_t *)p);
			}
		}
	}

	return -1;
}

/**
 * tcp_hash_match() - Check if a connection entry matches address and ports
 * @conn:	Connection entry to match against
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Remote address, pointer to sin_addr or sin6_addr
 * @tap_port:	tap-facing port
 * @sock_port:	Socket-facing port
 *
 * Return: 1 on match, 0 otherwise
 */
static int tcp_hash_match(struct tcp_tap_conn *conn, int af, void *addr,
			  in_port_t tap_port, in_port_t sock_port)
{
	if (af == AF_INET && IN6_IS_ADDR_V4MAPPED(&conn->a.a6)	&&
	    !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a))	&&
	    conn->tap_port == tap_port && conn->sock_port == sock_port)
		return 1;

	if (af == AF_INET6					&&
	    !memcmp(&conn->a.a6, addr, sizeof(conn->a.a6))	&&
	    conn->tap_port == tap_port && conn->sock_port == sock_port)
		return 1;

	return 0;
}

/**
 * tcp_hash() - Calculate hash value for connection given address and ports
 * @c:		Execution context
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Remote address, pointer to sin_addr or sin6_addr
 * @tap_port:	tap-facing port
 * @sock_port:	Socket-facing port
 *
 * Return: hash value, already modulo size of the hash table
 */
static unsigned int tcp_hash(struct ctx *c, int af, void *addr,
			     in_port_t tap_port, in_port_t sock_port)
{
	uint64_t b = 0;

	if (af == AF_INET) {
		struct {
			struct in_addr addr;
			in_port_t tap_port;
			in_port_t sock_port;
		} __attribute__((__packed__)) in = {
			.addr = *(struct in_addr *)addr,
			.tap_port = tap_port,
			.sock_port = sock_port,
		};

		b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret);
	} else if (af == AF_INET6) {
		struct {
			struct in6_addr addr;
			in_port_t tap_port;
			in_port_t sock_port;
		} __attribute__((__packed__)) in = {
			.addr = *(struct in6_addr *)addr,
			.tap_port = tap_port,
			.sock_port = sock_port,
		};

		b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
	}

	return (unsigned int)(b % TCP_HASH_TABLE_SIZE);
}

/**
 * tcp_hash_insert() - Insert connection into hash table, chain link
 * @c:		Execution context
 * @conn:	Connection pointer
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Remote address, pointer to sin_addr or sin6_addr
 */
static void tcp_hash_insert(struct ctx *c, struct tcp_tap_conn *conn,
			    int af, void *addr)
{
	int b;

	b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port);
	conn->next = tt_hash[b];
	tt_hash[b] = conn;
	conn->hash_bucket = b;

	debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p",
	      conn - tt, conn->sock, b, conn->next);
}

/**
 * tcp_hash_remove() - Drop connection from hash table, chain unlink
 * @conn:	Connection pointer
 */
static void tcp_hash_remove(struct tcp_tap_conn *conn)
{
	struct tcp_tap_conn *entry, *prev = NULL;
	int b = conn->hash_bucket;

	for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) {
		if (entry == conn) {
			if (prev)
				prev->next = conn->next;
			else
				tt_hash[b] = conn->next;
			break;
		}
	}

	debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p",
	      conn - tt, conn->sock, b, prev ? prev->next : tt_hash[b]);
}

/**
 * tcp_hash_update() - Update pointer for given connection
 * @old:	Old connection pointer
 * @new:	New connection pointer
 */
static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new)
{
	struct tcp_tap_conn *entry, *prev = NULL;
	int b = old->hash_bucket;

	for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) {
		if (entry == old) {
			if (prev)
				prev->next = new;
			else
				tt_hash[b] = new;
			break;
		}
	}

	debug("TCP: hash table update: old index %i, new index %i, sock %i, "
	      "bucket: %i, old: %p, new: %p",
	      old - tt, new - tt, new->sock, b, old, new);
}

/**
 * tcp_hash_lookup() - Look up connection given remote address and ports
 * @c:		Execution context
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Remote address, pointer to sin_addr or sin6_addr
 * @tap_port:	tap-facing port
 * @sock_port:	Socket-facing port
 *
 * Return: connection pointer, if found, -ENOENT otherwise
 */
static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr,
					    in_port_t tap_port,
					    in_port_t sock_port)
{
	int b = tcp_hash(c, af, addr, tap_port, sock_port);
	struct tcp_tap_conn *conn;

	for (conn = tt_hash[b]; conn; conn = conn->next) {
		if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
			return conn;
	}

	return NULL;
}

/**
 * tcp_tap_epoll_mask() - Set new epoll event mask given a connection
 * @c:		Execution context
 * @conn:	Connection pointer
 * @events:	New epoll event bitmap
 */
static void tcp_tap_epoll_mask(struct ctx *c, struct tcp_tap_conn *conn,
			       uint32_t events)
{
	union epoll_ref ref = { .proto = IPPROTO_TCP, .s = conn->sock,
				.tcp.index = conn - tt,
				.tcp.v6 = !IN6_IS_ADDR_V4MAPPED(&conn->a.a6) };
	struct epoll_event ev = { .data.u64 = ref.u64, .events = events };

	if (conn->events == events)
		return;

	conn->events = events;
	epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev);
}

/**
 * tcp_table_tap_compact() - Perform compaction on tap connection table
 * @c:		Execution context
 * @hole:	Pointer to recently closed connection
 */
static void tcp_table_tap_compact(struct ctx *c, struct tcp_tap_conn *hole)
{
	struct tcp_tap_conn *from, *to;
	uint32_t events;

	if ((hole - tt) == --c->tcp.tap_conn_count) {
		bitmap_clear(tcp_act, hole - tt);
		debug("TCP: hash table compaction: index %i (%p) was max index",
		      hole - tt, hole);
		return;
	}

	from = &tt[c->tcp.tap_conn_count];
	memcpy(hole, from, sizeof(*hole));
	from->state = CLOSED;

	to = hole;
	tcp_hash_update(from, to);

	events = hole->events;
	hole->events = UINT_MAX;
	tcp_tap_epoll_mask(c, hole, events);

	debug("TCP: hash table compaction: old index %i, new index %i, "
	      "sock %i, from: %p, to: %p",
	      from - tt, to - tt, from->sock, from, to);
}

/**
 * tcp_tap_destroy() - Close tap connection, drop from hash table and epoll
 * @c:		Execution context
 * @conn:	Connection pointer
 */
static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn)
{
	if (conn->state == CLOSED)
		return;

	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
	tcp_tap_state(conn, CLOSED);
	close(conn->sock);
	tcp_hash_remove(conn);
	tcp_table_tap_compact(c, conn);
}

static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn);

/**
 * tcp_send_to_tap() - Send segment to tap, with options and values from socket
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flags:	TCP flags to set
 * @in:		Payload buffer
 * @len:	Payload length
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
			   int flags, char *in, int len)
{
	uint32_t ack_offset = conn->seq_from_tap - conn->seq_ack_to_tap;
	char buf[USHRT_MAX] = { 0 }, *data;
	struct tcp_info info = { 0 };
	int ws = 0, err, ack_pending;
	socklen_t sl = sizeof(info);
	struct tcphdr *th;

	if (conn->state == ESTABLISHED && !ack_offset && !flags &&
	    conn->tcpi_snd_wnd) {
		err = 0;
		info.tcpi_bytes_acked = conn->tcpi_acked_last;
		info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
		info.tcpi_snd_wscale = conn->ws;
	} else if (conn->no_snd_wnd && !(flags & SYN)) {
		err = 0;
	} else {
		err = getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl);
		if (err && !(flags & RST)) {
			tcp_rst(c, conn);
			return err;
		}

		sl = sizeof(conn->sndbuf);
		if (getsockopt(conn->sock, SOL_SOCKET, SO_SNDBUF,
			       &conn->sndbuf, &sl))
			conn->sndbuf = USHRT_MAX;

		info.tcpi_snd_wnd = MIN(info.tcpi_snd_wnd,
					conn->sndbuf * 90 / 100);
		conn->tcpi_snd_wnd = info.tcpi_snd_wnd;
	}

	th = (struct tcphdr *)buf;
	data = (char *)(th + 1);
	th->doff = sizeof(*th) / 4;

	if ((flags & SYN) && !err) {
		/* Options: MSS, NOP and window scale if allowed (4-8 bytes) */
		*data++ = OPT_MSS;
		*data++ = OPT_MSS_LEN;
		*(uint16_t *)data = htons(info.tcpi_snd_mss);
		data += OPT_MSS_LEN - 2;
		th->doff += OPT_MSS_LEN / 4;

		/* Check if kernel includes commit:
		 *	8f7baad7f035 ("tcp: Add snd_wnd to TCP_INFO")
		 */
		conn->no_snd_wnd = !info.tcpi_snd_wnd;

		if (conn->ws_allowed && (ws = info.tcpi_snd_wscale) &&
		    !conn->no_snd_wnd) {
			*data++ = OPT_NOP;

			*data++ = OPT_WS;
			*data++ = OPT_WS_LEN;
			*data++ = ws;

			th->doff += (1 + OPT_WS_LEN) / 4;
		}

		/* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */
		th->seq = htonl(conn->seq_to_tap++);
	} else {
		th->seq = htonl(conn->seq_to_tap);
		conn->seq_to_tap += len;
	}

	if (flags & SYN) {
		ack_pending = 0;
	} else if (conn->state == ESTABLISHED || conn->no_snd_wnd) {
		ack_pending = conn->seq_from_tap != conn->seq_ack_to_tap &&
			      (conn->seq_from_tap - conn->seq_ack_to_tap) <
			      MAX_WINDOW;
	} else {
		ack_pending = info.tcpi_bytes_acked > conn->tcpi_acked_last;
	}

	if (!err && (ack_pending || (flags & ACK) || len)) {
		th->ack = 1;

		if (conn->no_snd_wnd) {
			conn->seq_ack_to_tap = conn->seq_from_tap;
		} else {
			if (conn->state == ESTABLISHED)
				conn->seq_ack_to_tap = conn->seq_from_tap;
			else
				conn->seq_ack_to_tap = info.tcpi_bytes_acked +
						       conn->seq_init_from_tap;

			conn->tcpi_acked_last = info.tcpi_bytes_acked;
		}

		/* seq_ack_to_tap matching seq_from_tap means, in these states,
		 * that we shut the writing half down, but the FIN segment
		 * wasn't acknowledged yet. We sent the FIN for sure, so adjust
		 * the sequence number in that case.
		 */
		if ((conn->state == LAST_ACK ||
		     conn->state == FIN_WAIT_1_SOCK_FIN ||
		     conn->state == FIN_WAIT_1) &&
		     conn->seq_ack_to_tap == conn->seq_from_tap)
			th->ack_seq = htonl(conn->seq_ack_to_tap + 1);
		else
			th->ack_seq = htonl(conn->seq_ack_to_tap);
	} else {
		if (!len && !flags)
			return 0;

		th->ack = th->ack_seq = 0;
	}

	th->rst = !!(flags & RST);
	th->syn = !!(flags & SYN);
	th->fin = !!(flags & FIN);

	th->source = htons(conn->sock_port);
	th->dest = htons(conn->tap_port);

	if (flags & ZERO_WINDOW) {
		th->window = 0;
	} else if (!err && !conn->no_snd_wnd) {
		/* First value sent by receiver is not scaled */
		th->window = htons(info.tcpi_snd_wnd >>
				   (th->syn ? 0 : info.tcpi_snd_wscale));
	} else {
		th->window = htons(WINDOW_DEFAULT);
	}

	if (!th->window)
		conn->tcpi_snd_wnd = 0;

	th->urg_ptr = 0;
	th->check = 0;

	memcpy(data, in, len);

	tap_ip_send(c, &conn->a.a6, IPPROTO_TCP, buf, th->doff * 4 + len,
		    conn->seq_init_to_tap);

	if (th->fin)
		conn->seq_to_tap++;

	return 0;
}

/**
 * tcp_rst() - Reset a tap connection: send RST segment to tap, close socket
 * @c:		Execution context
 * @conn:	Connection pointer
 */
static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn)
{
	if (conn->state == CLOSED)
		return;

	tcp_send_to_tap(c, conn, RST, NULL, 0);
	tcp_tap_destroy(c, conn);
}

/**
 * tcp_clamp_window() - Set window and scaling from option, clamp on socket
 * @conn:	Connection pointer
 * @th:		TCP header, from tap, can be NULL if window is passed
 * @len:	Buffer length, at L4, can be 0 if no header is passed
 * @window:	Window value, host order, unscaled, if no header is passed
 * @init:	Set if this is the very first segment from tap
 */
static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th,
			     int len, unsigned int window, int init)
{
	if (init) {
		conn->ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
		conn->ws_allowed = conn->ws >= 0 && conn->ws <= MAX_WS;
		conn->ws *= conn->ws_allowed;

		/* RFC 7323, 2.2: first value is not scaled. Also, don't clamp
		 * yet, to avoid getting a zero scale just because we set a
		 * small window now.
		 */
		conn->tap_window = ntohs(th->window);
		conn->window_clamped = 0;
	} else {
		if (th)
			window = ntohs(th->window) << conn->ws;
		else
			window <<= conn->ws;

		if (conn->window_clamped) {
			if (conn->tap_window == window)
				return;

			/* Discard +/- 1% updates to spare some syscalls. */
			if ((window > conn->tap_window &&
			     window * 99 / 100 < conn->tap_window) ||
			    (window < conn->tap_window &&
			     window * 101 / 100 > conn->tap_window)) {
				conn->tap_window = window;
				return;
			}
		}

		conn->tap_window = window;
		if (window < 256)
			window = 256;
		setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP,
			   &window, sizeof(window));
		conn->window_clamped = 1;
	}
}

/**
 * tcp_seq_init() - Calculate initial sequence number according to RFC 6528
 * @c:		Execution context
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Remote address, pointer to sin_addr or sin6_addr
 * @dstport:	Destination port, connection-wise, network order
 * @srcport:	Source port, connection-wise, network order
 * @now:	Current timestamp
 *
 * Return: initial TCP sequence
 */
static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr,
			     in_port_t dstport, in_port_t srcport,
			     struct timespec *now)
{
	uint32_t ns, seq = 0;

	if (af == AF_INET) {
		struct {
			struct in_addr src;
			in_port_t srcport;
			struct in_addr dst;
			in_port_t dstport;
		} __attribute__((__packed__)) in = {
			.src = *(struct in_addr *)addr,
			.srcport = srcport,
			.dst = { c->addr4 },
			.dstport = dstport,
		};

		seq = siphash_12b((uint8_t *)&in, c->tcp.hash_secret);
	} else if (af == AF_INET6) {
		struct {
			struct in6_addr src;
			in_port_t srcport;
			struct in6_addr dst;
			in_port_t dstport;
		} __attribute__((__packed__)) in = {
			.src = *(struct in6_addr *)addr,
			.srcport = srcport,
			.dst = c->addr6,
			.dstport = dstport,
		};

		seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
	}

	ns = now->tv_sec * 1E9;
	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */

	return seq + ns;
}

/**
 * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap
 * @c:		Execution context
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Remote address, pointer to sin_addr or sin6_addr
 * @th:		TCP header from tap
 * @len:	Packet length at L4
 * @now:	Current timestamp
 */
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
			      struct tcphdr *th, size_t len,
			      struct timespec *now)
{
	struct sockaddr_in addr4 = {
		.sin_family = AF_INET,
		.sin_port = th->dest,
		.sin_addr = *(struct in_addr *)addr,
	};
	struct sockaddr_in6 addr6 = {
		.sin6_family = AF_INET6,
		.sin6_port = th->dest,
		.sin6_addr = *(struct in6_addr *)addr,
	};
	union epoll_ref ref = { .proto = IPPROTO_TCP };
	const struct sockaddr *sa;
	struct tcp_tap_conn *conn;
	struct epoll_event ev;
	socklen_t sl;
	int s;

	if (c->tcp.tap_conn_count >= MAX_TAP_CONNS)
		return;

	ref.s = s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
	if (s < 0)
		return;

	if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4)
		addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	else if (af == AF_INET6 && !memcmp(addr, &c->gw6, sizeof(c->gw6)))
		addr6.sin6_addr = in6addr_loopback;

	if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) {
		struct sockaddr_in6 addr6_ll = {
			.sin6_family = AF_INET6,
			.sin6_addr = c->addr6_ll,
			.sin6_scope_id = if_nametoindex(c->ifn),
		};
		bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll));
	}

	conn = &tt[c->tcp.tap_conn_count++];
	conn->sock = s;

	sl = sizeof(conn->sndbuf);
	if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &conn->sndbuf, &sl))
		conn->sndbuf = USHRT_MAX;

	conn->mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
	if (conn->mss_guest < 0)
		conn->mss_guest = MSS_DEFAULT;

	if (c->mode == MODE_PASST) {
		/* Don't upset qemu */
		conn->mss_guest = MIN(USHRT_MAX -
				      sizeof(uint32_t) -
				      sizeof(struct ethhdr) -
				      sizeof(struct ipv6hdr) -
				      sizeof(struct tcphdr),
				      conn->mss_guest);
	}

	sl = sizeof(conn->mss_guest);
	setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);

	tcp_clamp_window(conn, th, len, 0, 1);

	if (af == AF_INET) {
		sa = (struct sockaddr *)&addr4;
		sl = sizeof(addr4);

		memset(&conn->a.a4.zero, 0,    sizeof(conn->a.a4.zero));
		memset(&conn->a.a4.one,  0xff, sizeof(conn->a.a4.one));
		memcpy(&conn->a.a4.a,    addr, sizeof(conn->a.a4.a));
	} else {
		sa = (struct sockaddr *)&addr6;
		sl = sizeof(addr6);

		memcpy(&conn->a.a6,      addr, sizeof(conn->a.a6));
	}

	conn->sock_port = ntohs(th->dest);
	conn->tap_port = ntohs(th->source);

	conn->ts_sock = conn->ts_tap = conn->ts_ack_tap = *now;

	bitmap_set(tcp_act, conn - tt);

	conn->seq_init_from_tap = ntohl(th->seq);
	conn->seq_from_tap = conn->seq_init_from_tap + 1;
	conn->seq_ack_to_tap = conn->seq_from_tap;

	conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
	conn->seq_init_to_tap = conn->seq_to_tap;
	conn->seq_ack_from_tap = conn->seq_to_tap + 1;

	tcp_hash_insert(c, conn, af, addr);

	if (connect(s, sa, sl)) {
		tcp_tap_state(conn, TAP_SYN_SENT);

		if (errno != EINPROGRESS) {
			tcp_rst(c, conn);
			return;
		}

		ev.events = EPOLLOUT | EPOLLRDHUP;
	} else {
		tcp_tap_state(conn, TAP_SYN_RCVD);

		if (tcp_send_to_tap(c, conn, SYN | ACK, NULL, 0))
			return;

		ev.events = EPOLLIN | EPOLLRDHUP;
	}

	conn->events = ev.events;
	ref.tcp.index = conn - tt;
	ev.data.u64 = ref.u64;
	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev);
}

/**
 * tcp_table_splice_compact - Compact spliced connection table
 * @c:		Execution context
 * @hole:	Pointer to recently closed connection
 */
static void tcp_table_splice_compact(struct ctx *c,
				     struct tcp_splice_conn *hole)
{
	union epoll_ref ref_from = { .proto = IPPROTO_TCP, .tcp.splice = 1,
				     .tcp.index = hole - ts };
	union epoll_ref ref_to = { .proto = IPPROTO_TCP, .tcp.splice = 1,
				   .tcp.index = hole - ts };
	struct tcp_splice_conn *move;
	struct epoll_event ev_from;
	struct epoll_event ev_to;

	if ((hole - ts) == --c->tcp.splice_conn_count)
		return;

	move = &ts[c->tcp.splice_conn_count];
	memcpy(hole, move, sizeof(*hole));
	move->state = CLOSED;
	move = hole;

	ref_from.s = move->from;
	ref_from.tcp.v6 = move->v6;
	ref_to.s = move->to;
	ref_to.tcp.v6 = move->v6;

	if (move->state == SPLICE_ACCEPTED) {
		ev_from.events = ev_to.events = 0;
	} else if (move->state == SPLICE_CONNECT) {
		ev_from.events = EPOLLET | EPOLLRDHUP;
		ev_to.events = EPOLLET | EPOLLOUT | EPOLLRDHUP;
	} else {
		ev_from.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
		ev_to.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
	}

	ev_from.data.u64 = ref_from.u64;
	ev_to.data.u64 = ref_to.u64;

	epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->from, &ev_from);
	epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->to, &ev_to);
}

/**
 * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll
 * @c:		Execution context
 * @conn:	Connection pointer
 */
static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
{
	switch (conn->state) {
	case SPLICE_FIN_BOTH:
	case SPLICE_FIN_FROM:
	case SPLICE_FIN_TO:
	case SPLICE_ESTABLISHED:
		if (conn->pipe_from_to[0] != -1) {
			close(conn->pipe_from_to[0]);
			conn->pipe_from_to[0] = -1;
			close(conn->pipe_from_to[1]);
			conn->pipe_from_to[1] = -1;
		}
		if (conn->pipe_to_from[0] != -1) {
			close(conn->pipe_to_from[0]);
			conn->pipe_to_from[0] = -1;
			close(conn->pipe_to_from[1]);
			conn->pipe_to_from[1] = -1;
		}
		/* Falls through */
	case SPLICE_CONNECT:
		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL);
		epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL);
		close(conn->to);
		/* Falls through */
	case SPLICE_ACCEPTED:
		close(conn->from);
		tcp_splice_state(conn, CLOSED);
		tcp_table_splice_compact(c, conn);
		conn->from_fin_sent = conn->to_fin_sent = 0;
		conn->from_read = conn->from_written = 0;
		conn->to_read = conn->to_written = 0;
		return;
	default:
		return;
	}
}

/**
 * tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence
 * @conn:	Connection pointer
 * @ack_seq:	ACK sequence, host order
 */
static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
{
	uint32_t to_ack;

	/* Implicitly take care of wrap-arounds */
	to_ack = ack_seq - conn->seq_ack_from_tap;

	/* Simply ignore out-of-order ACKs: we already consumed the data we
	 * needed from the buffer, and we won't rewind back to a lower ACK
	 * sequence.
	 */

	if (to_ack > MAX_WINDOW)
		return;

	if (to_ack)
		recv(conn->sock, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC);

	conn->seq_ack_from_tap = ack_seq;
}

/**
 * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
 * @c:		Execution context
 * @conn:	Connection pointer
 * @now:	Current timestamp
 *
 * Return: negative on connection reset, 0 otherwise
 */
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
			      struct timespec *now)
{
	int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set;
	int mss_tap, fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
	int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6);
	uint32_t seq_to_tap = conn->seq_to_tap;
	socklen_t sl = sizeof(struct tcp_info);
	int s = conn->sock, i, ret = 0;
	struct iovec *iov, *iov_tap;
	uint32_t already_sent;
	struct tcp_info info;
	struct mmsghdr *mh;

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (already_sent > MAX_WINDOW) {
		/* RFC 761, section 2.1. */
		seq_to_tap = conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
	}

	if (!conn->tap_window || already_sent >= conn->tap_window) {
		tcp_tap_epoll_mask(c, conn, conn->events | EPOLLET);
		return 0;
	}

	fill_bufs = DIV_ROUND_UP(conn->tap_window - already_sent,
				 conn->mss_guest);
	if (fill_bufs > TCP_TAP_FRAMES) {
		fill_bufs = TCP_TAP_FRAMES;
		iov_rem = 0;
	} else {
		iov_rem = (conn->tap_window - already_sent) % conn->mss_guest;
	}

	/* Adjust iovec length for recvmsg() based on what was set last time. */
	if (v4) {
		iov = tcp4_l2_iov_sock + 1;
		buf_mss = &tcp4_l2_buf_mss;
		buf_mss_nr_set = &tcp4_l2_buf_mss_nr_set;
	} else {
		iov = tcp6_l2_iov_sock + 1;
		buf_mss = &tcp6_l2_buf_mss;
		buf_mss_nr_set = &tcp6_l2_buf_mss_nr_set;
	}
	if (*buf_mss != conn->mss_guest)
		*buf_mss_nr_set = 0;
	for (i = *buf_mss_nr_set; i < fill_bufs; i++)
		iov[i].iov_len = conn->mss_guest;
	*buf_mss = conn->mss_guest;
	*buf_mss_nr_set = fill_bufs - 1;

	/* First buffer is to discard data, last one may be partially filled. */
	iov[-1].iov_len = already_sent;
	if (iov_rem)
		iov[fill_bufs - 1].iov_len = iov_rem;
	if (v4)
		tcp4_l2_mh_sock.msg_iovlen = fill_bufs + 1;
	else
		tcp6_l2_mh_sock.msg_iovlen = fill_bufs + 1;

	/* Don't dequeue until acknowledged by guest. */
recvmmsg:
	len = recvmsg(s, v4 ? &tcp4_l2_mh_sock : &tcp6_l2_mh_sock, MSG_PEEK);
	if (len < 0) {
		if (errno == EINTR)
			goto recvmmsg;
		goto err;
	}

	if (!len)
		goto zero_len;

	send = len - already_sent;
	if (send <= 0) {
		tcp_tap_epoll_mask(c, conn, conn->events | EPOLLET);
		goto out_restore_iov;
	}

	tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET);

	send_bufs = DIV_ROUND_UP(send, conn->mss_guest);
	last_len = send - (send_bufs - 1) * conn->mss_guest;

	/* Adjust iovec length for sending based on what was set last time. */
	if (v4) {
		mss_tap = conn->mss_guest +
			  offsetof(struct tcp4_l2_buf_t, data) -
			  offsetof(struct tcp4_l2_buf_t, vnet_len);

		iov_tap = tcp4_l2_iov_tap;
		buf_mss_tap = &tcp4_l2_buf_mss_tap;
		buf_mss_tap_nr_set = &tcp4_l2_buf_mss_tap_nr_set;
	} else {
		mss_tap = conn->mss_guest +
			  offsetof(struct tcp6_l2_buf_t, data) -
			  offsetof(struct tcp6_l2_buf_t, vnet_len);

		iov_tap = tcp6_l2_iov_tap;
		buf_mss_tap = &tcp6_l2_buf_mss_tap;
		buf_mss_tap_nr_set = &tcp6_l2_buf_mss_tap_nr_set;
	}
	if (*buf_mss_tap != mss_tap)
		*buf_mss_tap_nr_set = 0;
	for (i = *buf_mss_tap_nr_set; i < send_bufs; i++)
		iov_tap[i].iov_len = mss_tap;
	*buf_mss_tap = mss_tap;
	*buf_mss_tap_nr_set = send_bufs;

	iov_tap[send_bufs - 1].iov_len = mss_tap - conn->mss_guest + last_len;

	/* Likely, some new data was acked too. */
	if (conn->seq_from_tap != conn->seq_ack_to_tap || !conn->tcpi_snd_wnd) {
		if (conn->no_snd_wnd) {
			conn->seq_ack_to_tap = conn->seq_from_tap;
		} else {
			if (getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info,
				       &sl))
				goto err;

			if (getsockopt(conn->sock, SOL_SOCKET,
				       SO_SNDBUF, &conn->sndbuf, &sl))
				conn->sndbuf = USHRT_MAX;

			info.tcpi_snd_wnd = MIN(info.tcpi_snd_wnd,
						conn->sndbuf * 90 / 100);

			if (conn->state == ESTABLISHED)
				conn->seq_ack_to_tap = conn->seq_from_tap;
			else
				conn->seq_ack_to_tap = info.tcpi_bytes_acked +
						       conn->seq_init_from_tap;

			conn->tcpi_acked_last = info.tcpi_bytes_acked;
		}
	} else {
		info.tcpi_snd_wscale = conn->ws;
		info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
	}

	plen = conn->mss_guest;
	for (i = 0, mh = tcp_l2_mh_tap; i < send_bufs; i++, mh++) {
		int ip_len;

		if (i == send_bufs - 1)
			plen = last_len;

		if (v4) {
			struct tcp4_l2_buf_t *b = &tcp4_l2_buf[i];

			ip_len = plen + sizeof(struct iphdr) +
				 sizeof(struct tcphdr);

			b->iph.tot_len = htons(ip_len);
			b->iph.saddr = conn->a.a4.a.s_addr;
			b->iph.daddr = c->addr4_seen;
			if (!i || i == send_bufs - 1)
				tcp_update_check_ip4(b);
			else
				b->iph.check = tcp4_l2_buf[0].iph.check;

			b->th.source = htons(conn->sock_port);
			b->th.dest = htons(conn->tap_port);
			b->th.seq = htonl(seq_to_tap);
			b->th.ack_seq = htonl(conn->seq_ack_to_tap);

			if (conn->no_snd_wnd) {
				b->th.window = htons(WINDOW_DEFAULT);
			} else {
				b->th.window = htons(info.tcpi_snd_wnd >>
						     info.tcpi_snd_wscale);
				conn->tcpi_snd_wnd = info.tcpi_snd_wnd;
			}

			tcp_update_check_tcp4(b);

			if (c->mode == MODE_PASTA) {
				ip_len += sizeof(struct ethhdr);
				write(c->fd_tap, &b->eh, ip_len);
				pcap((char *)&b->eh, ip_len);

				conn->seq_to_tap += plen;
				continue;
			}

			b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);

			mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
		} else {
			struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i];
			uint32_t flow = conn->seq_init_to_tap;

			ip_len = plen + sizeof(struct ipv6hdr) +
				 sizeof(struct tcphdr);

			b->ip6h.payload_len = htons(plen +
						    sizeof(struct tcphdr));
			b->ip6h.saddr = conn->a.a6;
			if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
				b->ip6h.daddr = c->addr6_ll_seen;
			else
				b->ip6h.daddr = c->addr6_seen;

			b->th.source = htons(conn->sock_port);
			b->th.dest = htons(conn->tap_port);
			b->th.seq = htonl(seq_to_tap);
			b->th.ack_seq = htonl(conn->seq_ack_to_tap);

			if (conn->no_snd_wnd) {
				b->th.window = htons(WINDOW_DEFAULT);
			} else {
				b->th.window = htons(info.tcpi_snd_wnd >>
						     info.tcpi_snd_wscale);
				conn->tcpi_snd_wnd = info.tcpi_snd_wnd;
			}

			memset(b->ip6h.flow_lbl, 0, 3);
			tcp_update_check_tcp6(b);

			b->ip6h.flow_lbl[0] = (flow >> 16) & 0xf;
			b->ip6h.flow_lbl[1] = (flow >> 8) & 0xff;
			b->ip6h.flow_lbl[2] = (flow >> 0) & 0xff;

			if (c->mode == MODE_PASTA) {
				ip_len += sizeof(struct ethhdr);
				write(c->fd_tap, &b->eh, ip_len);
				pcap((char *)&b->eh, ip_len);

				conn->seq_to_tap += plen;
				continue;
			}

			b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);

			mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
		}

		seq_to_tap += plen;
	}

	if (c->mode == MODE_PASTA)
		goto out;

sendmmsg:
	ret = sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap,
		       MSG_NOSIGNAL | MSG_DONTWAIT);
	if (ret < 0 && errno == EINTR)
		goto sendmmsg;

	if (ret <= 0)
		goto out;

	conn->seq_to_tap += conn->mss_guest * (ret - 1) + last_len;

	/* sendmmsg() indicates how many messages were sent at least partially.
	 * Kernel commit 3023898b7d4a ("sock: fix sendmmsg for partial sendmsg")
	 * gives us the guarantee that at most one message, namely the last sent
	 * one, might have been sent partially. Check how many bytes of that
	 * message were sent, and re-send any missing bytes with a blocking
	 * sendmsg(), otherwise qemu will fail to parse any subsequent message.
	 */
	mh = &tcp_l2_mh_tap[ret - 1];
	if (mh->msg_len < mh->msg_hdr.msg_iov->iov_len) {
		uint8_t **iov_base = (uint8_t **)&mh->msg_hdr.msg_iov->iov_base;
		int part_sent = mh->msg_len;

		mh->msg_hdr.msg_iov->iov_len -= part_sent;
		*iov_base += part_sent;

		sendmsg(c->fd_tap, &mh->msg_hdr, MSG_NOSIGNAL);

		mh->msg_hdr.msg_iov->iov_len += part_sent;
		*iov_base -= part_sent;
	}

	pcapmm(tcp_l2_mh_tap, ret);

	goto out;

err:
	if (errno != EAGAIN && errno != EWOULDBLOCK) {
		tcp_rst(c, conn);
		ret = -errno;
	}
	goto out_restore_iov;

zero_len:
	if (conn->state == ESTABLISHED_SOCK_FIN) {
		uint8_t probe;

		if (!recv(conn->sock, &probe, 1, MSG_PEEK)) {
			tcp_tap_epoll_mask(c, conn, EPOLLET);
			tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
			tcp_tap_state(conn, ESTABLISHED_SOCK_FIN_SENT);
		}
	}

out:
	conn->ts_sock = *now;

out_restore_iov:
	if (iov_rem)
		iov[fill_bufs - 1].iov_len = conn->mss_guest;
	if (send_bufs)
		iov_tap[send_bufs - 1].iov_len = mss_tap;

	return ret;
}

/**
 * tcp_data_from_tap() - tap data in ESTABLISHED{,SOCK_FIN}, CLOSE_WAIT states
 * @c:		Execution context
 * @conn:	Connection pointer
 * @msg:	Array of messages from tap
 * @count:	Count of messages
 * @now:	Current timestamp
 */
static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
			      struct tap_msg *msg, int count,
			      struct timespec *now)
{
	int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1;
	struct msghdr mh = { .msg_iov = tcp_tap_iov };
	uint32_t max_ack_seq = conn->seq_ack_from_tap;
	uint32_t seq_from_tap = conn->seq_from_tap;
	uint16_t max_ack_seq_wnd = WINDOW_DEFAULT;
	ssize_t len;

	for (i = 0, iov_i = 0; i < count; i++) {
		struct tcphdr *th = (struct tcphdr *)msg[i].l4h;
		uint32_t seq, seq_offset, ack_seq;
		size_t len = msg[i].l4_len, off;
		char *data;

		if (len < sizeof(*th)) {
			tcp_rst(c, conn);
			return;
		}

		off = th->doff * 4;
		if (off < sizeof(*th) || off > len) {
			tcp_rst(c, conn);
			return;
		}

		if (th->rst) {
			tcp_tap_destroy(c, conn);
			return;
		}

		len -= off;
		data = (char *)th + off;

		seq = ntohl(th->seq);
		ack_seq = ntohl(th->ack_seq);
		if (!i) {
			if (count == 1)
				max_ack_seq_wnd = ntohs(th->window);
			else
				max_ack_seq_wnd = ntohs(th->window) - 1;
		}

		if (th->ack) {
			ack = 1;
			if (ack_seq - conn->seq_ack_from_tap < MAX_WINDOW &&
			    ack_seq - max_ack_seq < MAX_WINDOW) {

				/* Fast re-transmit */
				retr = !len && !th->fin &&
				       ack_seq == max_ack_seq &&
				       max_ack_seq_wnd == ntohs(th->window);

				max_ack_seq_wnd = ntohs(th->window);
				max_ack_seq = ack_seq;
			}
		}

		if (th->fin)
			fin = 1;

		if (!len)
			continue;

		seq_offset = seq_from_tap - seq;
		/* Use data from this buffer only in these two cases:
		 *
		 *      , seq_from_tap           , seq_from_tap
		 * |--------| <-- len            |--------| <-- len
		 * '----' <-- offset             ' <-- offset
		 * ^ seq                         ^ seq
		 *    (offset >= 0, seq + len > seq_from_tap)
		 *
		 * discard in these two cases:
		 *          , seq_from_tap                , seq_from_tap
		 * |--------| <-- len            |--------| <-- len
		 * '--------' <-- offset            '-----| <- offset
		 * ^ seq                            ^ seq
		 *    (offset >= 0 i.e. < MAX_WINDOW, seq + len <= seq_from_tap)
		 *
		 * keep, look for another buffer, then go back, in this case:
		 *      , seq_from_tap
		 *          |--------| <-- len
		 *      '===' <-- offset
		 *          ^ seq
		 *    (offset < 0 i.e. > MAX_WINDOW)
		 */
		if (seq_offset < MAX_WINDOW && seq + len <= seq_from_tap)
			continue;

		if (seq_offset > MAX_WINDOW) {
			if (keep == -1)
				keep = i;
			continue;
		}

		tcp_tap_iov[iov_i].iov_base = data + seq_offset;
		tcp_tap_iov[iov_i].iov_len = len - seq_offset;
		seq_from_tap += tcp_tap_iov[iov_i].iov_len;
		iov_i++;

		if (keep == i)
			keep = -1;

		if (keep != -1)
			i = keep - 1;
	}

	tcp_clamp_window(conn, NULL, 0, max_ack_seq_wnd, 0);

	if (ack) {
		conn->ts_ack_tap = *now;
		tcp_sock_consume(conn, max_ack_seq);
	}

	if (retr) {
		conn->seq_to_tap = max_ack_seq;
		tcp_data_from_sock(c, conn, now);
		return;
	}

	if (!iov_i)
		goto out;

	mh.msg_iovlen = iov_i;
eintr:
	len = sendmsg(conn->sock, &mh, MSG_DONTWAIT | MSG_NOSIGNAL);
	if (len < 0) {
		if (errno == EINTR)
			goto eintr;

		if (errno == EAGAIN || errno == EWOULDBLOCK) {
			tcp_send_to_tap(c, conn, ZERO_WINDOW, NULL, 0);
			return;
		}
		tcp_rst(c, conn);
		return;
	}

	if (len < (seq_from_tap - conn->seq_from_tap)) {
		conn->seq_from_tap += len;
		tcp_send_to_tap(c, conn, ZERO_WINDOW, NULL, 0);
	} else {
		conn->seq_from_tap += len;
	}

out:
	if (keep != -1) {
		tcp_send_to_tap(c, conn, ACK, NULL, 0);
		tcp_send_to_tap(c, conn, ACK, NULL, 0);
		return;
	}

	if (ack) {
		if (conn->state == ESTABLISHED_SOCK_FIN_SENT &&
		    conn->seq_ack_from_tap == conn->seq_to_tap)
			tcp_tap_state(conn, CLOSE_WAIT);
	}

	if (!fin) {
		tcp_send_to_tap(c, conn, 0, NULL, 0);
		return;
	}

	if (conn->state == ESTABLISHED) {
		shutdown(conn->sock, SHUT_WR);
		tcp_tap_state(conn, FIN_WAIT_1);
		tcp_send_to_tap(c, conn, ACK, NULL, 0);
	} else if (conn->state == CLOSE_WAIT) {
		shutdown(conn->sock, SHUT_WR);
		tcp_tap_state(conn, LAST_ACK);
		tcp_send_to_tap(c, conn, ACK, NULL, 0);
	}
}

/**
 * tcp_tap_handler() - Handle packets from tap and state transitions
 * @c:		Execution context
 * @af:		Address family, AF_INET or AF_INET6
 * @addr:	Destination address
 * @msg:	Input messages
 * @count:	Message count
 * @now:	Current timestamp
 *
 * Return: count of consumed packets
 */
int tcp_tap_handler(struct ctx *c, int af, void *addr,
		    struct tap_msg *msg, int count, struct timespec *now)
{
	struct tcphdr *th = (struct tcphdr *)msg[0].l4h;
	size_t len = msg[0].l4_len, off;
	struct tcp_tap_conn *conn;
	int ws;

	if (len < sizeof(*th))
		return 1;

	off = th->doff * 4;
	if (off < sizeof(*th) || off > len)
		return 1;

	conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
	if (!conn) {
		if (th->syn && !th->ack)
			tcp_conn_from_tap(c, af, addr, th, len, now);
		return 1;
	}

	if (th->rst) {
		tcp_tap_destroy(c, conn);
		return 1;
	}

	conn->ts_tap = *now;

	switch (conn->state) {
	case SOCK_SYN_SENT:
		if (!th->syn || !th->ack) {
			tcp_rst(c, conn);
			return 1;
		}

		tcp_clamp_window(conn, th, len, 0, 1);

		conn->mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL);
		if (conn->mss_guest < 0)
			conn->mss_guest = MSS_DEFAULT;

		if (c->mode == MODE_PASST) {
			/* Don't upset qemu */
			conn->mss_guest = MIN(USHRT_MAX -
					      sizeof(uint32_t) -
					      sizeof(struct ethhdr) -
					      sizeof(struct ipv6hdr) -
					      sizeof(struct tcphdr),
					      conn->mss_guest);
		}

		ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
		if (ws > MAX_WS) {
			if (tcp_send_to_tap(c, conn, RST, NULL, 0))
				return 1;

			conn->seq_to_tap = 0;
			conn->ws_allowed = 0;
			tcp_send_to_tap(c, conn, SYN, NULL, 0);
			return 1;
		}

		/* info.tcpi_bytes_acked already includes one byte for SYN, but
		 * not for incoming connections.
		 */
		conn->seq_init_from_tap = ntohl(th->seq) + 1;
		conn->seq_from_tap = conn->seq_init_from_tap;
		conn->seq_ack_to_tap = conn->seq_from_tap;

		tcp_tap_state(conn, ESTABLISHED);

		/* The client might have sent data already, which we didn't
		 * dequeue waiting for SYN,ACK from tap -- check now.
		 */
		tcp_data_from_sock(c, conn, now);
		tcp_send_to_tap(c, conn, 0, NULL, 0);

		tcp_tap_epoll_mask(c, conn, EPOLLIN | EPOLLRDHUP);
		break;
	case TAP_SYN_RCVD:
		if (th->fin) {
			shutdown(conn->sock, SHUT_WR);
			tcp_send_to_tap(c, conn, ACK, NULL, 0);
			tcp_tap_state(conn, FIN_WAIT_1);
			break;
		}

		if (!th->ack) {
			tcp_rst(c, conn);
			return 1;
		}

		tcp_clamp_window(conn, th, len, 0, 0);

		tcp_tap_state(conn, ESTABLISHED);
		break;
	case ESTABLISHED:
	case ESTABLISHED_SOCK_FIN:
	case ESTABLISHED_SOCK_FIN_SENT:
	case CLOSE_WAIT:
	case FIN_WAIT_1_SOCK_FIN:
	case FIN_WAIT_1:
		tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET);
		tcp_data_from_tap(c, conn, msg, count, now);
		return count;
	case TAP_SYN_SENT:
	case LAST_ACK:
	case SPLICE_ACCEPTED:
	case SPLICE_CONNECT:
	case SPLICE_ESTABLISHED:
	case SPLICE_FIN_FROM:
	case SPLICE_FIN_TO:
	case SPLICE_FIN_BOTH:
	case CLOSED:	/* ;) */
		break;
	}

	return 1;
}

/**
 * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event
 * @c:		Execution context
 * @s:		File descriptor number for socket
 */
static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
{
	socklen_t sl;
	int so;

	sl = sizeof(so);
	if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) {
		tcp_rst(c, conn);
		return;
	}

	if (tcp_send_to_tap(c, conn, SYN | ACK, NULL, 0))
		return;

	/* Drop EPOLLOUT, only used to wait for connect() to complete */
	tcp_tap_epoll_mask(c, conn, EPOLLIN | EPOLLRDHUP);

	tcp_tap_state(conn, TAP_SYN_RCVD);
}

/**
 * tcp_splice_connect_finish() - Completion of connect() or call on success
 * @c:		Execution context
 * @conn:	Connection pointer
 * @v6:		Set on IPv6 connection
 */
static void tcp_splice_connect_finish(struct ctx *c,
				      struct tcp_splice_conn *conn, int v6)
{
	union epoll_ref ref_from = { .proto = IPPROTO_TCP, .s = conn->from,
				      .tcp = { .splice = 1, .v6 = v6,
					       .index = conn - ts } };
	union epoll_ref ref_to = { .proto = IPPROTO_TCP, .s = conn->to,
				   .tcp = { .splice = 1, .v6 = v6,
					    .index = conn - ts } };
	struct epoll_event ev_from, ev_to;

	if (conn->state == SPLICE_CONNECT) {
		socklen_t sl;
		int so;

		sl = sizeof(so);
		if (getsockopt(conn->to, SOL_SOCKET, SO_ERROR, &so, &sl) ||
		    so) {
			tcp_splice_destroy(c, conn);
			return;
		}
	}

	conn->pipe_from_to[0] = conn->pipe_to_from[0] = -1;
	if (pipe2(conn->pipe_to_from, O_NONBLOCK) ||
	    pipe2(conn->pipe_from_to, O_NONBLOCK)) {
		tcp_splice_destroy(c, conn);
		return;
	}

	fcntl(conn->pipe_from_to[0], F_SETPIPE_SZ, PIPE_SIZE);
	fcntl(conn->pipe_to_from[0], F_SETPIPE_SZ, PIPE_SIZE);

	if (conn->state == SPLICE_CONNECT) {
		tcp_splice_state(conn, SPLICE_ESTABLISHED);

		ev_from.events = ev_to.events = EPOLLIN | EPOLLRDHUP;
		ev_from.data.u64 = ref_from.u64;
		ev_to.data.u64 = ref_to.u64;

		epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->from, &ev_from);
		epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->to, &ev_to);
	}
}

/**
 * tcp_splice_connect() - Create and connect socket for new spliced connection
 * @c:		Execution context
 * @conn:	Connection pointer
 * @v6:		Set on IPv6 connection
 * @port:	Destination port, host order
 *
 * Return: 0 for connect() succeeded or in progress, negative value on error
 */
static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn,
			      int v6, in_port_t port)
{
	int sock_conn = socket(v6 ? AF_INET6 : AF_INET,
			       SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
	union epoll_ref ref_accept = { .proto = IPPROTO_TCP, .s = conn->from,
				       .tcp = { .splice = 1, .v6 = v6,
						.index = conn - ts } };
	union epoll_ref ref_conn = { .proto = IPPROTO_TCP, .s = sock_conn,
				     .tcp = { .splice = 1, .v6 = v6,
					      .index = conn - ts } };
	struct epoll_event ev_accept = { .events = EPOLLET,
				       .data.u64 = ref_accept.u64 };
	struct epoll_event ev_conn = { .events = EPOLLET,
				       .data.u64 = ref_conn.u64 };
	struct sockaddr_in6 addr6 = {
		.sin6_family = AF_INET6,
		.sin6_port = htons(port),
		.sin6_addr = IN6ADDR_LOOPBACK_INIT,
	};
	struct sockaddr_in addr4 = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
	};
	const struct sockaddr *sa;
	int ret, one = 1;
	socklen_t sl;

	if (sock_conn < 0)
		return -errno;

	conn->to = sock_conn;

	setsockopt(conn->from, SOL_TCP, TCP_CORK,    &one, sizeof(one));
	setsockopt(conn->from, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
	setsockopt(conn->to,   SOL_TCP, TCP_CORK,    &one, sizeof(one));
	setsockopt(conn->to,   SOL_TCP, TCP_NODELAY, &one, sizeof(one));

	if (v6) {
		sa = (struct sockaddr *)&addr6;
		sl = sizeof(addr6);
	} else {
		sa = (struct sockaddr *)&addr4;
		sl = sizeof(addr4);
	}

	if (connect(conn->to, sa, sl)) {
		if (errno != EINPROGRESS) {
			ret = -errno;
			close(sock_conn);
			return ret;
		}

		tcp_splice_state(conn, SPLICE_CONNECT);
		ev_conn.events |= EPOLLOUT;
	} else {
		tcp_splice_state(conn, SPLICE_ESTABLISHED);
		tcp_splice_connect_finish(c, conn, v6);

		ev_conn.events |= EPOLLIN;
		ev_accept.events |= EPOLLIN;
	}

	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_accept);
	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->to, &ev_conn);

	return 0;
}

/**
 * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
 * @c:		Execution context
 * @conn:	Accepted inbound connection
 * @v6:		Set for inbound IPv6 connection
 * @port:	Destination port, host order
 * @ret:	Return value of tcp_splice_connect_ns()
 */
struct tcp_splice_connect_ns_arg {
	struct ctx *c;
	struct tcp_splice_conn *conn;
	int v6;
	in_port_t port;
	int ret;
};

/**
 * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
 * @arg:	See struct tcp_splice_connect_ns_arg
 *
 * Return: 0
 */
static int tcp_splice_connect_ns(void *arg)
{
	struct tcp_splice_connect_ns_arg *a;

	a = (struct tcp_splice_connect_ns_arg *)arg;
	ns_enter(a->c->pasta_pid);
	a->ret = tcp_splice_connect(a->c, a->conn, a->v6, a->port);
	return 0;
}

/**
 * tcp_splice_new() - Handle new inbound, spliced connection
 * @c:		Execution context
 * @conn:	Connection pointer
 * @v6:		Set for IPv6 connection
 * @port:	Destination port, host order
 *
 * Return: return code from connect()
 */
static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn,
			  int v6, in_port_t port)
{
	struct tcp_splice_connect_ns_arg ns_arg = { c, conn, v6, port, 0 };

	if (bitmap_isset(c->tcp.port_to_tap, port)) {
		NS_CALL(tcp_splice_connect_ns, &ns_arg);
		return ns_arg.ret;
	}

	return tcp_splice_connect(c, conn, v6, port);
}

/**
 * tcp_conn_from_sock() - Handle new connection request from listening socket
 * @c:		Execution context
 * @ref:	epoll reference of listening socket
 * @now:	Current timestamp
 */
static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
			       struct timespec *now)
{
	union epoll_ref ref_conn = { .proto = IPPROTO_TCP,
				     .tcp.v6 = ref.tcp.v6 };
	struct sockaddr_storage sa;
	struct tcp_tap_conn *conn;
	struct epoll_event ev;
	socklen_t sl;
	int s;

	if (c->tcp.tap_conn_count >= MAX_TAP_CONNS)
		return;

	sl = sizeof(sa);
	s = accept4(ref.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
	if (s < 0)
		return;

	conn = &tt[c->tcp.tap_conn_count++];
	ref_conn.tcp.index = conn - tt;
	ref_conn.s = conn->sock = s;

	sl = sizeof(conn->sndbuf);
	if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &conn->sndbuf, &sl))
		conn->sndbuf = USHRT_MAX;

	if (ref.tcp.v6) {
		struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa;

		if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr) ||
		    !memcmp(&sa6->sin6_addr, &c->addr6_seen, sizeof(c->gw6)) ||
		    !memcmp(&sa6->sin6_addr, &c->addr6, sizeof(c->gw6)))
			memcpy(&sa6->sin6_addr, &c->gw6, sizeof(c->gw6));

		memcpy(&conn->a.a6, &sa6->sin6_addr, sizeof(conn->a.a6));

		conn->sock_port = ntohs(sa6->sin6_port);
		conn->tap_port = ref.tcp.index;

		conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6->sin6_addr,
						conn->sock_port,
						conn->tap_port,
						now);
		conn->seq_init_to_tap = conn->seq_to_tap;

		tcp_hash_insert(c, conn, AF_INET6, &sa6->sin6_addr);
	} else {
		struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa;
		in_addr_t s_addr = ntohl(sa4->sin_addr.s_addr);

		memset(&conn->a.a4.zero,   0, sizeof(conn->a.a4.zero));
		memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));

		if (s_addr >> IN_CLASSA_NSHIFT == IN_LOOPBACKNET ||
		    s_addr == INADDR_ANY || s_addr == htonl(c->addr4_seen))
			sa4->sin_addr.s_addr = c->gw4;

		memcpy(&conn->a.a4.a, &sa4->sin_addr, sizeof(conn->a.a4.a));

		conn->sock_port = ntohs(sa4->sin_port);
		conn->tap_port = ref.tcp.index;

		conn->seq_to_tap = tcp_seq_init(c, AF_INET, &sa4->sin_addr,
						conn->sock_port,
						conn->tap_port,
						now);
		conn->seq_init_to_tap = conn->seq_to_tap;

		tcp_hash_insert(c, conn, AF_INET, &sa4->sin_addr);
	}

	conn->seq_ack_from_tap = conn->seq_to_tap + 1;

	conn->tap_window = WINDOW_DEFAULT;
	conn->ws_allowed = 1;

	conn->ts_sock = conn->ts_tap = conn->ts_ack_tap = *now;

	bitmap_set(tcp_act, conn - tt);

	conn->events = ev.events = EPOLLRDHUP;
	ev.data.u64 = ref_conn.u64;
	epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->sock, &ev);

	tcp_tap_state(conn, SOCK_SYN_SENT);
	tcp_send_to_tap(c, conn, SYN, NULL, 0);
}

/**
 * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 */
void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
			     uint32_t events)
{
	int move_from, move_to, *pipes, eof;
	struct tcp_splice_conn *conn;
	struct epoll_event ev;

	if (ref.tcp.listen) {
		int s;

		if (c->tcp.splice_conn_count >= MAX_SPLICE_CONNS)
			return;

		if ((s = accept4(ref.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
			return;

		conn = &ts[c->tcp.splice_conn_count++];
		conn->from = s;
		tcp_splice_state(conn, SPLICE_ACCEPTED);

		if (tcp_splice_new(c, conn, ref.tcp.v6, ref.tcp.index))
			tcp_splice_destroy(c, conn);

		return;
	}

	conn = &ts[ref.tcp.index];

	if (events & EPOLLERR)
		goto close;

	if (conn->state == SPLICE_CONNECT && (events & EPOLLHUP))
		goto close;

	if (events & EPOLLOUT) {
		struct epoll_event ev = {
			.events = EPOLLIN | EPOLLRDHUP,
			.data.u64 = ref.u64,
		};

		if (conn->state == SPLICE_CONNECT)
			tcp_splice_connect_finish(c, conn, ref.tcp.v6);

		if (conn->state == SPLICE_ESTABLISHED)
			epoll_ctl(c->epollfd, EPOLL_CTL_MOD, ref.s, &ev);

		move_to = ref.s;
		if (ref.s == conn->to) {
			move_from = conn->from;
			pipes = conn->pipe_from_to;
		} else {
			move_from = conn->to;
			pipes = conn->pipe_to_from;
		}
	} else {
		move_from = ref.s;
		if (ref.s == conn->from) {
			move_to = conn->to;
			pipes = conn->pipe_from_to;
		} else {
			move_to = conn->from;
			pipes = conn->pipe_to_from;
		}
	}

	if (events & EPOLLHUP) {
		tcp_splice_state(conn, SPLICE_FIN_BOTH);
	} else if (events & EPOLLRDHUP) {
		if (ref.s == conn->from) {
			if (conn->state == SPLICE_ESTABLISHED)
				tcp_splice_state(conn, SPLICE_FIN_FROM);
			else if (conn->state == SPLICE_FIN_TO)
				tcp_splice_state(conn, SPLICE_FIN_BOTH);
		} else {
			if (conn->state == SPLICE_ESTABLISHED)
				tcp_splice_state(conn, SPLICE_FIN_TO);
			else if (conn->state == SPLICE_FIN_FROM)
				tcp_splice_state(conn, SPLICE_FIN_BOTH);
		}
	}

swap:
	eof = 0;

	while (1) {
		int never_read = 1, retry_write = 1;
		ssize_t read, to_write = 0, written;

retry:
		read = splice(move_from, NULL, pipes[1], NULL, PIPE_SIZE,
			      SPLICE_F_MOVE);
		if (read < 0) {
			if (errno == EINTR)
				goto retry;

			if (errno != EAGAIN)
				goto close;

			to_write = PIPE_SIZE;
		} else if (!read) {
			eof = 1;
			to_write = PIPE_SIZE;
		} else {
			never_read = 0;
			to_write += read;
			if (move_from == conn->from)
				conn->from_read += read;
			else
				conn->to_read += read;
		}

eintr:
		written = splice(pipes[0], NULL, move_to, NULL, to_write,
				 SPLICE_F_MOVE);

		if (written > 0) {
			if (move_from == conn->from)
				conn->from_written += written;
			else
				conn->to_written += written;
		}

		if (written < 0) {
			if (errno == EINTR)
				goto eintr;

			if (errno != EAGAIN)
				goto close;

			if (never_read)
				break;

			if (retry_write--)
				goto retry;

			ev.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP;
			ref.s = move_to;
			ev.data.u64 = ref.u64,
			epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move_to, &ev);
			break;
		} else if (never_read && written == PIPE_SIZE) {
			goto retry;
		} else if (!never_read &&written < to_write) {
			to_write -= written;
			goto retry;
		}

		if (eof)
			break;
	}

	if (conn->state == SPLICE_FIN_BOTH ||
	    (conn->state == SPLICE_FIN_FROM && move_from == conn->from) ||
	    (conn->state == SPLICE_FIN_TO   && move_from == conn->to)) {
		if (move_from == conn->from &&
		    conn->from_read == conn->from_written) {
			if (!conn->from_fin_sent) {
				shutdown(move_to, SHUT_WR);
				conn->from_fin_sent = 1;
			}

			if (conn->to_fin_sent)
				goto close;
		} else if (move_from == conn->to &&
			   conn->to_read == conn->to_written) {
			if (!conn->to_fin_sent) {
				shutdown(move_to, SHUT_WR);
				conn->to_fin_sent = 1;
			}

			if (conn->from_fin_sent)
				goto close;
		}
	}

	if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
		events = EPOLLIN;

		SWAP(move_from, move_to);
		if (pipes == conn->pipe_from_to)
			pipes = conn->pipe_to_from;
		else
			pipes = conn->pipe_from_to;

		goto swap;
	}

	return;

close:
	tcp_splice_destroy(c, conn);
	return;
}

/**
 * tcp_sock_handler() - Handle new data from socket
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 * @now:	Current timestamp
 */
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
		      struct timespec *now)
{
	struct tcp_tap_conn *conn;

	if (ref.tcp.splice) {
		tcp_sock_handler_splice(c, ref, events);
		return;
	}

	if (ref.tcp.listen) {
		tcp_conn_from_sock(c, ref, now);
		return;
	}

	conn = &tt[ref.tcp.index];

	if (events & EPOLLERR) {
		if (conn->state != CLOSED)
			tcp_rst(c, conn);
		return;
	}

	switch (conn->state) {
	case TAP_SYN_SENT:
		if (events & EPOLLOUT)
			tcp_connect_finish(c, conn);
		else
			tcp_rst(c, conn);
		return;
	case ESTABLISHED_SOCK_FIN:
	case ESTABLISHED_SOCK_FIN_SENT:
	case ESTABLISHED:
		tcp_data_from_sock(c, conn, now);
		if (events & EPOLLHUP) {
			tcp_rst(c, conn);
		} else if (events & EPOLLRDHUP) {
			if (conn->state == ESTABLISHED)
				tcp_tap_state(conn, ESTABLISHED_SOCK_FIN);
			tcp_data_from_sock(c, conn, now);
		}
		return;
	case LAST_ACK:
		tcp_send_to_tap(c, conn, 0, NULL, 0);
		if (conn->seq_ack_to_tap == conn->seq_from_tap + 1)
			tcp_tap_destroy(c, conn);
		return;
	case FIN_WAIT_1:
		if (events & EPOLLIN)
			tcp_data_from_sock(c, conn, now);
		if (events & EPOLLRDHUP) {
			tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0);
			tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN);
		}
		return;
	case CLOSE_WAIT:
	case FIN_WAIT_1_SOCK_FIN:
		if (events & EPOLLIN)
			tcp_data_from_sock(c, conn, now);
		if (events & EPOLLHUP) {
			if ((conn->seq_ack_to_tap == conn->seq_from_tap + 1) &&
			    (conn->seq_ack_from_tap == conn->seq_to_tap)) {
				tcp_tap_destroy(c, conn);
			} else {
				tcp_send_to_tap(c, conn, 0, NULL, 0);
			}
		}
		return;
	case TAP_SYN_RCVD:
	case SOCK_SYN_SENT:
	case SPLICE_ACCEPTED:
	case SPLICE_CONNECT:
	case SPLICE_ESTABLISHED:
	case SPLICE_FIN_FROM:
	case SPLICE_FIN_TO:
	case SPLICE_FIN_BOTH:
	case CLOSED:
		break;
	}

	if (events & EPOLLHUP)
		tcp_rst(c, conn);
}

/**
 * tcp_sock_init_ns() - Bind sockets in namespace for inbound connections
 * @arg:	Execution context
 *
 * Return: 0 on success, -1 on failure
 */
static int tcp_sock_init_ns(void *arg)
{
	union tcp_epoll_ref tref = { .listen = 1, .splice = 1 };
	struct ctx *c = (struct ctx *)arg;
	in_port_t port;

	ns_enter(c->pasta_pid);

	for (port = 0; port < USHRT_MAX; port++) {
		if (!bitmap_isset(c->tcp.port_to_init, port))
			continue;

		tref.index = (in_port_t)(port + tcp_port_delta_to_init[port]);

		if (c->v4) {
			tref.v6 = 0;
			sock_l4(c, AF_INET, IPPROTO_TCP, port, BIND_LOOPBACK,
				tref.u32);
		}

		if (c->v6) {
			tref.v6 = 1;
			sock_l4(c, AF_INET6, IPPROTO_TCP, port, BIND_LOOPBACK,
				tref.u32);
		}
	}

	return 0;
}

/**
 * tcp_sock_init() - Bind sockets for inbound connections, get key for sequence
 * @c:		Execution context
 *
 * Return: 0 on success, -1 on failure
 */
int tcp_sock_init(struct ctx *c)
{
	union tcp_epoll_ref tref = { .listen = 1 };
	in_port_t port;

	getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM);

	for (port = 0; port < USHRT_MAX; port++) {
		if (!bitmap_isset(c->tcp.port_to_tap, port))
			continue;

		tref.index = (in_port_t)(port + tcp_port_delta_to_tap[port]);
		if (c->v4) {
			tref.v6 = 0;

			tref.splice = 0;
			sock_l4(c, AF_INET, IPPROTO_TCP, port,
				c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
				tref.u32);

			if (c->mode == MODE_PASTA) {
				tref.splice = 1;
				sock_l4(c, AF_INET, IPPROTO_TCP, port,
					BIND_LOOPBACK, tref.u32);
			}
		}

		if (c->v6) {
			tref.v6 = 1;

			tref.splice = 0;
			sock_l4(c, AF_INET6, IPPROTO_TCP, port,
				c->mode == MODE_PASTA ? BIND_EXT : BIND_ANY,
				tref.u32);

			if (c->mode == MODE_PASTA) {
				tref.splice = 1;
				sock_l4(c, AF_INET6, IPPROTO_TCP, port,
					BIND_LOOPBACK, tref.u32);
			}
		}
	}

	if (c->v4)
		tcp_sock4_iov_init();

	if (c->v6)
		tcp_sock6_iov_init();

	if (c->mode == MODE_PASTA)
		NS_CALL(tcp_sock_init_ns, c);

	return 0;
}

/**
 * tcp_timer_one() - Handler for timed events on one socket
 * @c:		Execution context
 * @conn:	Connection pointer
 * @ts:		Timestamp from caller
 */
static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn,
			  struct timespec *ts)
{
	int ack_tap_ms = timespec_diff_ms(ts, &conn->ts_ack_tap);
	int sock_ms = timespec_diff_ms(ts, &conn->ts_sock);
	int tap_ms = timespec_diff_ms(ts, &conn->ts_tap);

	switch (conn->state) {
	case SOCK_SYN_SENT:
	case TAP_SYN_RCVD:
		if (ack_tap_ms > SYN_TIMEOUT)
			tcp_rst(c, conn);

		break;
	case ESTABLISHED_SOCK_FIN_SENT:
		if (ack_tap_ms > FIN_TIMEOUT) {
			tcp_rst(c, conn);
			break;
		}
		/* Falls through */
	case ESTABLISHED:
	case ESTABLISHED_SOCK_FIN:
		if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT) {
			tcp_rst(c, conn);
			break;
		}

		if (conn->seq_to_tap == conn->seq_ack_from_tap &&
		    conn->seq_from_tap == conn->seq_ack_to_tap) {
			conn->ts_sock = *ts;
			break;
		}

		if (sock_ms > ACK_INTERVAL) {
			if (conn->seq_from_tap > conn->seq_ack_to_tap)
				tcp_send_to_tap(c, conn, 0, NULL, 0);
		}

		if (sock_ms - ack_tap_ms > ACK_TIMEOUT) {
			if (conn->seq_ack_from_tap < conn->seq_to_tap) {
				if (sock_ms - ack_tap_ms > 10 * ACK_TIMEOUT) {
					tcp_rst(c, conn);
					break;
				}

				conn->seq_to_tap = conn->seq_ack_from_tap;
				if (sock_ms > ACK_TIMEOUT)
					tcp_data_from_sock(c, conn, ts);
			}
		}

		if (conn->seq_from_tap == conn->seq_ack_to_tap)
			conn->ts_sock = *ts;

		if (!conn->tcpi_snd_wnd)
			tcp_send_to_tap(c, conn, 0, NULL, 0);

		break;
	case CLOSE_WAIT:
	case FIN_WAIT_1_SOCK_FIN:
		if (sock_ms - ack_tap_ms > FIN_TIMEOUT)
			tcp_rst(c, conn);
		break;
	case FIN_WAIT_1:
		if (sock_ms > FIN_TIMEOUT)
			tcp_rst(c, conn);
		break;
	case LAST_ACK:
		if (sock_ms > LAST_ACK_TIMEOUT)
			tcp_rst(c, conn);
		break;
	case TAP_SYN_SENT:
	case SPLICE_ACCEPTED:
	case SPLICE_CONNECT:
	case SPLICE_ESTABLISHED:
	case SPLICE_FIN_FROM:
	case SPLICE_FIN_TO:
	case SPLICE_FIN_BOTH:
	case CLOSED:
		break;
	}
}

/**
 * tcp_timer() - Scan activity bitmap for sockets waiting for timed events
 * @c:		Execution context
 * @ts:		Timestamp from caller
 */
void tcp_timer(struct ctx *c, struct timespec *ts)
{
	long *word = (long *)tcp_act, tmp;
	unsigned int i;
	int n;

	for (i = 0; i < sizeof(tcp_act) / sizeof(long); i++, word++) {
		tmp = *word;
		while ((n = ffsl(tmp))) {
			int index = i * sizeof(long) * 8 + n - 1;

			tmp &= ~(1UL << (n - 1));
			tcp_timer_one(c, &tt[index], ts);
		}
	}
}