aboutgitcodebugslistschat
path: root/tcp.c
blob: ec8c32ea288394c268a400a3cbeed620cc6e87d8 (plain) (tree)
1
2
3
4
5
6
7
8
9
10


                                             



                                         


                                              
                                       
                                              





                           

             






                                                                                
                                                                              

























                                                                                
                                                                    




                                                                                







                                                                                
                                                               


                                                                           
                                                                              
                                                                             
                                                                           

                                                                              





                                                                           
                                                                               
                                                                               

                                                                              












                                                                               
                                                                          
                                                                                




                                                                                
                 















                                                                                

                                                                               


                                                     
                    

                    



















                                                                               
  
                                                   
  
                                             
  





                                                                               















                                                                                
                                                                           
  


                                                                             

                                                                              


                                                                               
  


                                                                               
  



                                                                              
  





                                                                             











                                                                           
  

                                                                          

                              

                               
                                


                                                               
                                                                                



                                                              





                                                                               
  


                                




                                                                              


                                                                              
                              










                                                                                
                                                      

   

                  
                  
                   




                         
                       



                      
                    
                       
      
                       
                        
                      
                    
                   

                 

                                                
                     
                 

                  
                    
                 
                 
                       
 
                                           

                                                                         
 


                                                                            
                                                                 
                                                               
                                                                         
                                                             
 
                                         
                                                              

                                 
                                           




































                                                                             
 
                                                                      




                                                               
 




                                                                
 
                                                                         
 
                                         
                                                   
 




                                                                     




                                                                    



                                
                              
                                
                                                                 



                         
                         
                         
                         




                         
                                                                  

                                                                                
                                                


                                                                         
                                                                    




                                                                             





                                                                           

                                                                            




                                                                              
   
                 
                                                                 
 







                                                                      
 
 


                                                         













                                                                         



                                                         






                                      
 
 
                                                              
 



                                                                              
 






























                                                                                

  



                                                                         
 
                                              
 





                                                                                




                                                                           

  











                                                                         
                                             

  



                                                                     




                                                                              


                                                                        




                                                                        

                                                                                





                                                                              
                             



                                                             

                                                                    
      



                                                                     

                                                                        




                                                              
                            
 

                                     


                                                                     
                                                                              





                                                                              
                      








                                                                     

                                                                        




                                                              
                            
 

                                     
 
                                      
                                                             
                                                                     
 



                                                                 
 
                                                                 
 
                         
                                                             
 










                                                                                
                                   



                                                             

                                                                    
      



                                                                     





                                                              
                                  
 

                                           









                                                                              
                                   














                                                                                
                                  
 

                                           
 
                     
                                         
 
                                                                   























                                                                         
                                         
                                                                         









                                                       

                                                                    












                                                                         
                                                                    
 
                                                                         




                                                                          




                                                                               







                                                                     
                                                                 
 











                                                                             



                 





                                                                                
                                                                     


                                                


                                   






                                                                         
                       
 

                                                        


                                                                               
                                 








                                                                               
                                           



                                                                       







                                                                   
                                                                       






                                                                           



                                              
                                                                    






                                            



                                                                      




                                       



                                                              

         
                                                
                                       
 


                                                                            
                                       







                                                                      
                                                                     







                                              
                                       









                                                                              





                                                                            
                                       


                          
                                                                



                                                                                
                                                      

                                                                                





                                                                       
                                                          
                                       
















                                                                         
 








                                                                            
 
   

                                                                        
  

                                                            
                                                       



                                                
                                                                     







                                                                            
                                                     
   

                                                           
 
                  

                         

                                                         


                                                  
                                                                     




                                                                           





                                                                             



                                                                      



                        


   


                                                                              
                                                 
 
                                   
                     
                   
 

                                                                 
                                                 


                       
                   




                                                                              
                                          


   


                                                                         
                                                     





                                                                             

                                                                                
 

                                                                                


   
























































                                                                           
                                                                              
                                             


              
                                              

                                                                        





                                                               


                                                                




                                                                 


                                                                  


                            
                                                                
                                 



                                                                             



                                                                      
                                                     
                                

                                                                           












                                                                               

                                                               
                              






                                                                               
                              




                                                                             
                                                                        


                                                         
 
                                                                            
                                                               









                                                                               















                                                                               
                                                                        


                                                         
 
                                                                            
                                                               


   
                                                                

                                                                           
                                       

                                                                
  
                                                                      
   

                                                                       
 
                             
 

                          
 

                                                         


                                  
                                   

                              

                                               

                                 
                                              

                                      


                                                     
                                                  




                                         
                                             
                               
                                                                
                                
                                                                
                         






                  
                                                                           







                                                                
                                                                                
                                                                  
 
                                                                  




                                                                       
                                                                  






                                                                       
                                                                           
                                 






                                                                


                                                             
                                                                           
                                                                     
 
                       






                                                    
                                                                     

                  
                                                                   





                                                    
                                                                      

                  
                                                                    





                                                       
                                                                    
                                 
                                  

                                                                
   

                                                                       


              
                                                                   
                                                             
                          

                              

                                                                              


   

                                                                    
   
                                                        
 
                                             
                                  
 

                                                                     
                                    
                                 
                                                                    
                            
                                                                            



                              
                                                                                

                                                                  






                                                          
                                                                       
 
                                             

                                 

                                                                     

                                   
                                                            
                            
                                                 
                              

                 
 
                                                                               
                                             
                                                          


   
                                                                        
                                 




                                                                
                                                          
   

                                                                    
                                                                                
 
                                                           
                              
 
                                                                              

                                                                        

         
                    


   
                                                               
                                 
                                                     
   
                                                                   
 
                                   
 
                                                 
                                                                               
                                       
                                               


                       
                                       
                                          



                                  
                             
 
                                                                          
                                          
                                                        

                                       


   
                                                                                
                                 
                                  
   
                                                                  
 
                          

                                   
 

                                   

 


                                                                         
                                                                         


                                                                         
 
   
                                                                        
                                 



                                                                            
   
                                                                       
 


                                                                                
                                                                  
                              

         
                 


   




                                                                                

                                                                       













                                                                        

                                                                                


   
                                                                                
                                 


                                                                      

                                                              
                                                                       
 

                                          

                                    


                                                                               
                
                         
 


                                                            
                                                         
                                    

                 


                                   
 


                                                                                
   
                                                 







                                                   
                                                      



                                                   
                                                      
 
 


                                                                    
   
                                                







                                             
                                                      



                                             
                                                      


   


                                                       
                                     
 



                                                                    

                                  
 
                                    








                                                                      










                                                                                
                                   
   

                                                                  
                                                           
                                                                          



                                                                         












                                                                         


                                                                    





                                                                               
                                                            
                    
                                                         






                                                          


                                                               









                                                                             
                                                























                                                                        
                                                                                
                                                                    


                                                            
                                                                            
                                                                       
 
                                                                       
                                                        

                                      
                                                  

                           
                       
                        




                                                          
                                                                              
                                                                             

                                                                


                                                                         


                                         
                                                                




                                                                  
                             
 
                                         
                                     
                                                                   

                                                                         
                         

         
                     
                                                       
                                 
 
                                   
                                                                   
                                 
 

         
                  
                                                             
                                                     

                                     

                                                              
         
      
 




                                                                             
 

                                                   
 
    
                                                         



                                                       
                                                                
                                 
                                  
                                                                      
  
                                                               
   
                                                                         
 
                                                        
                                                    

                                              

                                      
                                   
                           
                          
                          
                   
                
 


                                                               
 
                                                            
                                            
                                   

         




                                                         
                                   
                                                
 
                                                                     
                         
 
                            

                                                                      
                             


                                                                         
                

                                                                      
                             
                                

         
                          
                        
 


                                                                  

                                      

                                   
                                                 
                        
                                                             
                                          


                                                              
 
                                          
                                                                             
                                                          
                                                 
                                                                 
                 
                                                               
 

                                            
 
                                                                     
 


                                     
                                          

                                          
                
                                                        

                                                                    

         
                                              




                                  


                                                                  
 




                                                        

                                                    
 

                                                     


                                                                      
                                   





                                                          
                                                                


                                                                               
                                                  




                                                          
                                                                
                 
 
                                                                               
                                                  
         
 



                 
                                                                               


                                  
                                                            
 
                                   

                       
                                         
                                            


   
                                                                             
                                  


                                                                     

                                                           
 
                                                               
 




                                        
 





                                                                      

                                                                        

                                                                       
                           
 

                                   
 


                                        
 



                                                                          
         

                                                                      


                                                                             
                                        


   





                                                                           
                                 


                               
                                                                           
                                                                  
                                                        
 
                             
 








                                                       
                                               


                                           
                                                                      








                                                        
                                           


                                           
                                                                      

         

                                                                               




                        
                                                                                
                                 


                                                                              
   
                                                                 
 
                                                                               
 


                                                       
                              

         
                  
                                                                         
 




                             
                  
                              
 
                                   
 

                 
 
   
                                                             

                                  

                                                                     


                            


                                                                 



                         
                                                                       



















                                                                         


                                                                     

                                 


                                                                        













                                                      
                   







                                               
                                                                        
                                                                         
                                                                           





                                                                        
                                                    
                                                 
                  



                                                                              

         
                                         
                       
                         
                                          
 
                                          
 
                                                      

                                                                       
                           
 






                                                                              

                            
                                               

                                   


                                                                        
                
                                               

                                   
                                                                   

         

                                           
 


                                                         
 

                                                                                
 
                                           
 
                               
                                                                      



                                                      
 

                                           
                                         


                               
                                     
                

                                     
                                                      
                               
 
                                                      

         
                               


   




                                                                               
   
                                                                    
 





                                                                           
 


                                                                    
 

                                         
 
 
   
                                                                              

                                  



                                                                             
   

                                                                    
 

                          
 


                                                                         
 
                                                                            
 


                                                                              
                                                 







                                                                              
                                                 
         




                                                                              
                                  
  
                                                    

                    
   
                                                                   
 
                                                                      
                                                            
                                                   
                                       
                                      
                                     
                              
                          


                                                                 
                                      
                                           
                                                                    

                                                                


                                 
                                                        

                                                     
                         
         
 
                                                                            
                                                                 

                                       

                            
                                                            
         
 





                                               
                                                                              
                                                                              
                                         
 



                                                             




                                                                                
                                   
         
                    
                                                      
 
                                                                              
        
                                             

                                   
                                     
                         
         
 


                              

                                     
                                            
                         

         
                                     
 

                                                   
 
                                                  
                                                
 
                                   
                   
                                         
                                                                          



                                        
                                                                          
                                         

         
                                             
 
                 


                                                      
                             
                                 
         
 
                   

         
                                                                               
                                                                




                                                  
         
 
                 


   
                                                                  

                                  
                                                     

                    
   
                                                                   
                                                   
 
                                                                              
                                                      
                                                      
                                                   
                                                  
                   
                  
 
                                                        
                                                  
                                  
                           

                           




                                                    

                                                            



                                         
                                     





                                                     
                                                    



                               


                                                        





                                             
 

                                                                      
                                                      

                                                                
                                                                            

                                                                    
                                                      
                         




                                


                                 






                                                                     
                                                             





                                                                        
                                                              

                                                                             



                                              
                                  
                   
                                                                             

                                 
                                            
                                       



                                         


                                                            

                        
                              
                                  


                                     

         
                                                   
 
                  




                                                              


                                                    
                   
                                                                              
                                                     
                                                     
                                               
                                            

         
                   
                         
 
                              
      







                                                                  
                                                    

                 


                                   
                                                              
                                                              





                                 
                                                           
                                 
                                        
                                                      

                                        

         

                         





                                                                              
                                                        
                 
                       
         
 


                                                       
 


                                     
                                                  
                
                                                      



         


                                                                          


                                                                             

                                                                           

                                                                      
 







                                                               









                                                                   

                                              


   


                                                                    
                                   
                                                     
                                 

                                    
   

                                                                     
 
                              

                           
                        

                   


                                          




                                                    

                                                                           
                                                           
 
                                                                                

                                     
                    
                                                
                                                                              
                         

         

                                                                               
                      

                                            

         





                                                      
 


                                                   
                                                                             
                    
                                         
 

                         
 



                                                         
                                        
                 
 
                                                 
 
                              

                                             
                                                      
                                                    

                                                           
                                        


                               
                                         
                                        

                 


                                                             
 
                                  

                                 
 

                                                                 

                                                               
                                                    
 



                                                             
                                      

                                                       



                                                                               

                                            

         


                                                   
                        




                                                                            
                                  
   
                                                                    
 



                        

                                                                           


                       
                                              

                       
                                              
                                             








                                                                             
                                                          
 
                                   
                              
                     

              
                                               

                       
                        
                                                                         


                       

                                         
                         
                                                
                                           
 
                                 
                                        
 
                                               
 
                                                           

                                                                            

                                             

                                                              
                            
                                                      


                                                                  
 


                                                                        
                                                       
 
                                                                            



                                                                
                                                                   
                



                                               
                                                    



                                                                       
                                                                   

                                                                              
 
                                       
                                                                     
 
                                                      
                                                       
 
                                                                    



                                                                
                                                           



                                                      
                                            
 

                                             
 
                             


   


























                                                                              
                                                                              
                                         
                                                                           
                                                                        

                                                              
                                                                               


                                         
                                                                               


                                                                  
                                               












                                                                                
                                                                             






                                                                     
                                 
                               
                                   
                                 
   
                                                                          
                                                 
 
                              
 




                                          
                                     


                                                        
 
                                     
                                                


                       
                                                          
                       
 


                                   
                                
                                 


                       
                                                                   
                                            
                       
         
 

                                                                  
                                                    
 

                                                           
 
                                     
                                                    
 

                                                                
 
                       

         


                                              
                       
         
 

                                                         
                       
 

                                           
                                                    

                                       


   
                                                                  

                                                                              

                                                                            

                                

                                                               
 
                                                       
                              

              






                                                                           
 

                                                   
                                                 

                                         
 
                                
                                    
 
                          
                                                                             
                                              
                                   
                                                           







                                                                

                                                                          
                                            

                                                                             
                                   
                                                           











                                                                       

                                                   
                                                 


                                         
                                
 
                                    
                          
                                                                              
                                              
                                   
                                                           







                                                                

                                                      
                                            

                                                                              
                                   
                                                           













                                                                       


                                                                         
            


                                      
                                          
                 
 
                    
 


                                                             
 
                                                           





                 

















                                                                          
                        

                    
                               






                                     
                                                                      
                             
                              
 
                                                                                





                                       

                                                        

         
                                                                      
                             
                              
 

                                                                   





                                       

                                                        





                 
                                                                             

                                 
                                       
   
                           
 
                                                         
              







                                                                            
 











                                                       

                                                                      
                           


                                                         
 

                                                                            
 
                    

                                     
                    
                                     
 







                                                                           


                                     

                                   
                                             
 

                                                      
 
                                    

         



                 



















                                                                          
                               



























                                                                          
                 

                            
                               


                                                                          
                                                                 
                                                                     
                                                                   

                                 
                                                                 
                                                                     
                                                                   








                                                                      

                                                                          
                                                                              



                                                                         
                                                                       
                                                                           
                                                                         

                                 
                                                                       
                                                                           
                                                                         

                                 
                                                                      
                                                                          
                                                                        

                                 
                                                                      
                                                                          
                                                                        







                                                                       

                                                                                
                                                                              






                 
                                                                                
                                 
                      
   
                                                        
 
                                                         
                              
 
                 
 









                                                                 
 




                                                              
                 
         
 


                                                                      

         


                                     

                                                                        


                                                              
         
 
// SPDX-License-Identifier: AGPL-3.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp.c - TCP L2-L4 translation state machine
 *
 * Copyright (c) 2020-2022 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

/**
 * DOC: Theory of Operation
 *
 *
 * PASST mode
 * ==========
 *
 * This implementation maps TCP traffic between a single L2 interface (tap) and
 * native TCP (L4) sockets, mimicking and reproducing as closely as possible the
 * inferred behaviour of applications running on a guest, connected via said L2
 * interface. Four connection flows are supported:
 * - from the local host to the guest behind the tap interface:
 *   - this is the main use case for proxies in service meshes
 *   - we bind to configured local ports, and relay traffic between L4 sockets
 *     with local endpoints and the L2 interface
 * - from remote hosts to the guest behind the tap interface:
 *   - this might be needed for services that need to be addressed directly,
 *     and typically configured with special port forwarding rules (which are
 *     not needed here)
 *   - we also relay traffic between L4 sockets with remote endpoints and the L2
 *     interface
 * - from the guest to the local host:
 *   - this is not observed in practice, but implemented for completeness and
 *     transparency
 * - from the guest to external hosts:
 *   - this might be needed for applications running on the guest that need to
 *     directly access internet services (e.g. NTP)
 *
 * Relevant goals are:
 * - transparency: sockets need to behave as if guest applications were running
 *   directly on the host. This is achieved by:
 *   - avoiding port and address translations whenever possible
 *   - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
 *     socket option) and TCP headers of packets coming from the tap interface,
 *     reapplying those parameters in both flow directions (including TCP_MSS,
 *     TCP_WINDOW_CLAMP socket options)
 * - simplicity: only a small subset of TCP logic is implemented here and
 *   delegated as much as possible to the TCP implementations of guest and host
 *   kernel. This is achieved by:
 *   - avoiding a complete TCP stack reimplementation, with a modified TCP state
 *     machine focused on the translation of observed events instead
 *   - mirroring TCP dynamics as described above and hence avoiding the need for
 *     segmentation, explicit queueing, and reassembly of segments
 * - security:
 *   - no dynamic memory allocation is performed
 *   - TODO: synflood protection
 *
 * Portability is limited by usage of Linux-specific socket options.
 *
 *
 * Limits
 * ------
 *
 * To avoid the need for dynamic memory allocation, a maximum, reasonable amount
 * of connections is defined by TCP_MAX_CONNS (currently 128k).
 *
 * Data needs to linger on sockets as long as it's not acknowledged by the
 * guest, and is read using MSG_PEEK into preallocated static buffers sized
 * to the maximum supported window, 16 MiB ("discard" buffer, for already-sent
 * data) plus a number of maximum-MSS-sized buffers. This imposes a practical
 * limitation on window scaling, that is, the maximum factor is 256. Larger
 * factors will be accepted, but resulting, larger values are never advertised
 * to the other side, and not used while queueing data.
 *
 *
 * Ports
 * -----
 *
 * To avoid the need for ad-hoc configuration of port forwarding or allowed
 * ports, listening sockets can be opened and bound to all unbound ports on the
 * host, as far as process capabilities allow. This service needs to be started
 * after any application proxy that needs to bind to local ports. Mapped ports
 * can also be configured explicitly.
 *
 * No port translation is needed for connections initiated remotely or by the
 * local host: source port from socket is reused while establishing connections
 * to the guest.
 * 
 * For connections initiated by the guest, it's not possible to force the same
 * source port as connections are established by the host kernel: that's the
 * only port translation needed.
 *
 *
 * Connection tracking and storage
 * -------------------------------
 *
 * Connections are tracked by the @tc array of struct tcp_conn, containing
 * addresses, ports, TCP states and parameters. This is statically allocated and
 * indexed by an arbitrary connection number. The array is compacted whenever a
 * connection is closed, by remapping the highest connection index in use to the
 * one freed up.
 *
 * References used for the epoll interface report the connection index used for
 * the @tc array.
 *
 * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
 * separate data structures depending on the protocol version.
 *
 * - Inbound connection requests (to the guest) are mapped using the triple
 *   < source IP address, source port, destination port >
 * - Outbound connection requests (from the guest) are mapped using the triple
 *   < destination IP address, destination port, source port >
 *   where the source port is the one used by the guest, not the one used by the
 *   corresponding host socket
 *
 *
 * Initialisation
 * --------------
 *
 * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
 * IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
 * to bind (for low ports, or ports already bound, e.g. by a proxy). These are
 * added to the epoll list, with no separate storage.
 *
 *
 * Events and states
 * -----------------
 *
 * Instead of tracking connection states using a state machine, connection
 * events are used to determine state and actions for a given connection. This
 * makes the implementation simpler as most of the relevant tasks deal with
 * reactions to events, rather than state-associated actions. For user
 * convenience, approximate states are mapped in logs from events by
 * @tcp_state_str.
 *
 * The events are:
 *
 * - SOCK_ACCEPTED	connection accepted from socket, SYN sent to tap/guest
 *
 * - TAP_SYN_RCVD	tap/guest initiated connection, SYN received
 *
 * - TAP_SYN_ACK_SENT	SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only
 *
 * - ESTABLISHED	connection established, the following events are valid:
 *
 * - SOCK_FIN_RCVD	FIN (EPOLLRDHUP) received from socket
 *
 * - SOCK_FIN_SENT	FIN (write shutdown) sent to socket
 *
 * - TAP_FIN_RCVD	FIN received from tap/guest
 *
 * - TAP_FIN_SENT	FIN sent to tap/guest
 *
 * - TAP_FIN_ACKED	ACK to FIN seen from tap/guest
 *
 * Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD,
 * ESTABLISHED) clears all the other events, as those represent the fundamental
 * connection states. No events (events == CLOSED) means the connection is
 * closed.
 *
 * Connection setup
 * ----------------
 *
 * - inbound connection (from socket to guest): on accept() from listening
 *   socket, the new socket is mapped in connection tracking table, and
 *   three-way handshake initiated towards the guest, advertising MSS and window
 *   size and scaling from socket parameters
 * - outbound connection (from guest to socket): on SYN segment from guest, a
 *   new socket is created and mapped in connection tracking table, setting
 *   MSS and window clamping from header and option of the observed SYN segment
 *
 * 
 * Aging and timeout
 * -----------------
 *
 * Timeouts are implemented by means of timerfd timers, set based on flags:
 *
 * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
 *   ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
 *   connection
 *
 * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
 *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
 *   socket and reset sequence to what was acknowledged. If this persists for
 *   more than TCP_MAX_RETRANS times in a row, reset the connection
 *
 * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
 *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
 *   the connection
 *
 * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
 *   segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
 *   TAP_FIN_ACKED), but no activity is detected on the socket within
 *   this time, reset the connection
 *
 * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
 *   either side, the connection is reset
 *
 * - ACK_INTERVAL elapsed after data segment received from tap without having
 *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
 *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
 *
 *
 * Summary of data flows (with ESTABLISHED event)
 * ----------------------------------------------
 *
 * @seq_to_tap:		next sequence for packets to tap/guest
 * @seq_ack_from_tap:	last ACK number received from tap/guest
 * @seq_from_tap:	next sequence for packets from tap/guest (expected)
 * @seq_ack_to_tap:	last ACK number sent to tap/guest
 *
 * @seq_init_from_tap:	initial sequence number from tap/guest
 * @seq_init_to_tap:	initial sequence number to tap/guest
 *
 * @wnd_from_tap:	last window size received from tap, never scaled
 * @wnd_to_tap:		last window size advertised to tap, never scaled
 *
 * - from socket to tap/guest:
 *   - on new data from socket:
 *     - peek into buffer
 *     - send data to tap/guest:
 *       - starting at offset (@seq_to_tap - @seq_ack_from_tap)
 *       - in MSS-sized segments
 *       - increasing @seq_to_tap at each segment
 *       - up to window (while @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap)
 *     - on read error, send RST to tap/guest, close socket
 *     - on zero read, send FIN to tap/guest, set TAP_FIN_SENT
 *   - on ACK from tap/guest:
 *     - set @ts_ack_from_tap
 *     - check if it's the second duplicated ACK
 *     - consume buffer by difference between new ack_seq and @seq_ack_from_tap
 *     - update @seq_ack_from_tap from ack_seq in header
 *     - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
 *       resend with steps listed above
 *     - set TCP_WINDOW_CLAMP from TCP header from tap
 *
 * - from tap/guest to socket:
 *   - on packet from tap/guest:
 *     - set @ts_tap_act
 *     - set TCP_WINDOW_CLAMP from TCP header from tap
 *     - check seq from header against @seq_from_tap, if data is missing, send
 *       two ACKs with number @seq_ack_to_tap, discard packet
 *     - otherwise queue data to socket, set @seq_from_tap to seq from header
 *       plus payload length
 *     - in ESTABLISHED state, send ACK to tap as soon as we queue to the
 *       socket. In other states, query socket for TCP_INFO, set
 *       @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
 *       send ACK to tap/guest
 *
 *
 * PASTA mode
 * ==========
 *
 * For traffic directed to TCP ports configured for mapping to the tuntap device
 * in the namespace, and for non-local traffic coming from the tuntap device,
 * the implementation is identical to the PASST mode described in the previous
 * section.
 *
 * For local traffic directed to TCP ports configured for direct mapping between
 * namespaces, see the implementation in tcp_splice.c.
 */

#include <sched.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
#ifdef HAS_GETRANDOM
#include <sys/random.h>
#endif
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
#include <time.h>

#include <linux/tcp.h> /* For struct tcp_info */

#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "pcap.h"
#include "conf.h"
#include "tcp_splice.h"

/* Number of entries in each statically allocated frame buffer and iovec array
 * declared further down in this file
 */
#define TCP_FRAMES_MEM			128
/* TCP_FRAMES_MEM in MODE_PASST, 1 in any other mode. This expands to an
 * expression dereferencing a context pointer named 'c', which must be in scope
 * wherever the macro is used.
 */
#define TCP_FRAMES							\
	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)

#define TCP_FILE_PRESSURE		30	/* % of c->nofile */
#define TCP_CONN_PRESSURE		30	/* % of c->tcp.conn_count */

/* The hash table is sized so that TCP_MAX_CONNS entries correspond to a
 * TCP_HASH_TABLE_LOAD percent occupancy. Bucket indices are stored in
 * TCP_HASH_BUCKET_BITS wide bit-fields (see @hash_bucket in struct tcp_conn).
 */
#define TCP_HASH_BUCKET_BITS		(TCP_CONN_INDEX_BITS + 1)
#define TCP_HASH_TABLE_LOAD		70		/* % */
#define TCP_HASH_TABLE_SIZE		(TCP_MAX_CONNS * 100 /		\
					 TCP_HASH_TABLE_LOAD)

/* Largest window scaling shift used here: MAX_WINDOW is 2^(16 + MAX_WS) bytes,
 * that is, 16 MiB
 */
#define MAX_WS				8
#define MAX_WINDOW			(1 << (16 + (MAX_WS)))

/* MSS rounding: see MSS_SET() */
#define MSS_DEFAULT			536

/**
 * struct tcp4_l2_head - Everything preceding the payload in IPv4 frame buffers
 *
 * Same members, in the same order, as struct tcp4_l2_buf_t up to (and
 * excluding) its payload storage. The size of this structure is the per-frame
 * overhead that MSS4 subtracts from USHRT_MAX, so padding and alignment
 * attributes need to match the ones of the actual buffer.
 */
struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
	uint32_t psum;
	uint32_t tsum;
#ifdef __AVX2__
	uint8_t pad[18];
#else
	uint8_t pad[2];
#endif
	uint32_t vnet_len;
	struct ethhdr eh;
	struct iphdr iph;
	struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/**
 * struct tcp6_l2_head - Everything preceding the payload in IPv6 frame buffers
 *
 * Same members, in the same order, as struct tcp6_l2_buf_t up to (and
 * excluding) its payload storage. The size of this structure is the per-frame
 * overhead that MSS6 subtracts from USHRT_MAX, so padding and alignment
 * attributes need to match the ones of the actual buffer.
 */
struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
	uint8_t pad[14];
#else
	uint8_t pad[2];
#endif
	uint32_t vnet_len;
	struct ethhdr eh;
	struct ipv6hdr ip6h;
	struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/* Size of the payload area in IPv4 and IPv6 frame buffers: what is left of
 * USHRT_MAX bytes once headers and padding are taken out, passed through
 * ROUND_DOWN(..., 4) (defined elsewhere, presumably rounding down to a multiple
 * of 4 -- TODO confirm)
 */
#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)

#define WINDOW_DEFAULT			14600		/* RFC 6928 */
/* KERNEL_REPORTS_SND_WND() - Non-zero if the kernel reports the sending window
 *
 * If HAS_SND_WND is defined at build time, this is the run-time value of
 * c->tcp.kernel_snd_wnd. Otherwise it is constant zero, written so that the
 * expansion still mentions c (presumably to avoid unused-variable warnings in
 * callers).
 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c)	(c->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
#endif

#define ACK_INTERVAL			50		/* ms */
#define SYN_TIMEOUT			10		/* s */
#define ACK_TIMEOUT			2
#define FIN_TIMEOUT			60
#define ACT_TIMEOUT			7200

#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */

#define LOW_RTT_TABLE_SIZE		8
#define LOW_RTT_THRESHOLD		10 /* us */

/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
 * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
 */
#define SOL_TCP				IPPROTO_TCP

/* Sequence number comparisons with wrap-around: a is considered to be before
 * (or equal to) b if the distance from a to b, computed as an unsigned
 * difference, is less than MAX_WINDOW. The _LT and _GT variants subtract one
 * more so that equal values yield a huge distance, and compare false.
 *
 * This relies on operands being unsigned 32-bit quantities, as the sequence
 * fields of struct tcp_conn are: with signed or wider operands the difference
 * would not wrap as needed (TODO: confirm for all callers).
 */
#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)

/* Segment flags: FIN, SYN, RST and ACK use the same bit positions as the
 * corresponding flags in the TCP header
 */
#define FIN		(1 << 0)
#define SYN		(1 << 1)
#define RST		(1 << 2)
#define ACK		(1 << 4)
/* Flags for internal usage */
#define DUP_ACK		(1 << 5)
#define ACK_IF_NEEDED	0		/* See tcp_send_flag() */

/* TCP option kinds, and lengths (in bytes, including kind and length fields)
 * of the options for which space is reserved in the flags buffers below
 */
#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPT_MSS_LEN	4
#define OPT_WS		3
#define OPT_WS_LEN	3
#define OPT_SACKP	4
#define OPT_SACK	5
#define OPT_TS		8

/**
 * struct tcp_conn - Descriptor for a TCP connection (not spliced)
 * @next_index:		Connection index of next item in hash chain, -1 for none
 * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
 * @ws_from_tap:	Window scaling factor advertised from tap/guest
 * @ws_to_tap:		Window scaling factor advertised to tap/guest
 * @sock:		Socket descriptor number
 * @events:		Connection events, implying connection states
 * @timer:		timerfd descriptor for timeout events
 * @flags:		Connection flags representing internal attributes
 * @hash_bucket:	Bucket index in connection lookup hash table
 * @tap_mss:		MSS advertised by tap/guest, stored by MSS_SET() without
 *			its lowest (16 - TCP_MSS_BITS) bits
 * @sndbuf:		Sending buffer in kernel, stored by SNDBUF_SET() without
 *			its lowest (32 - SNDBUF_BITS) bits
 * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
 * @a.a6:		IPv6 remote address, can be IPv4-mapped
 * @a.a4.zero:		Zero prefix for IPv4-mapped, see RFC 6890, Table 20
 * @a.a4.one:		Ones prefix for IPv4-mapped
 * @a.a4.a:		IPv4 address
 * @tap_port:		Guest-facing tap port
 * @sock_port:		Remote, socket-facing port
 * @wnd_from_tap:	Last window size from tap, unscaled (as received)
 * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
 * @seq_to_tap:		Next sequence for packets to tap
 * @seq_ack_from_tap:	Last ACK number received from tap
 * @seq_from_tap:	Next sequence for packets from tap (not actually sent)
 * @seq_ack_to_tap:	Last ACK number sent to tap
 * @seq_init_from_tap:	Initial sequence number from tap
 *
 * Constants and accessor macros are defined next to the members they refer
 * to. Members are declared as bit-fields wherever the value range allows, so
 * the declaration order affects the size of the structure.
 */
struct tcp_conn {
	int	 	next_index	:TCP_CONN_INDEX_BITS + 2;

#define TCP_RETRANS_BITS		3
	unsigned int	retrans		:TCP_RETRANS_BITS;
	/* Largest count that fits in @retrans */
#define TCP_MAX_RETRANS			((1U << TCP_RETRANS_BITS) - 1)

#define TCP_WS_BITS			4	/* RFC 7323 */
#define TCP_WS_MAX			14
	unsigned int	ws_from_tap	:TCP_WS_BITS;
	unsigned int	ws_to_tap	:TCP_WS_BITS;


	/* Signed bit-field: descriptors need to fit in SOCKET_REF_BITS */
	int		sock		:SOCKET_REF_BITS;

	/* TAP_SYN_ACK_SENT and SOCK_FIN_RCVD share BIT(3): the former is only
	 * meaningful before ESTABLISHED is set, the latter only afterwards, and
	 * setting ESTABLISHED clears all other events (see conn_event_do())
	 */
	uint8_t		events;
#define CLOSED			0
#define SOCK_ACCEPTED		BIT(0)	/* implies SYN sent to tap */
#define TAP_SYN_RCVD		BIT(1)	/* implies socket connecting */
#define  TAP_SYN_ACK_SENT	BIT( 3)	/* implies socket connected */
#define ESTABLISHED		BIT(2)
#define  SOCK_FIN_RCVD		BIT( 3)
#define  SOCK_FIN_SENT		BIT( 4)
#define  TAP_FIN_RCVD		BIT( 5)
#define  TAP_FIN_SENT		BIT( 6)
#define  TAP_FIN_ACKED		BIT( 7)

#define	CONN_STATE_BITS		/* Setting these clears other flags */	\
	(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)


	/* -1 if no timer was created yet, see tcp_timer_ctl() */
	int		timer		:SOCKET_REF_BITS;

	/* Bit positions match the order of names in tcp_flag_str */
	uint8_t		flags;
#define STALLED			BIT(0)
#define LOCAL			BIT(1)
#define WND_CLAMPED		BIT(2)
#define IN_EPOLL		BIT(3)
#define ACTIVE_CLOSE		BIT(4)
#define ACK_TO_TAP_DUE		BIT(5)
#define ACK_FROM_TAP_DUE	BIT(6)


	unsigned int	hash_bucket	:TCP_HASH_BUCKET_BITS;

	/* A 16-bit MSS is stored in TCP_MSS_BITS bits by dropping the lowest
	 * bits: MSS_GET() returns the value set, rounded down accordingly
	 */
#define TCP_MSS_BITS			14
	unsigned int	tap_mss		:TCP_MSS_BITS;
#define MSS_SET(conn, mss)	(conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
#define MSS_GET(conn)		(conn->tap_mss << (16 - TCP_MSS_BITS))


	/* A 32-bit size is stored in SNDBUF_BITS bits by dropping the lowest
	 * bits: SNDBUF_GET() returns the value set, rounded down accordingly
	 */
#define SNDBUF_BITS		24
	unsigned int	sndbuf		:SNDBUF_BITS;
#define SNDBUF_SET(conn, bytes)	(conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
#define SNDBUF_GET(conn)	(conn->sndbuf << (32 - SNDBUF_BITS))

	uint8_t		seq_dup_ack_approx;


	/* Remote address: IPv4 addresses are kept in IPv4-mapped form, so that
	 * CONN_V4() can tell the address family from the address itself
	 */
	union {
		struct in6_addr a6;
		struct {
			uint8_t zero[10];
			uint8_t one[2];
			struct in_addr a;
		} a4;
	} a;
#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->a.a6)
#define CONN_V6(conn)		(!CONN_V4(conn))

	in_port_t	tap_port;
	in_port_t	sock_port;

	uint16_t	wnd_from_tap;
	uint16_t	wnd_to_tap;

	uint32_t	seq_to_tap;
	uint32_t	seq_ack_from_tap;
	uint32_t	seq_from_tap;
	uint32_t	seq_ack_to_tap;
	uint32_t	seq_init_from_tap;
};

/* CONN_IS_CLOSING() - ESTABLISHED connection that saw a FIN from either side */
#define CONN_IS_CLOSING(conn)						\
	((conn->events & ESTABLISHED) &&				\
	 (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
/* CONN_HAS() - True only if all the events in @set are set for connection */
#define CONN_HAS(conn, set)	((conn->events & (set)) == (set))

/* CONN() - Connection pointer from index in tc, no bounds checking */
#define CONN(index)		(tc + (index))

/* We probably don't want to use gcc statement expressions (for portability), so
 * use this only after well-defined sequence points (no pre-/post-increments).
 *
 * That is: the index argument is evaluated up to three times. The result is
 * NULL for indices outside of [0, TCP_MAX_CONNS).
 */
#define CONN_OR_NULL(index)						\
	(((int)(index) >= 0 && (index) < TCP_MAX_CONNS) ? (tc + (index)) : NULL)

/* Event names for logs, indexed the way conn_event_do() computes the index:
 * bit position of the event as returned by fls(), plus one if ESTABLISHED is
 * set together with other events. This gives SOCK_FIN_RCVD, which shares its
 * bit with TAP_SYN_ACK_SENT, an entry of its own, and shifts all the following
 * names by one.
 */
static const char *tcp_event_str[] __attribute((__unused__)) = {
	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",

	"SOCK_FIN_RCVD", "SOCK_FIN_SENT", "TAP_FIN_RCVD", "TAP_FIN_SENT",
	"TAP_FIN_ACKED",
};

/* Approximate TCP state names for logs. conn_event_do() indexes this table by
 * the position of the highest event bit set (adjusted as for tcp_event_str),
 * and adds 5 if the ACTIVE_CLOSE flag is set, selecting the "active close"
 * names instead of the "passive close" ones for events following ESTABLISHED.
 */
static const char *tcp_state_str[] __attribute((__unused__)) = {
	"SYN_RCVD", "SYN_SENT", "ESTABLISHED",
	"SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */

	/* Passive close: */
	"CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
	/* Active close (+5): */
	"CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
};

/* Flag names for logs, indexed by bit position of the flag in the flags member
 * of struct tcp_conn: keep the order in sync with the flag definitions there
 */
static const char *tcp_flag_str[] __attribute((__unused__)) = {
	"STALLED", "LOCAL", "WND_CLAMPED", "IN_EPOLL", "ACTIVE_CLOSE",
	"ACK_TO_TAP_DUE", "ACK_FROM_TAP_DUE",
};

/* Port re-mappings as delta, indexed by original destination port.
 *
 * Ports are 16-bit values, so valid indices span 0 to USHRT_MAX inclusive: the
 * tables need USHRT_MAX + 1 entries, otherwise port 65535 refers to one element
 * past the end of the array.
 */
static in_port_t		tcp_port_delta_to_tap	[USHRT_MAX + 1];
static in_port_t		tcp_port_delta_to_init	[USHRT_MAX + 1];

/* Listening sockets, used for automatic port forwarding in pasta mode only.
 *
 * First dimension is presumably the port number (users are not visible in this
 * part of the file -- TODO confirm): as ports span 0 to USHRT_MAX inclusive,
 * USHRT_MAX + 1 entries are needed to have a valid slot for port 65535.
 */
static int tcp_sock_init_lo	[USHRT_MAX + 1][IP_VERSIONS];
static int tcp_sock_init_ext	[USHRT_MAX + 1][IP_VERSIONS];
static int tcp_sock_ns		[USHRT_MAX + 1][IP_VERSIONS];

/* Table of destinations with very low RTT (assumed to be local), LRU.
 *
 * Filled by tcp_rtt_dst_check(), looked up by tcp_rtt_dst_low(). Unspecified
 * (all-zero) addresses mark free entries.
 */
static struct in6_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];

/* Static buffers */

/**
 * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
 * @psum:	Partial IP header checksum (excluding tot_len and saddr)
 * @tsum:	Partial TCP header checksum (excluding length and saddr)
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @vnet_len:	4-byte qemu vnet buffer length descriptor, only for passt mode
 * @eh:		Pre-filled Ethernet header
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * Members before @data need to match struct tcp4_l2_head, used to size @data.
 * Numbers in comments next to members are byte offsets, for AVX2 builds (left)
 * and other builds (right).
 */
static struct tcp4_l2_buf_t {
	uint32_t psum;		/* 0 */
	uint32_t tsum;		/* 4 */
#ifdef __AVX2__
	uint8_t pad[18];	/* 8, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	8 */
#endif
	uint32_t vnet_len;	/* 26				10 */
	struct ethhdr eh;	/* 30				14 */
	struct iphdr iph;	/* 44				28 */
	struct tcphdr th;	/* 64				48 */
	uint8_t data[MSS4];	/* 84				68 */
				/* 65541			65525 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];

/* Usage counters for tcp4_l2_buf: presumably number of buffers queued and
 * total bytes queued -- updated outside this part of the file, TODO confirm
 */
static unsigned int tcp4_l2_buf_used;
static size_t tcp4_l2_buf_bytes;

/**
 * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @vnet_len:	4-byte qemu vnet buffer length descriptor, only for passt mode
 * @eh:		Pre-filled Ethernet header
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * Members before @data need to match struct tcp6_l2_head, used to size @data.
 * Numbers in comments next to members are byte offsets, for AVX2 builds (left)
 * and other builds (right).
 *
 * NOTE(review): unlike tcp4_l2_buf, tcp4_l2_flags_buf and tcp6_l2_flags_buf,
 * this array is not declared static, so it has external linkage. This looks
 * unintentional, but confirm that no other file refers to it before changing.
 */
struct tcp6_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
#endif
	uint32_t vnet_len;	/* 14				2 */
	struct ethhdr eh;	/* 18				6 */
	struct ipv6hdr ip6h;	/* 32				20 */
	struct tcphdr th;	/* 72				60 */
	uint8_t data[MSS6];	/* 92				80 */
				/* 65639			65627 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];

/* Usage counters for tcp6_l2_buf: presumably number of buffers queued and
 * total bytes queued -- updated outside this part of the file, TODO confirm
 */
static unsigned int tcp6_l2_buf_used;
static size_t tcp6_l2_buf_bytes;

/* recvmsg()/sendmsg() data for tap */

/* Scratch area sized to the largest supported window: as described in the
 * theory of operation, this receives data already sent to the tap, peeked
 * again from the socket
 */
static char 		tcp_buf_discard		[MAX_WINDOW];
/* One entry more than the number of frame buffers (presumably the extra entry
 * refers to tcp_buf_discard -- TODO confirm against users)
 */
static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

/* One iovec per frame buffer, for data and flags buffers, IPv4 and IPv6 */
static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];

static struct mmsghdr	tcp_l2_mh		[TCP_FRAMES_MEM];

/* sendmsg() to socket */
static struct iovec	tcp_iov			[UIO_MAXIOV];

/**
 * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
 * @psum:	Partial IP header checksum (excluding tot_len and saddr)
 * @tsum:	Partial TCP header checksum (excluding length and saddr)
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @vnet_len:	4-byte qemu vnet buffer length descriptor, only for passt mode
 * @eh:		Pre-filled Ethernet header
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @opts:	Headroom for TCP options
 *
 * Same layout as struct tcp4_l2_buf_t, with @opts in place of payload storage.
 */
static struct tcp4_l2_flags_buf_t {
	uint32_t psum;		/* 0 */
	uint32_t tsum;		/* 4 */
#ifdef __AVX2__
	uint8_t pad[18];	/* 8, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	8 */
#endif
	uint32_t vnet_len;	/* 26				10 */
	struct ethhdr eh;	/* 30				14 */
	struct iphdr iph;	/* 44				28 */
	struct tcphdr th;	/* 64				48 */
	/* Room for one MSS option, one window scaling option, and one more
	 * byte (presumably padding to a multiple of four -- TODO confirm)
	 */
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];

/* Usage counters for tcp4_l2_flags_buf: presumably number of buffers queued
 * and total bytes queued -- updated outside this part of the file, TODO confirm
 */
static unsigned int tcp4_l2_flags_buf_used;
static size_t tcp4_l2_flags_buf_bytes;

/**
 * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @vnet_len:	4-byte qemu vnet buffer length descriptor, only for passt mode
 * @eh:		Pre-filled Ethernet header
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @opts:	Headroom for TCP options
 *
 * Same layout as struct tcp6_l2_buf_t, with @opts in place of payload storage,
 * and an explicit alignment attribute on @th.
 */
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
#endif
	uint32_t vnet_len;	/* 14					   2 */
	struct ethhdr eh;	/* 18					   6 */
	struct ipv6hdr ip6h;	/* 32					  20 */
	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
	/* Room for one MSS option, one window scaling option, and one more
	 * byte (presumably padding to a multiple of four -- TODO confirm)
	 */
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];

/* Usage counters for tcp6_l2_flags_buf: presumably number of buffers queued
 * and total bytes queued -- updated outside this part of the file, TODO confirm
 */
static unsigned int tcp6_l2_flags_buf_used;
static size_t tcp6_l2_flags_buf_bytes;

/* TCP connections: statically allocated table, entries are referred to by
 * their index (see CONN(), and "conn - tc" wherever an index is needed)
 */
static struct tcp_conn tc[TCP_MAX_CONNS];

/* Table for lookup from remote address, local port, remote port */
static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE];

/* Pools for pre-opened sockets. These are not static: presumably they are
 * shared with other files (e.g. tcp_splice.c) -- TODO confirm
 */
int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];

/**
 * tcp_conn_epoll_events() - Derive epoll event mask from connection state
 * @events:	Connection events currently set
 * @conn_flags:	Connection flags currently set
 *
 * Return: mask of epoll events to be monitored on the socket, zero if the
 *	   connection is closed
 */
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{
	/* Default: handshake states other than a bare TAP_SYN_RCVD */
	uint32_t mask = EPOLLRDHUP;

	if (events == 0) {
		/* No events set means closed connection: nothing to watch */
		mask = 0;
	} else if (events & ESTABLISHED) {
		if (events & TAP_FIN_SENT)
			mask = EPOLLET;
		else if (conn_flags & STALLED)
			mask = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
		else
			mask = EPOLLIN | EPOLLRDHUP;
	} else if (events == TAP_SYN_RCVD) {
		/* Socket is connecting: wait for it to become writable */
		mask = EPOLLOUT | EPOLLET | EPOLLRDHUP;
	}

	return mask;
}

/* Forward declaration: conn_flag_do() is defined after tcp_epoll_ctl() and
 * tcp_timer_ctl(), which it calls
 */
static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn,
			 unsigned long flag);
/* conn_flag() - Wrapper for conn_flag_do(), tracing function and line of the
 * caller before the flag is changed
 */
#define conn_flag(c, conn, flag)					\
	do {								\
		trace("TCP: flag at %s:%i", __func__, __LINE__);	\
		conn_flag_do(c, conn, flag);				\
	} while (0)

/**
 * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: 0 on success, negative error code on failure (not on deletion)
 */
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn)
{
	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
	union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
				.r.p.tcp.tcp.index = conn - tc,
				.r.p.tcp.tcp.v6 = CONN_V6(conn) };
	struct epoll_event ev = { .data.u64 = ref.u64 };

	if (conn->events == CLOSED) {
		if (conn->flags & IN_EPOLL)
			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
		if (conn->timer != -1)
			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
		return 0;
	}

	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);

	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
		return -errno;

	conn->flags |= IN_EPOLL;	/* No need to log this */

	if (conn->timer != -1) {
		union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
					  .r.s = conn->sock,
					  .r.p.tcp.tcp.timer = 1,
					  .r.p.tcp.tcp.index = conn - tc };
		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
					    .events = EPOLLIN | EPOLLET };

		if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
			return -errno;
	}

	return 0;
}

/**
 * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * #syscalls timerfd_create timerfd_settime
 */
static void tcp_timer_ctl(const struct ctx *c, struct tcp_conn *conn)
{
	struct itimerspec it = { { 0 }, { 0 } };

	if (conn->events == CLOSED)
		return;

	if (conn->timer == -1) {
		union epoll_ref ref = { .r.proto = IPPROTO_TCP,
					.r.s = conn->sock,
					.r.p.tcp.tcp.timer = 1,
					.r.p.tcp.tcp.index = conn - tc };
		struct epoll_event ev = { .data.u64 = ref.u64,
					  .events = EPOLLIN | EPOLLET };
		int fd;

		fd = timerfd_create(CLOCK_MONOTONIC, 0);
		if (fd == -1 || fd > SOCKET_MAX) {
			debug("TCP: failed to get timer: %s", strerror(errno));
			return;
		}
		conn->timer = fd;

		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
			debug("TCP: failed to add timer: %s", strerror(errno));
			close(conn->timer);
			conn->timer = -1;
			return;
		}
	}

	if (conn->flags & ACK_TO_TAP_DUE) {
		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
	} else if (conn->flags & ACK_FROM_TAP_DUE) {
		if (!(conn->events & ESTABLISHED))
			it.it_value.tv_sec = SYN_TIMEOUT;
		else
			it.it_value.tv_sec = ACK_TIMEOUT;
	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
		it.it_value.tv_sec = FIN_TIMEOUT;
	} else {
		it.it_value.tv_sec = ACT_TIMEOUT;
	}

	debug("TCP: index %li, timer expires in %lu.%03lus", conn - tc,
	      it.it_value.tv_sec, it.it_value.tv_nsec / 1000 / 1000);

	timerfd_settime(conn->timer, 0, &it, NULL);
}

/**
 * conn_flag_do() - Set/unset given flag, log, update epoll on STALLED flag
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 *
 * Exactly one flag is expected at a time: a value with a single bit set means
 * "set this flag", any value with more than one bit set is taken as the
 * complement of the flag to be cleared. Nothing happens, and nothing is logged,
 * if the flag is already in the requested state.
 *
 * Usually called via conn_flag(), which also traces the call site.
 */
static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn,
			 unsigned long flag)
{
	/* More than one bit set: this is ~flag, that is, a request to unset */
	if (flag & (flag - 1)) {
		/* Flag to be dropped is not set: nothing to do */
		if (!(conn->flags & ~flag))
			return;

		conn->flags &= flag;
		/* fls() is defined elsewhere: presumably it returns the index
		 * of the highest bit set, negative if none -- TODO confirm
		 */
		if (fls(~flag) >= 0) {
			debug("TCP: index %li: %s dropped", conn - tc,
			      tcp_flag_str[fls(~flag)]);
		}
	} else {
		/* Flag is already set: nothing to do */
		if (conn->flags & flag)
			return;

		conn->flags |= flag;
		if (fls(flag) >= 0) {
			debug("TCP: index %li: %s", conn - tc,
			      tcp_flag_str[fls(flag)]);
		}
	}

	/* The epoll mask depends on STALLED, see tcp_conn_epoll_events() */
	if (flag == STALLED || flag == ~STALLED)
		tcp_epoll_ctl(c, conn);

	/* Reprogram the timer if one of the flags selecting its expiration was
	 * set, or if one of them was dropped while the other one still applies
	 */
	if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE		  ||
	    (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
	    (flag == ~ACK_TO_TAP_DUE   && (conn->flags & ACK_FROM_TAP_DUE)))
		tcp_timer_ctl(c, conn);
}

/**
 * conn_event_do() - Set and log connection events, update epoll state
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event
 *
 * CLOSED and the events in CONN_STATE_BITS replace all the events currently
 * set, any other event is added to them. Nothing happens if the event is
 * already set.
 *
 * Usually called via conn_event(), which also traces the call site.
 */
static void conn_event_do(const struct ctx *c, struct tcp_conn *conn,
			  unsigned long event)
{
	/* num indexes tcp_event_str, prev and new index tcp_state_str. fls()
	 * is defined elsewhere: -1 is handled below as "no bit set", that is,
	 * CLOSED.
	 */
	int prev, new, num = fls(event);

	/* Event already set. This is never true for CLOSED, which is zero */
	if (conn->events & event)
		return;

	prev = fls(conn->events);
	if (conn->flags & ACTIVE_CLOSE)
		prev += 5;

	if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED))
		prev++;		/* i.e. SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */

	if (event == CLOSED || (event & CONN_STATE_BITS))
		conn->events = event;
	else
		conn->events |= event;

	new = fls(conn->events);

	/* Same adjustment as above, now that the event is set: entries
	 * following ESTABLISHED are shifted by one in both string tables
	 */
	if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) {
		num++;
		new++;
	}
	if (conn->flags & ACTIVE_CLOSE)
		new += 5;

	/* Report the state transition only if the approximate state changed */
	if (prev != new) {
		debug("TCP: index %li, %s: %s -> %s", conn - tc,
		      num == -1 	       ? "CLOSED" : tcp_event_str[num],
		      prev == -1	       ? "CLOSED" : tcp_state_str[prev],
		      (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]);
	} else {
		debug("TCP: index %li, %s", conn - tc,
		      num == -1 	       ? "CLOSED" : tcp_event_str[num]);
	}

	/* FIN from tap before any FIN from the socket: the tap side is the one
	 * closing actively. In this case tcp_epoll_ctl() is not called here.
	 */
	if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
		conn_flag(c, conn, ACTIVE_CLOSE);
	else
		tcp_epoll_ctl(c, conn);

	/* FIN_TIMEOUT applies from now on, see tcp_timer_ctl() */
	if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
		tcp_timer_ctl(c, conn);
}

/* conn_event() - Wrapper for conn_event_do(), tracing function and line of the
 * caller before the event is set
 */
#define conn_event(c, conn, event)					\
	do {								\
		trace("TCP: event at %s:%i", __func__, __LINE__);	\
		conn_event_do(c, conn, event);				\
	} while (0)

/**
 * tcp_remap_to_tap() - Set delta for port translation toward guest/tap
 * @port:	Original destination port, host order
 * @delta:	Delta to be added to original destination port
 *
 * The delta is stored in tcp_port_delta_to_tap, using @port as index.
 */
void tcp_remap_to_tap(in_port_t port, in_port_t delta)
{
	tcp_port_delta_to_tap[port] = delta;
}

/**
 * tcp_remap_to_init() - Set delta for port translation toward init namespace
 * @port:	Original destination port, host order
 * @delta:	Delta to be added to original destination port
 *
 * The delta is stored in tcp_port_delta_to_init, using @port as index.
 */
void tcp_remap_to_init(in_port_t port, in_port_t delta)
{
	tcp_port_delta_to_init[port] = delta;
}

/**
 * tcp_rtt_dst_low() - Look up connection endpoint in table of low RTT peers
 * @conn:	Connection pointer
 *
 * Return: 1 if the endpoint address matches a table entry, 0 if none does
 */
static int tcp_rtt_dst_low(const struct tcp_conn *conn)
{
	int slot = 0;

	/* Linear scan, stop at the first matching address */
	while (slot < LOW_RTT_TABLE_SIZE) {
		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, &low_rtt_dst[slot]))
			return 1;

		slot++;
	}

	return 0;
}

/**
 * tcp_rtt_dst_check() - Record endpoint in low RTT table if tcpi_min_rtt is low
 * @conn:	Connection pointer
 * @tinfo:	Pointer to struct tcp_info for socket
 *
 * Nothing is done if tcpi_min_rtt is zero or above LOW_RTT_THRESHOLD, if the
 * endpoint is already in the table, or if HAS_MIN_RTT is not defined.
 */
static void tcp_rtt_dst_check(const struct tcp_conn *conn,
			      const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
	int slot = -1, next, i;

	if (!tinfo->tcpi_min_rtt)
		return;

	if ((int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
		return;

	/* One pass: bail out on a match, remember the first unspecified entry */
	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, &low_rtt_dst[i]))
			return;

		if (slot < 0 && IN6_IS_ADDR_UNSPECIFIED(&low_rtt_dst[i]))
			slot = i;
	}

	/* Not expected to happen, as every insertion below clears the entry
	 * that follows, so the table always has a free one: this check just
	 * keeps gcc 12 happy.
	 */
	if (slot < 0)
		return;

	/* Entry to be cleared is the one after the insertion, wrapping around */
	next = slot + 1;
	if (next == LOW_RTT_TABLE_SIZE)
		next = 0;

	memcpy(&low_rtt_dst[slot], &conn->a.a6, sizeof(conn->a.a6));
	memcpy(&low_rtt_dst[next], &in6addr_any, sizeof(conn->a.a6));
#else
	(void)tinfo;
	(void)conn;
#endif /* HAS_MIN_RTT */
}

/**
 * tcp_get_sndbuf() - Fetch SO_SNDBUF, store a scaled value for the connection
 * @conn:	Connection pointer
 *
 * Values up to SNDBUF_SMALL are stored as they are, values from SNDBUF_BIG are
 * halved, and the reduction grows linearly from none to one half in between.
 * If the socket option can't be read, WINDOW_DEFAULT is stored instead.
 */
static void tcp_get_sndbuf(struct tcp_conn *conn)
{
	socklen_t optlen;
	uint64_t scaled;
	int reported;

	optlen = sizeof(reported);
	if (getsockopt(conn->sock, SOL_SOCKET, SO_SNDBUF, &reported, &optlen)) {
		SNDBUF_SET(conn, WINDOW_DEFAULT);
		return;
	}

	scaled = reported;
	if (scaled >= SNDBUF_BIG) {
		scaled /= 2;
	} else if (scaled > SNDBUF_SMALL) {
		scaled -= scaled * (scaled - SNDBUF_SMALL) /
			  (SNDBUF_BIG - SNDBUF_SMALL) / 2;
	}

	SNDBUF_SET(conn, MIN(INT_MAX, scaled));
}

/**
 * tcp_sock_set_bufsize() - Ask for the largest SO_RCVBUF and SO_SNDBUF values
 * @c:		Execution context, low_rmem and low_wmem disable the requests
 * @s:		Socket, can be -1 to avoid check in the caller
 *
 * Failures are only traced, not reported to the caller.
 */
void tcp_sock_set_bufsize(const struct ctx *c, int s)
{
	/* Kernel clamps and rounds, no need to check */
	int size = INT_MAX / 2;

	if (s == -1)
		return;

	if (!c->low_rmem) {
		if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)))
			trace("TCP: failed to set SO_RCVBUF to %i", size);
	}

	if (!c->low_wmem) {
		if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)))
			trace("TCP: failed to set SO_SNDBUF to %i", size);
	}
}

/**
 * tcp_update_check_ip4() - Complete IPv4 header checksum from stored partial sum
 * @buf:	L2 packet buffer with final IPv4 header
 *
 * The stored partial sum is extended with total length and source address of
 * the header, then folded and complemented into the header checksum field.
 */
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
{
	uint32_t src = buf->iph.saddr;
	uint32_t acc = buf->psum;

	acc += buf->iph.tot_len;

	/* Source address, as two 16-bit words */
	acc += (src >> 16) & 0xffff;
	acc += src & 0xffff;

	buf->iph.check = (uint16_t)~csum_fold(acc);
}

/**
 * tcp_update_check_tcp4() - Complete TCP checksum from stored partial sum
 * @buf:	L2 packet buffer with final IPv4 header
 *
 * The stored partial sum is extended with source address and TCP length, then
 * used as initial value for the checksum over TCP header and payload.
 */
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
{
	/* TCP length: IPv4 total length minus 20 bytes of IPv4 header */
	uint16_t l4len = ntohs(buf->iph.tot_len) - 20;
	uint32_t src = buf->iph.saddr;
	uint32_t acc = buf->tsum;

	/* Source address, as two 16-bit words */
	acc += (src >> 16) & 0xffff;
	acc += src & 0xffff;

	/* Length in network order */
	acc += htons(l4len);

	buf->th.check = 0;
	buf->th.check = csum(&buf->th, l4len, acc);
}

/**
 * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
 * @buf:	L2 packet buffer with final IPv6 header
 *
 * Three fields of the IPv6 header are temporarily rewritten in place, header
 * and TCP segment are checksummed with a single csum() call, then the fields
 * are set back to fixed values (not to the values found on entry).
 */
static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
{
	/* Length passed to csum(): payload length plus size of IPv6 header */
	int len = ntohs(buf->ip6h.payload_len) + sizeof(struct ipv6hdr);

	/* NOTE(review): presumably this makes the header add up like the IPv6
	 * pseudo-header, with the protocol number as only contribution from
	 * these fields, and relies on the other header fields matching what
	 * the pseudo-header would contain -- confirm
	 */
	buf->ip6h.hop_limit = IPPROTO_TCP;
	buf->ip6h.version = 0;
	buf->ip6h.nexthdr = 0;

	/* Clear the checksum field before computing the sum.
	 * NOTE(review): assumes the TCP header and payload directly follow
	 * ip6h in the buffer, as csum() starts from the IPv6 header -- confirm
	 */
	buf->th.check = 0;
	buf->th.check = csum(&buf->ip6h, len, 0);

	/* Back to a regular header: hop limit 255, version 6, TCP as next
	 * header
	 */
	buf->ip6h.hop_limit = 255;
	buf->ip6h.version = 6;
	buf->ip6h.nexthdr = IPPROTO_TCP;
}

/**
 * tcp_update_l2_buf() - Refresh Ethernet and IPv4 addresses in L2 buffers
 * @eth_d:	Ethernet destination address, NULL if unchanged
 * @eth_s:	Ethernet source address, NULL if unchanged
 * @ip_da:	Pointer to IPv4 destination address, NULL if unchanged
 *
 * Ethernet addresses are copied to IPv4 and IPv6 buffers, for data and flags
 * alike. With a new IPv4 destination address, partial checksums are computed
 * once, on the first IPv4 data buffer, and copied to all the other IPv4
 * buffers.
 */
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
		       const uint32_t *ip_da)
{
	int n;

	for (n = 0; n < TCP_FRAMES_MEM; n++) {
		struct tcp4_l2_flags_buf_t *flags4 = tcp4_l2_flags_buf + n;
		struct tcp6_l2_flags_buf_t *flags6 = tcp6_l2_flags_buf + n;
		struct tcp4_l2_buf_t *data4 = tcp4_l2_buf + n;
		struct tcp6_l2_buf_t *data6 = tcp6_l2_buf + n;

		if (eth_d) {
			memcpy(data4->eh.h_dest, eth_d, ETH_ALEN);
			memcpy(data6->eh.h_dest, eth_d, ETH_ALEN);

			memcpy(flags4->eh.h_dest, eth_d, ETH_ALEN);
			memcpy(flags6->eh.h_dest, eth_d, ETH_ALEN);
		}

		if (eth_s) {
			memcpy(data4->eh.h_source, eth_s, ETH_ALEN);
			memcpy(data6->eh.h_source, eth_s, ETH_ALEN);

			memcpy(flags4->eh.h_source, eth_s, ETH_ALEN);
			memcpy(flags6->eh.h_source, eth_s, ETH_ALEN);
		}

		if (!ip_da)
			continue;

		data4->iph.daddr = *ip_da;
		flags4->iph.daddr = data4->iph.daddr;

		if (n) {
			/* Sums were already computed on the first buffer */
			data4->psum = tcp4_l2_buf[0].psum;
			flags4->psum = data4->psum;

			data4->tsum = tcp4_l2_buf[0].tsum;
			flags4->tsum = data4->tsum;

			continue;
		}

		/* First buffer: clear the fields filled in per frame, then sum
		 * the 20 bytes of IPv4 header that are left
		 */
		data4->iph.saddr = 0;
		flags4->iph.saddr = data4->iph.saddr;

		data4->iph.tot_len = 0;
		flags4->iph.tot_len = data4->iph.tot_len;

		data4->iph.check = 0;
		flags4->iph.check = data4->iph.check;

		data4->psum = sum_16b(&data4->iph, 20);
		flags4->psum = data4->psum;

		/* Destination address as two 16-bit words, plus protocol */
		data4->tsum = ((*ip_da >> 16) & 0xffff) +
			      (*ip_da & 0xffff) +
			      htons(IPPROTO_TCP);
		flags4->tsum = data4->tsum;
	}
}

/**
 * tcp_sock4_iov_init() - Set up IPv4 L2 buffers and their scatter-gather lists
 *
 * Data and flags buffers are reset to their initial contents, then each iovec
 * entry is pointed at the vnet_len field of the matching buffer. Lengths are
 * set to MSS_DEFAULT for data entries only, lengths of flags entries are not
 * touched here.
 */
static void tcp_sock4_iov_init(void)
{
	size_t n;

	for (n = 0; n < ARRAY_SIZE(tcp4_l2_buf); n++) {
		tcp4_l2_buf[n] = (struct tcp4_l2_buf_t) {
			0, 0,
			{ 0 },
			0,
			L2_BUF_ETH_IP4_INIT,
			L2_BUF_IP4_INIT(IPPROTO_TCP),
			{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 },
			{ 0 },
		};
	}

	for (n = 0; n < ARRAY_SIZE(tcp4_l2_flags_buf); n++) {
		tcp4_l2_flags_buf[n] = (struct tcp4_l2_flags_buf_t) {
			0, 0,
			{ 0 },
			0,
			L2_BUF_ETH_IP4_INIT,
			L2_BUF_IP4_INIT(IPPROTO_TCP),
			{ 0 },
			{ 0 },
		};
	}

	for (n = 0; n < TCP_FRAMES_MEM; n++) {
		tcp4_l2_iov[n].iov_base = &tcp4_l2_buf[n].vnet_len;
		tcp4_l2_iov[n].iov_len = MSS_DEFAULT;
	}

	for (n = 0; n < TCP_FRAMES_MEM; n++)
		tcp4_l2_flags_iov[n].iov_base = &tcp4_l2_flags_buf[n].vnet_len;
}

/**
 * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
 */
static void tcp_sock6_iov_init(void)
{
	struct iovec *iov;
	int i;

	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
			{ 0 },
			0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_TCP),
			{ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }, { 0 },
		};
	}

	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
			{ 0 },
			0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_TCP),
			{ 0 }, { 0 },
		};
	}

	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) {
		iov->iov_base = &tcp6_l2_buf[i].vnet_len;
		iov->iov_len = MSS_DEFAULT;
	}