aboutgitcodebugslistschat
path: root/tcp.c
blob: 056e917490293256424000a9a32a04c04d60778e (plain) (tree)
1
2
3
4
5
6
7
8
9
10


                                             



                                         


                                              
                                       
                                              





                           

             






                                                                                
                                                                              

























                                                                                
                                                                    




                                                                                







                                                                                
                                                               


                                                                           
                                                                              
                                                                             
                                                                           

                                                                              





                                                                           
                                                                               
                                                                               

                                                                              



                                                                               
  







                                                                              




                                                                      

                                                                               
                 















                                                                                

                                                                               


                                                     
                    

                    



















                                                                               
  
                                                   
  
                                             
  





                                                                               











                                                                                
  


                    
                                                                           
  


                                                                             

                                                                              


                                                                               
  


                                                                               
  



                                                                              
  





                                                                             











                                                                           
  

                                                                          

                              

                               
                                


                                                               
                                                                                



                                                              





                                                                               
  


                                




                                                                              


                                                                              
                              










                                                                                
                                                      

   

                  
                  
                   
                   
                   




                         
                       
                   
                    


                      
                    
                       
      
                       
                        
                      
                    

                 

                                                
                     
                 

                  
                    
                 
                 
                       
                
                  
 

                     
                                           

                                                                         
 


                                                                            
                                                               
                                                                         
                                                             
 
                                         
                                                              

                                 
                                           








                                                                             
                            













                                                                             
                            









                                                                      
 
                                                                      




                                                               
 
                                                                



                                                               
 
                                         
                                                   
 




                                                                     




                                                                    



                                
                              
                                
                                                                 



                         
                         
                         
                         



                         
                                                           
                                                



                                                                         
 




                                                                           

  










                                                                         

                                                                            

  
                                                                              

                                                         
 
                                                                        
                                                        
 




                                                                        

                                                                                
                                                        



                                                                   
                             



                                                             

                                                                    
      
                                                                     

                                                                     
                                                                     
                                                                        




                                                              
                            
 
                                     


                                                                     
                                                                              
                                                        



                                                                           
                      




                                                                    
                                                                    

                                                                     
                                                                     
                                                                        




                                                              
                            
 
                                     
 
                                      
                                                             
                                                                     
 



                                                                 
 
                                                                 
 
                         
                                                             
 




                                                                                
                                                        



                                                                   
                                   



                                                             

                                                                    
      
                                                                     

                                                                     





                                                              
                                  
 
                                           



                                                                              
                                                        



                                                                           
                                   




                                                                               
                                                                               
                                                                               
                                                                               





                                                              
                                  
 
                                           
 
                     
                                 
 

                                                               





                                                                    
                                                         


                                                    
                                          


                           
                                                                   
                                                         
 
                                            

                                                     
















                                                                         
                                         
                                                                         









                                                       
                                                                        
                                             












                                                                         
                                                                        
 
                                                                 
                                                                          
                                                                      

                                                        
                                     
                                     


                                                                               







                                                                     
                                
 



                                                                 
                                                                                






                                                                             



                 





                                                                                
                                                                         


                                                


                                   



                                                               
                                                                              

                                                                        
                       
 

                                                        
                                                                               


                                          

                               
                                 








                                                                               
                                           



                                                                       







                                                                   
                                                                            






                                                                           



                                              
                                                                        


                                            

                                            



                                           
                                      
                                                                           
                                                        
                 
                
                                           
 









                                                                           
                               
                 

                                    
                                      
                                                                   
                                                        
                 

         
                                                
                                       
 


                                                                            
                                       







                                                                      
                                                                         







                                              
                                       









                                                                              





                                                                            
                                       


                          
                                                                     



                                                                                
                                                           

                                                                                





                                                                       
                                                          
                                       








                                                                         

                                                                        
  

                                                            
                                                           



                                                
                                                               







                                                                            
                                                     
   
                                                              
                                                           
 
                  

                         

                                                         


                                                  
                                                               




                                                                           





                                                                             
                                         

                                       
                                                                  



                        


   


                                                                              
                                                     
 
                                   
                     
                   
 

                                                                 
                                                 


                       
                   




                                                                              
                                          


   


                                                                         
                                                     





                                                                             

                                                                                
 

                                                                                


   
























































                                                                           
                                                                              
                                                   


              
                                              

                                                                        


                                                           



                                                         

                            
                                                                       
                                 



                                                                             
 


                                                                             
                                                     
                                

                                                                           






                                                                               
                                 
   
                                                   



                          
                                                       
                                                         
                                                       

                                                                             



                                                             
                                                                     
                                                       
                                                           


                  
                                                                      
                                                                      
 
                                                                            
                                                                            



                                                                               
                                 
   
                                                   



                          

                                                         
                                                         

                                                                             




                                                                     
                                                         
                                                            


                  
                                                                      
                                                                      
 
                                                                            
                                                                            


   
                                                                

                                                                           
                                       

                                                                
  
                                                                      
   

                                                                       
 
                             
 
                          
                          
 

                                                         


                                  
                                   

                              
                                         



                                                                           
                                               

                                 
                                              

                                      


                                                     
                                                  




                                         
                                             
                               
                                                                
                                
                                                                
                         






                  
                                                                           
                                                 
                              




                                  
                                                          
                                                       
                                                                  
 
                                              






                                                                       
                                                                           
                                 
                              




                                                            
                                                                               
                                                                     
 






                                            
                       
 
                                                            




                                                       








                                                                    
                                                                         


   
                                                                    
                                 
                                  
   
                                                                           


              
                                                                      
                                                                  
                          
 
                                                                              
                                                                                  


   
                                                                    
                                 
                                  
   

                                                            
 
                                                 
                                       
 
                                       
                                                                    
                                    
                                 
                                                                    
                            
                                                                           



                              
                                                                                

                                                                 


   



                                                                            
   

                                                                        
 
                                                 
                                      
 
                                       
                                                                    

                                   
                                                                 
                            
                                                 
                              

                 
 
                                                                               
                                             
                                                                    

                              


   
                                                                        
                                 
                                                   
                                                              


                                  
                                                          
   



                                                                     
 
                              
                                  
              
 

                                                    
                                                                             
                                                                     
                                    

         
                    


   
                                                               
                                 
                                                     
   
                                                           
 
                             
 
                                                    
                                                                          
                                            
                                               


                       
                                      
                                          
 



                                                               
 


                                                                                  

                                       


   
                                                                                
                                 
                                                    
   
                                                                       
 

                                                     
                          

                                   
 
                                 
                                         

 
                                                                 

                                                                         
                                                                         


                                                                         
 
   

                                                                                
   
                                                 
 
                                                                      
                                   
 
                                                                      
                                   
 
 


                                                                    
   
                                                
 
                                                          
                             
 
                                                          
                             


   


                                                       
                                     
 

                                                                    
                             
 

                                  
 

                                                                       

                       


                                                                     
                                                            

                                                       
                                                          
                 
         










                                                                                
                                                        
   
                                                          
                                                                      
                                                           
                                                                          
 
                                                         
                            

                                                                         












                                                                         
 
















                                                                             
                                                        
                
                                                                    



                                                                               
                                              
                                                          
                                                            
                    
                                                         






                                                          


                                                               
 
                                                        
         

                                  
                    





                                                                        
                                                                                
                                                                    


                                                            
                                                                                
                                                                       
 
                                                                       
                                                        
                                                                      

                                      
                                                  

                           
                       
                        




                                                          
                                                                              
                                                                             

                                                                


                                                                         


                                         
                                                                




                                                                  
                             
 
                                         
                                     
                                                                   

                                                                         
                         

         
                     
                                                       
                                 
 
                                   
                                                                   
                                 
 

         
                  
                                                             
                                                     

                                     

                                                              
         
      
 




                                                                             
 

                                                   
 
    
                                                         



                                                       







                                                                                


                                                      
                                                  

                                                                                







                                                             
                                                                
                                 
                                  
                                                                      
  
                                                               
   
                                                                             
 
                                                        
                                                    

                                              

                                      
                           
                          
                          
                          
                   
                
 


                                                               
 
                                                            
                                            
                                   

         




                                                         
                                   
                                                
 
                                                                     
                         
 
                            

                                                                      
                             


                                                                         
                

                                                                      
                             
                                

         
                          
                        
 


                                                                  

                                      

                                   
                                                 
                        
                                                             
                                          


                                                              
 
                                          
                                                                             
                                                          
                                                 
                                                                 
                 
                                                               
 

                                            
 
                                                                     
 


                                     
                                          

                                          
                
                                                        

                                                                    

         
                                              




                                  

                                                                       
 





                                                                     
 

                                                     


                                                                      
                                   








                                                                               
                                                  





                                                          
 
                                                                               
                                                  
         
 



                 
                                                                               


                                  
                                                                
 
                                   

                       
                                         
                                            


   
                                                                             
                                  


                                                                     
                                                     
                                                           
 
                                                               
 




                                        
 





                                                                      
                                                                            
                                          

                                                                       
                           
 

                                   
 


                                        
 
                                                                    







                                                                             
                                                                                       


                                                                          
         

                                                                      


                                                                             
                                        


   

                                                                           
                                                                           
                                 
   

                                                                        
 






                                            


                                           
          
                             
 
                          



                                                             
 
                                                              
 

                                                            
 
                                    


   

                                                                            
  
                                                                     
   
                                  
 
                      
 

                                                  
                           
                                 
         

                  
 






                                                                            
                                                          



                                                                 
 




                             
                  
                              
 
                                   
 

                 
 
   
                                                             
                                  

                                                                     


                            
                                                                 
                                                                 



                         
                                                                       



                                  



                                     




                                   






















































                                                                                


                                                                         
                                                              


                                                                     

                                 


                                                                        
 
                                                                       










                                                      
                                  
                     
                   



                                               


                                                       

                            

                                                                          
                                                                           
                                                           




                                                                        
                                                    
                                                 
                  



                                                                              

         
                                         
                                
                       
                         
                                          
 
                                          
 
                                                   

                                                                       
                           
 






                                                                              
 

                                             
                            
                                               
                                   
                
                                               
                                   

         

                                           
 


                                                         
 
                                   
                                                  
 
                                 
 
                               
                                                                      



                                                      
 




                                                                         

                                           
                                         


                               
                                     
                

                                     
                                                      
                               
 
                                                      

         
                               


   




                                                                               
   
                                                                        
 





                                                                           
 
                                                                   


                                                                    
 
                 
 
 
   
                                                                              

                                  



                                                                             
   
                                                                     
                                                                    
 
                          
 


                                                                         
 
                                                       

                                                                        
                                                                   
                                                 


                                                                         
                                                       

                                                                        
                                                                   
                                                 
         




                                                                              
                                  
  
                                                    

                    
   
                                                                       
 
                                                                      
                                                            
                                                   
                                       
                                      
                                     
                              
                          


                                                                 
                                      
                                           
                                                                    

                                                                


                                 
                                                        

                                                     
                         
         
 
                                                                            
                                                                 

                                       

                            
                                                            
         
 





                                               
                                                                              
                                                                              
                                         
 



                                                             




                                                                                
                                   
         
                    
                                                      
 
                                                                              




                                                     
                         
 











                                                                                       
 

                                     
                                            
                         

         
                                     
 

                                                   
 
                                                  
                                                
 
                                   
                   
                                         
                                                                          



                                        
                                                                          
                                         

         
                                             
 
                 


                                                      
                             
                                 
         
 
                   


   
                                                                  

                                  
                                                     

                    
   
                                                                       
                                                   
 
                                                                              
                                                      
                                                      
                                                   
                                                  
                   
                  
 




                                           
                                                        
                                                  
                                  
                           

                           




                                                    

                                                            



                                         
                                     





                                                     
                                                    



                               


                                                        





                                             
 

                                                                      
                                                      

                                                                
                                                                            

                                                                    
                                                      
                         




                                


                                 






                                                                     
                                                             





                                                                        
                                                              

                                                                             



                                              
                                  
                   
                                                                             

                                 
                                            
                                       



                                         


                                                            

                        
                              
                                  


                                     

         
                                                   
 


                                                                                
 
                   
                                                                              
                                                     
                                                     
                                               
                                            

         
                   
                         
 
                              
      







                                                                  
                                                    

                 


                                   
                                                              
                                                              





                                 
                                                           
                                 
                                        
                                                      

                                        

         

                         





                                                                              
                                                        
                 
                       
         
 


                                                       
 


                                     
                                                  
                
                                                      



         


                                                                          


                                                                             
   
                                                                               

                                                                      
 






                                                        
                                                            









                                                                   

                                              


   


                                                                    
                                   
                                                     
                                 

                                    
   

                                                                     
 
                                  

                           
                        

                   


                                          




                                                    

                                                                           
                                                           
 
                                                                                

                                     
                    
                                                
                                                                              
                         

         
                                                                                    
 
                      

                                            

         

                                                                        

                                     
 


                                                   
                                                                             
                    
                                         
 

                         
 



                                                         
                                        
                 
 
                                                 
 
                              

                                             
                                                      
                                                    

                                                           
                                        


                               
                                         
                                        

                 


                                                             
 
                                  

                                 
 

                                                                 

                                                                        

                                                               
                                                    
 



                                                             
                                      

                                                       



                                                                               

                                            

         


                                                   
                        




                                                                            
                                  
   
                                                                        
 



                        

                                                                           


                       
                                              

                       
                                              
                                             


   



























                                                                           
                                                                         

                                                   


                                                   

                                 



                                                                      
 
                                
                       
                         
                                                
                                           
 

                                                               
 
                                         
 
                                   

                                 
                                                  
 
                                            
 

                                             
 
                             


   












                                                                             
                                       
 



                                               




                                                                    





                                                                         






                                                                 


   







                                                                              
                                                                       














                                                                              
                                       

                                                    
                                                                                   
                                         
                                                                           
                                                                             

                                                              
                                                                               
                                              

                                         
                                                                                    


                                                                  
                                               












                                                                                
                                                                                  





                                         
                                                                   
                                 
                                

                                   

                                                                          
 


                                   
                                
                                 


                       
                                                                   
                                            
                       
         
 

                                                                  
                                                    
 

                                                           
 
                                     
                                                    
 

                                                                
 
                       

         


                                              
                       
         
 

                                                         
                       
 

                                           
                                                    

                                       


   





























                                                                           
                                                                           
                                 

                                           
                                                                      
                                                                    
  
                                                                          
   

                                                                           
 

                                                                         

              
                                                                      
 

                                                      
                                                                     
                                                      
                                                                     
         
 
                  
                         
 

                                   


   
                                                                               
                                 



                                                                            
  
                                                                            
   

                                                                        
 
                                                     
 


                                                                            
                                 

                                                      

                                                                      
 

                                                                       
 



                                                                             










                                                                            
                                                        


                                                             
                                      



















                                                                              
                                                        

              
                                      









                                                                            


   













                                                                              

                                 
            
   
                                       
 
                                          
                      
 
                    
 
                                                  
                                                            
                                 
 
                                          





                 




                                                                 
                                                                  



                                                  
                                 

                              
                                                   



         
                                                                         
                                 
   






                                                                   

   
                                                                             

                                 
                                       
   
                           
 
              







                                                                            
 











                                                       

                                                                      
                           


                                                         
 

                                                                            
 
                    
                                      
 
                    
                                      
 

                                                                         


                                                                           
                                

                                    

                                   
                                              

         



                 



















                                                                          
                               



























                                                                          
                      

                            
                               
 
                                                          
                                                                         
                                                                 
                                                                     
                                                                   

                                 
                                                                 
                                                                     
                                                                   





                                                           
                                                                     

                                         

                                                                          
                                                             

                 
                                                          
                                                                        
                                                                       
                                                                           
                                                                         

                                 
                                                                       
                                                                           
                                                                         
                                 



                                                           
                                                                      

                                         

                                                                                
                                                                          
                                                    






                 
                                                                                
                                 
                      
   
                                                        
 
                             
 
                 
 



                                                                 
                                                      




                                                              
 
                                                     



                                                              
                 
         
 

                                                                     
                                                  

                                                       
                                                          
                 

         
                                

                                     
 
// SPDX-License-Identifier: AGPL-3.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp.c - TCP L2-L4 translation state machine
 *
 * Copyright (c) 2020-2022 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

/**
 * DOC: Theory of Operation
 *
 *
 * PASST mode
 * ==========
 *
 * This implementation maps TCP traffic between a single L2 interface (tap) and
 * native TCP (L4) sockets, mimicking and reproducing as closely as possible the
 * inferred behaviour of applications running on a guest, connected via said L2
 * interface. Four connection flows are supported:
 * - from the local host to the guest behind the tap interface:
 *   - this is the main use case for proxies in service meshes
 *   - we bind to configured local ports, and relay traffic between L4 sockets
 *     with local endpoints and the L2 interface
 * - from remote hosts to the guest behind the tap interface:
 *   - this might be needed for services that need to be addressed directly,
 *     and typically configured with special port forwarding rules (which are
 *     not needed here)
 *   - we also relay traffic between L4 sockets with remote endpoints and the L2
 *     interface
 * - from the guest to the local host:
 *   - this is not observed in practice, but implemented for completeness and
 *     transparency
 * - from the guest to external hosts:
 *   - this might be needed for applications running on the guest that need to
 *     directly access internet services (e.g. NTP)
 *
 * Relevant goals are:
 * - transparency: sockets need to behave as if guest applications were running
 *   directly on the host. This is achieved by:
 *   - avoiding port and address translations whenever possible
 *   - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
 *     socket option) and TCP headers of packets coming from the tap interface,
 *     reapplying those parameters in both flow directions (including TCP_MSS,
 *     TCP_WINDOW_CLAMP socket options)
 * - simplicity: only a small subset of TCP logic is implemented here and
 *   delegated as much as possible to the TCP implementations of guest and host
 *   kernel. This is achieved by:
 *   - avoiding a complete TCP stack reimplementation, with a modified TCP state
 *     machine focused on the translation of observed events instead
 *   - mirroring TCP dynamics as described above and hence avoiding the need for
 *     segmentation, explicit queueing, and reassembly of segments
 * - security:
 *   - no dynamic memory allocation is performed
 *   - TODO: synflood protection
 *
 * Portability is limited by usage of Linux-specific socket options.
 *
 *
 * Limits
 * ------
 *
 * To avoid the need for dynamic memory allocation, a maximum, reasonable amount
 * of connections is defined by TCP_MAX_CONNS (currently 128k).
 *
 * Data needs to linger on sockets as long as it's not acknowledged by the
 * guest, and is read using MSG_PEEK into preallocated static buffers sized
 * to the maximum supported window, 16 MiB ("discard" buffer, for already-sent
 * data) plus a number of maximum-MSS-sized buffers. This imposes a practical
 * limitation on window scaling, that is, the maximum factor is 256. Larger
 * factors will be accepted, but resulting, larger values are never advertised
 * to the other side, and not used while queueing data.
 *
 *
 * Ports
 * -----
 *
 * To avoid the need for ad-hoc configuration of port forwarding or allowed
 * ports, listening sockets can be opened and bound to all unbound ports on the
 * host, as far as process capabilities allow. This service needs to be started
 * after any application proxy that needs to bind to local ports. Mapped ports
 * can also be configured explicitly.
 *
 * No port translation is needed for connections initiated remotely or by the
 * local host: source port from socket is reused while establishing connections
 * to the guest.
 *
 * For connections initiated by the guest, it's not possible to force the same
 * source port as connections are established by the host kernel: that's the
 * only port translation needed.
 *
 *
 * Connection tracking and storage
 * -------------------------------
 *
 * Connections are tracked by struct tcp_tap_conn entries in the @tc
 * array, containing addresses, ports, TCP states and parameters. This
 * is statically allocated and indexed by an arbitrary connection
 * number. The array is compacted whenever a connection is closed, by
 * remapping the highest connection index in use to the one freed up.
 *
 * References used for the epoll interface report the connection index used for
 * the @tc array.
 *
 * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
 * separate data structures depending on the protocol version.
 *
 * - Inbound connection requests (to the guest) are mapped using the triple
 *   < source IP address, source port, destination port >
 * - Outbound connection requests (from the guest) are mapped using the triple
 *   < destination IP address, destination port, source port >
 *   where the source port is the one used by the guest, not the one used by the
 *   corresponding host socket
 *
 *
 * Initialisation
 * --------------
 *
 * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
 * IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
 * to bind (for low ports, or ports already bound, e.g. by a proxy). These are
 * added to the epoll list, with no separate storage.
 *
 *
 * Events and states
 * -----------------
 *
 * Instead of tracking connection states using a state machine, connection
 * events are used to determine state and actions for a given connection. This
 * makes the implementation simpler as most of the relevant tasks deal with
 * reactions to events, rather than state-associated actions. For user
 * convenience, approximate states are mapped in logs from events by
 * @tcp_state_str.
 *
 * The events are:
 *
 * - SOCK_ACCEPTED	connection accepted from socket, SYN sent to tap/guest
 *
 * - TAP_SYN_RCVD	tap/guest initiated connection, SYN received
 *
 * - TAP_SYN_ACK_SENT	SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only
 *
 * - ESTABLISHED	connection established, the following events are valid:
 *
 * - SOCK_FIN_RCVD	FIN (EPOLLRDHUP) received from socket
 *
 * - SOCK_FIN_SENT	FIN (write shutdown) sent to socket
 *
 * - TAP_FIN_RCVD	FIN received from tap/guest
 *
 * - TAP_FIN_SENT	FIN sent to tap/guest
 *
 * - TAP_FIN_ACKED	ACK to FIN seen from tap/guest
 *
 * Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD,
 * ESTABLISHED) clears all the other events, as those represent the fundamental
 * connection states. No events (events == CLOSED) means the connection is
 * closed.
 *
 * Connection setup
 * ----------------
 *
 * - inbound connection (from socket to guest): on accept() from listening
 *   socket, the new socket is mapped in connection tracking table, and
 *   three-way handshake initiated towards the guest, advertising MSS and window
 *   size and scaling from socket parameters
 * - outbound connection (from guest to socket): on SYN segment from guest, a
 *   new socket is created and mapped in connection tracking table, setting
 *   MSS and window clamping from header and option of the observed SYN segment
 *
 *
 * Aging and timeout
 * -----------------
 *
 * Timeouts are implemented by means of timerfd timers, set based on flags:
 *
 * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
 *   ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
 *   connection
 *
 * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
 *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
 *   socket and reset sequence to what was acknowledged. If this persists for
 *   more than TCP_MAX_RETRANS times in a row, reset the connection
 *
 * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
 *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
 *   the connection
 *
 * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
 *   segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
 *   TAP_FIN_ACKED), but no activity is detected on the socket within this
 *   time, reset the connection
 *
 * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
 *   either side, the connection is reset
 *
 * - ACK_INTERVAL elapsed after data segment received from tap without having
 *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
 *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
 *
 *
 * Summary of data flows (with ESTABLISHED event)
 * ----------------------------------------------
 *
 * @seq_to_tap:		next sequence for packets to tap/guest
 * @seq_ack_from_tap:	last ACK number received from tap/guest
 * @seq_from_tap:	next sequence for packets from tap/guest (expected)
 * @seq_ack_to_tap:	last ACK number sent to tap/guest
 *
 * @seq_init_from_tap:	initial sequence number from tap/guest
 * @seq_init_to_tap:	initial sequence number to tap/guest
 *
 * @wnd_from_tap:	last window size received from tap, never scaled
 * @wnd_to_tap:		last window size advertised to tap, never scaled
 *
 * - from socket to tap/guest:
 *   - on new data from socket:
 *     - peek into buffer
 *     - send data to tap/guest:
 *       - starting at offset (@seq_to_tap - @seq_ack_from_tap)
 *       - in MSS-sized segments
 *       - increasing @seq_to_tap at each segment
 *       - up to window (until @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap)
 *     - on read error, send RST to tap/guest, close socket
 *     - on zero read, send FIN to tap/guest, set TAP_FIN_SENT
 *   - on ACK from tap/guest:
 *     - set @ts_ack_from_tap
 *     - check if it's the second duplicated ACK
 *     - consume buffer by difference between new ack_seq and @seq_ack_from_tap
 *     - update @seq_ack_from_tap from ack_seq in header
 *     - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
 *       resend with steps listed above
 *     - set TCP_WINDOW_CLAMP from TCP header from tap
 *
 * - from tap/guest to socket:
 *   - on packet from tap/guest:
 *     - set @ts_tap_act
 *     - set TCP_WINDOW_CLAMP from TCP header from tap
 *     - check seq from header against @seq_from_tap, if data is missing, send
 *       two ACKs with number @seq_ack_to_tap, discard packet
 *     - otherwise queue data to socket, set @seq_from_tap to seq from header
 *       plus payload length
 *     - in ESTABLISHED state, send ACK to tap as soon as we queue to the
 *       socket. In other states, query socket for TCP_INFO, set
 *       @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
 *       send ACK to tap/guest
 *
 *
 * PASTA mode
 * ==========
 *
 * For traffic directed to TCP ports configured for mapping to the tuntap device
 * in the namespace, and for non-local traffic coming from the tuntap device,
 * the implementation is identical as the PASST mode described in the previous
 * section.
 *
 * For local traffic directed to TCP ports configured for direct mapping between
 * namespaces, see the implementation in tcp_splice.c.
 */

#include <sched.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
#ifdef HAS_GETRANDOM
#include <sys/random.h>
#endif
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <time.h>

#include <linux/tcp.h> /* For struct tcp_info */

#include "checksum.h"
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "pcap.h"
#include "conf.h"
#include "tcp_splice.h"
#include "log.h"
#include "inany.h"

#include "tcp_conn.h"

/* Number of statically allocated frame buffers per buffer array below */
#define TCP_FRAMES_MEM			128
/* Frames used per batch: all of them in passt mode, a single one otherwise.
 * Expands to an expression using a variable named 'c' from the caller's scope.
 */
#define TCP_FRAMES							\
	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)

#define TCP_FILE_PRESSURE		30	/* % of c->nofile */
#define TCP_CONN_PRESSURE		30	/* % of c->tcp.conn_count */

/* Hash table is sized so that TCP_MAX_CONNS entries fill it to this load */
#define TCP_HASH_TABLE_LOAD		70		/* % */
#define TCP_HASH_TABLE_SIZE		(TCP_MAX_CONNS * 100 /		\
					 TCP_HASH_TABLE_LOAD)

/* Maximum window scaling shift we handle, and resulting window: 2^24 bytes */
#define MAX_WS				8
#define MAX_WINDOW			(1 << (16 + (MAX_WS)))

/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT			536

/* Header-only mirror of the IPv4 data buffer layout: everything that precedes
 * the payload. Only its size is used here, by the MSS4 macro.
 */
struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
	uint32_t psum;
	uint32_t tsum;
#ifdef __AVX2__
	uint8_t pad[18];
#else
	uint8_t pad[2];
#endif
	struct tap_hdr taph;
	struct iphdr iph;
	struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/* Header-only mirror of the IPv6 data buffer layout: everything that precedes
 * the payload. Only its size is used here, by the MSS6 macro.
 */
struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
	uint8_t pad[14];
#else
	uint8_t pad[2];
#endif
	struct tap_hdr taph;
	struct ipv6hdr ip6h;
	struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/* Payload room left in a buffer of at most USHRT_MAX bytes once the headers
 * (struct tcp{4,6}_l2_head) are accounted for, rounded down to a multiple of 4
 */
#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)

#define WINDOW_DEFAULT			14600		/* RFC 6928 */

/* KERNEL_REPORTS_SND_WND() - Does the kernel report the sending window?
 * @c:		Execution context pointer (any pointer expression)
 *
 * Always evaluates to 0 if built without HAS_SND_WND. The argument is
 * parenthesised so that expressions such as &ctx can be passed safely.
 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
#endif

/* Timer values, applied by tcp_timer_ctl(): ACK_INTERVAL is loaded into the
 * nanoseconds field (scaled from milliseconds), all the others into the
 * seconds field of the timer specification.
 */
#define ACK_INTERVAL			10		/* ms */
#define SYN_TIMEOUT			10		/* s */
#define ACK_TIMEOUT			2		/* s */
#define FIN_TIMEOUT			60		/* s */
#define ACT_TIMEOUT			7200		/* s */

/* Size of the low_rtt_dst table, and minimum RTT at or below which a
 * destination is entered into it
 */
#define LOW_RTT_TABLE_SIZE		8
#define LOW_RTT_THRESHOLD		10 /* us */

/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
 * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
 */
#define SOL_TCP				IPPROTO_TCP

/* Sequence number comparisons that tolerate wrap-around: one value counts as
 * following the other if their difference, computed modulo the width of the
 * operands, is less than MAX_WINDOW. The strict variants subtract one so that
 * equal values (difference 0) wrap to a huge number and compare false.
 * NOTE(review): this relies on operands being unsigned 32-bit values, so that
 * the subtraction wraps instead of going negative -- confirm at call sites.
 */
#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)

/* Segment flags: bit values 0x01, 0x02, 0x04 and 0x10 respectively */
#define FIN		(1 << 0)
#define SYN		(1 << 1)
#define RST		(1 << 2)
#define ACK		(1 << 4)
/* Flags for internal usage */
#define DUP_ACK		(1 << 5)
#define ACK_IF_NEEDED	0		/* See tcp_send_flag() */

/* TCP option kinds, and total lengths in bytes for the options we build
 * (the flags buffers below reserve OPT_MSS_LEN + OPT_WS_LEN + 1 bytes)
 */
#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPT_MSS_LEN	4
#define OPT_WS		3
#define OPT_WS_LEN	3
#define OPT_SACKP	4
#define OPT_SACK	5
#define OPT_TS		8

/* Connection helpers. All take a pointer to a connection; the argument is
 * parenthesised everywhere so that any pointer expression (e.g. &conn_var)
 * expands correctly.
 *
 * CONN_V4(), CONN_V6():	address family of the connection's @addr
 * CONN_IS_CLOSING():		ESTABLISHED, and a FIN was seen from either side
 * CONN_HAS():			true only if *all* the events in @set are present
 */
#define CONN_V4(conn)		(!!inany_v4(&(conn)->addr))
#define CONN_V6(conn)		(!CONN_V4(conn))
#define CONN_IS_CLOSING(conn)						\
	(((conn)->events & ESTABLISHED) &&				\
	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set)	(((conn)->events & (set)) == (set))

/* Event names for logging, looked up in conn_event_do() by the bit index of
 * the event. Events other than the first three are looked up at index + 1
 * once the connection is ESTABLISHED, which is why "TAP_SYN_ACK_SENT" and
 * "SOCK_FIN_RCVD" occupy consecutive slots.
 */
static const char *tcp_event_str[] __attribute((__unused__)) = {
	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",

	"SOCK_FIN_RCVD", "SOCK_FIN_SENT", "TAP_FIN_RCVD", "TAP_FIN_SENT",
	"TAP_FIN_ACKED",
};

/* Approximate TCP state names for logging, looked up in conn_event_do() by
 * the index of the highest event bit set, adjusted as for tcp_event_str, plus
 * 5 if the ACTIVE_CLOSE flag is set on the connection.
 */
static const char *tcp_state_str[] __attribute((__unused__)) = {
	"SYN_RCVD", "SYN_SENT", "ESTABLISHED",
	"SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */

	/* Passive close: */
	"CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
	/* Active close (+5): */
	"CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
};

/* Connection flag names for logging, looked up in conn_flag_do() by the bit
 * index of the flag: order must match the flag bit definitions.
 */
static const char *tcp_flag_str[] __attribute((__unused__)) = {
	"STALLED", "LOCAL", "WND_CLAMPED", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
	"ACK_FROM_TAP_DUE",
};

/* Listening sockets, used for automatic port forwarding in pasta mode only.
 * One slot for each port and IP version.
 */
static int tcp_sock_init_ext	[NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];

/* Table of destinations with very low RTT (assumed to be local), LRU.
 * Filled by tcp_rtt_dst_check(), searched by tcp_rtt_dst_low(). The slot
 * following the most recent insertion is reset to the unspecified address, so
 * that it's picked as the next slot to be overwritten.
 */
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];

/* Static buffers */

/**
 * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
 * @psum:	Partial IP header checksum (excluding tot_len and saddr)
 * @tsum:	Partial TCP header checksum (excluding length and saddr)
 * @pad:	Align TCP header to 32 bytes (AVX2 builds, for checksum
 *		calculation), or IP header to 4 bytes (other builds)
 * @taph:	Tap-level headers (partially pre-filled)
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload, MSS4 bytes
 *
 * Field comments give byte offsets: left column for AVX2 builds, right column
 * for other builds. Keep the header part in sync with struct tcp4_l2_head.
 */
static struct tcp4_l2_buf_t {
	uint32_t psum;		/* 0 */
	uint32_t tsum;		/* 4 */
#ifdef __AVX2__
	uint8_t pad[18];	/* 8, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	8 */
#endif
	struct tap_hdr taph;	/* 26				10 */
	struct iphdr iph;	/* 44				28 */
	struct tcphdr th;	/* 64				48 */
	uint8_t data[MSS4];	/* 84				68 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp4_l2_buf entries currently queued;
 * its users are outside this part of the file -- confirm.
 */
static unsigned int tcp4_l2_buf_used;

/**
 * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload, MSS6 bytes
 *
 * Field comments give byte offsets: left column for AVX2 builds, right column
 * for other builds. Keep the header part in sync with struct tcp6_l2_head.
 *
 * NOTE(review): unlike tcp4_l2_buf and both flags buffer arrays, tcp6_l2_buf
 * is not declared static -- confirm whether external linkage is intended.
 */
struct tcp6_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 14				2 */
	struct ipv6hdr ip6h;	/* 32				20 */
	struct tcphdr th;	/* 72				60 */
	uint8_t data[MSS6];	/* 92				80 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp6_l2_buf entries currently queued;
 * its users are outside this part of the file -- confirm.
 */
static unsigned int tcp6_l2_buf_used;

/* recvmsg()/sendmsg() data for tap.
 * tcp_buf_discard spans the maximum supported window (MAX_WINDOW bytes): see
 * the "discard" buffer in the "Limits" section of the theory of operation.
 * iov_sock has one more entry than there are frame buffers.
 */
static char 		tcp_buf_discard		[MAX_WINDOW];
static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

/* One I/O vector per frame buffer, for data and for flags (no data) frames */
static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];

/* One message header per frame buffer */
static struct mmsghdr	tcp_l2_mh		[TCP_FRAMES_MEM];

/* sendmsg() to socket */
static struct iovec	tcp_iov			[UIO_MAXIOV];

/**
 * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
 * @psum:	Partial IP header checksum (excluding tot_len and saddr)
 * @tsum:	Partial TCP header checksum (excluding length and saddr)
 * @pad:	Align TCP header to 32 bytes (AVX2 builds, for checksum
 *		calculation), or IP header to 4 bytes (other builds)
 * @taph:	Tap-level headers (partially pre-filled)
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @opts:	Headroom for TCP options: room for one MSS option, one window
 *		scaling option, and one more byte
 *
 * Field comments give byte offsets: left column for AVX2 builds, right column
 * for other builds.
 */
static struct tcp4_l2_flags_buf_t {
	uint32_t psum;		/* 0 */
	uint32_t tsum;		/* 4 */
#ifdef __AVX2__
	uint8_t pad[18];	/* 8, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	8 */
#endif
	struct tap_hdr taph;	/* 26				10 */
	struct iphdr iph;	/* 44				28 */
	struct tcphdr th;	/* 64				48 */
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp4_l2_flags_buf entries currently
 * queued; its users are outside this part of the file -- confirm.
 */
static unsigned int tcp4_l2_flags_buf_used;

/**
 * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header, explicitly aligned to 4 bytes
 * @opts:	Headroom for TCP options: room for one MSS option, one window
 *		scaling option, and one more byte
 *
 * Field comments give byte offsets: left column for AVX2 builds, right column
 * for other builds.
 */
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
#endif
	struct tap_hdr taph;	/* 14					   2 */
	struct ipv6hdr ip6h;	/* 32					  20 */
	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp6_l2_flags_buf entries currently
 * queued; its users are outside this part of the file -- confirm.
 */
static unsigned int tcp6_l2_flags_buf_used;

/* TCP connections: statically allocated table, with external linkage */
union tcp_conn tc[TCP_MAX_CONNS];

/* CONN(): pointer to the 'tap' member of the table entry at @index, without
 *	   bounds checking (see conn_at_idx() for a checked lookup)
 * CONN_IDX(): table index of a connection pointer, computed as pointer
 *	       difference from the start of the table (hence a signed,
 *	       pointer-sized result)
 */
#define CONN(index)		(&tc[(index)].tap)
#define CONN_IDX(conn)		((union tcp_conn *)(conn) - tc)

/** conn_at_idx() - Find a connection by index, if present
 * @index:	Index of connection to lookup
 *
 * Return: pointer to connection, or NULL if @index is out of bounds
 */
static inline struct tcp_tap_conn *conn_at_idx(int index)
{
	if ((index < 0) || (index >= TCP_MAX_CONNS))
		return NULL;
	ASSERT(!(CONN(index)->c.spliced));
	return CONN(index);
}

/* Table for lookup from remote address, local port, remote port.
 * Sized by TCP_HASH_TABLE_SIZE, that is, larger than the connection table by
 * the inverse of TCP_HASH_TABLE_LOAD.
 */
static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];

/* Pools for pre-opened sockets (in init), one each for IPv4 and IPv6. These
 * have external linkage.
 */
int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
int init_sock_pool6		[TCP_SOCK_POOL_SIZE];

/**
 * tcp_conn_epoll_events() - epoll events mask for given connection state
 * @events:	Current connection events
 * @conn_flags:	Connection flags
 *
 * No events at all means nothing is watched. Before ESTABLISHED, hang-ups
 * are always watched, plus edge-triggered writability if only TAP_SYN_RCVD is
 * set. Once ESTABLISHED, input and hang-ups are watched, with edge-triggered
 * writability added while STALLED; after TAP_FIN_SENT, only EPOLLET is left.
 *
 * Return: epoll events mask corresponding to implied connection state
 */
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{
	uint32_t mask = EPOLLRDHUP;

	if (events == 0)
		return 0;

	if (!(events & ESTABLISHED)) {
		if (events == TAP_SYN_RCVD)
			mask |= EPOLLOUT | EPOLLET;

		return mask;
	}

	if (events & TAP_FIN_SENT)
		return EPOLLET;

	mask |= EPOLLIN;
	if (conn_flags & STALLED)
		mask |= EPOLLOUT | EPOLLET;

	return mask;
}

/* Forward declaration: conn_flag_do() is defined after the epoll and timer
 * helpers it calls.
 */
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
			 unsigned long flag);
/* conn_flag() - Set or unset a connection flag, tracing the call site first
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 */
#define conn_flag(c, conn, flag)					\
	do {								\
		trace("TCP: flag at %s:%i", __func__, __LINE__);	\
		conn_flag_do(c, conn, flag);				\
	} while (0)

/**
 * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: 0 on success, negative error code on failure (not on deletion)
 */
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
	union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
				.r.p.tcp.tcp.index = CONN_IDX(conn) };
	struct epoll_event ev = { .data.u64 = ref.u64 };

	if (conn->events == CLOSED) {
		if (conn->c.in_epoll)
			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
		if (conn->timer != -1)
			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
		return 0;
	}

	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);

	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
		return -errno;

	conn->c.in_epoll = true;

	if (conn->timer != -1) {
		union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
					  .r.s = conn->sock,
					  .r.p.tcp.tcp.timer = 1,
					  .r.p.tcp.tcp.index = CONN_IDX(conn) };
		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
					    .events = EPOLLIN | EPOLLET };

		if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
			return -errno;
	}

	return 0;
}

/**
 * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * #syscalls timerfd_create timerfd_settime
 */
static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
	struct itimerspec it = { { 0 }, { 0 } };

	if (conn->events == CLOSED)
		return;

	if (conn->timer == -1) {
		union epoll_ref ref = { .r.proto = IPPROTO_TCP,
					.r.s = conn->sock,
					.r.p.tcp.tcp.timer = 1,
					.r.p.tcp.tcp.index = CONN_IDX(conn) };
		struct epoll_event ev = { .data.u64 = ref.u64,
					  .events = EPOLLIN | EPOLLET };
		int fd;

		fd = timerfd_create(CLOCK_MONOTONIC, 0);
		if (fd == -1 || fd > SOCKET_MAX) {
			debug("TCP: failed to get timer: %s", strerror(errno));
			if (fd > -1)
				close(fd);
			conn->timer = -1;
			return;
		}
		conn->timer = fd;

		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
			debug("TCP: failed to add timer: %s", strerror(errno));
			close(conn->timer);
			conn->timer = -1;
			return;
		}
	}

	if (conn->flags & ACK_TO_TAP_DUE) {
		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
	} else if (conn->flags & ACK_FROM_TAP_DUE) {
		if (!(conn->events & ESTABLISHED))
			it.it_value.tv_sec = SYN_TIMEOUT;
		else
			it.it_value.tv_sec = ACK_TIMEOUT;
	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
		it.it_value.tv_sec = FIN_TIMEOUT;
	} else {
		it.it_value.tv_sec = ACT_TIMEOUT;
	}

	debug("TCP: index %li, timer expires in %lu.%03lus", CONN_IDX(conn),
	      it.it_value.tv_sec, it.it_value.tv_nsec / 1000 / 1000);

	timerfd_settime(conn->timer, 0, &it, NULL);
}

/**
 * conn_flag_do() - Set/unset given flag, log, update epoll on STALLED flag
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 *
 * A value with more than one bit set is taken as the complement of the flag
 * to drop. Nothing is logged or updated if the flag is already in the
 * requested state, except for ACK_FROM_TAP_DUE, see below.
 */
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
			 unsigned long flag)
{
	bool clearing = (flag & (flag - 1)) != 0;
	bool rearm_timer;
	int idx;

	if (clearing) {
		unsigned long dropped = ~flag;

		if ((conn->flags & dropped) == 0)
			return;

		idx = fls(dropped);
		conn->flags &= flag;
		if (idx >= 0) {
			debug("TCP: index %li: %s dropped", CONN_IDX(conn),
			      tcp_flag_str[idx]);
		}
	} else {
		if (conn->flags & flag) {
			/* Special case: setting ACK_FROM_TAP_DUE on a
			 * connection where it's already set is used to
			 * re-schedule the existing timer.
			 * TODO: define clearer semantics for timer-related
			 * flags and factor this into the logic below.
			 */
			if (flag == ACK_FROM_TAP_DUE)
				tcp_timer_ctl(c, conn);

			return;
		}

		idx = fls(flag);
		conn->flags |= flag;
		if (idx >= 0) {
			debug("TCP: index %li: %s", CONN_IDX(conn),
			      tcp_flag_str[idx]);
		}
	}

	if (flag == STALLED || flag == ~STALLED)
		tcp_epoll_ctl(c, conn);

	/* The timer follows the ACK-related flags: re-arm it whenever one of
	 * them is set, or when one is dropped while the other is pending.
	 */
	if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE)
		rearm_timer = true;
	else if (flag == ~ACK_FROM_TAP_DUE)
		rearm_timer = conn->flags & ACK_TO_TAP_DUE;
	else if (flag == ~ACK_TO_TAP_DUE)
		rearm_timer = conn->flags & ACK_FROM_TAP_DUE;
	else
		rearm_timer = false;

	if (rearm_timer)
		tcp_timer_ctl(c, conn);
}

/**
 * conn_event_do() - Set and log connection events, update epoll state
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event
 *
 * Nothing is done if @event is already set. CLOSED and events in
 * CONN_STATE_BITS replace all the current events, any other event is added
 * to the current set.
 *
 * @num, @prev and @new are indices into tcp_event_str and tcp_state_str: a
 * value of -1 is logged as "CLOSED".
 */
static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
			  unsigned long event)
{
	int prev, new, num = fls(event);

	if (conn->events & event)
		return;

	/* State name index before the change: highest event bit set... */
	prev = fls(conn->events);
	/* ...shifted to the active close names if we're closing first... */
	if (conn->flags & ACTIVE_CLOSE)
		prev += 5;

	/* ...and skipping one table slot for events following ESTABLISHED */
	if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED))
		prev++;		/* i.e. SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */

	if (event == CLOSED || (event & CONN_STATE_BITS))
		conn->events = event;
	else
		conn->events |= event;

	/* Same index calculation as above, after the change. The event name
	 * index needs the same one-slot adjustment.
	 */
	new = fls(conn->events);

	if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) {
		num++;
		new++;
	}
	if (conn->flags & ACTIVE_CLOSE)
		new += 5;

	/* Log the state transition only if the approximate state changed */
	if (prev != new) {
		debug("TCP: index %li, %s: %s -> %s", CONN_IDX(conn),
		      num == -1 	       ? "CLOSED" : tcp_event_str[num],
		      prev == -1	       ? "CLOSED" : tcp_state_str[prev],
		      (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]);
	} else {
		debug("TCP: index %li, %s", CONN_IDX(conn),
		      num == -1 	       ? "CLOSED" : tcp_event_str[num]);
	}

	/* FIN from tap before any FIN from the socket: mark the connection as
	 * actively closed. In all other cases, refresh the epoll state.
	 */
	if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
		conn_flag(c, conn, ACTIVE_CLOSE);
	else
		tcp_epoll_ctl(c, conn);

	/* Both events present: tcp_timer_ctl() might pick FIN_TIMEOUT now */
	if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
		tcp_timer_ctl(c, conn);
}

/* conn_event() - Set a connection event, tracing the call site first
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event
 */
#define conn_event(c, conn, event)					\
	do {								\
		trace("TCP: event at %s:%i", __func__, __LINE__);	\
		conn_event_do(c, conn, event);				\
	} while (0)

/**
 * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
 * @conn:	Connection pointer
 *
 * Linear search of the connection address in the low RTT table.
 *
 * Return: 1 if destination is in low RTT table, 0 otherwise
 */
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
{
	int n = 0;

	while (n < LOW_RTT_TABLE_SIZE) {
		if (inany_equals(&conn->addr, &low_rtt_dst[n]))
			return 1;

		n++;
	}

	return 0;
}

/**
 * tcp_rtt_dst_check() - Record endpoint in low RTT table if tcpi_min_rtt is low
 * @conn:	Connection pointer
 * @tinfo:	Pointer to struct tcp_info for socket
 *
 * Nothing is done without HAS_MIN_RTT, if tcpi_min_rtt is zero or above
 * LOW_RTT_THRESHOLD, or if the endpoint is already in the table.
 */
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
			      const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
	int slot = -1, next, n;

	if (!tinfo->tcpi_min_rtt)
		return;

	if ((int)tinfo->tcpi_min_rtt > LOW_RTT_THRESHOLD)
		return;

	/* Look for the address, remember the first unspecified entry */
	for (n = 0; n < LOW_RTT_TABLE_SIZE; n++) {
		if (inany_equals(&conn->addr, low_rtt_dst + n))
			return;

		if (slot == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + n))
			slot = n;
	}

	/* Keep gcc 12 happy: this isn't expected to happen, as every insertion
	 * clears the entry following the one it fills, see below.
	 */
	if (slot == -1)
		return;

	low_rtt_dst[slot] = conn->addr;

	/* Clear the next entry, wrapping around at the end of the table, so
	 * that a free slot is left for the next insertion
	 */
	next = (slot + 1 == LOW_RTT_TABLE_SIZE) ? 0 : slot + 1;
	inany_from_af(low_rtt_dst + next, AF_INET6, &in6addr_any);
#else
	(void)conn;
	(void)tinfo;
#endif /* HAS_MIN_RTT */
}

/**
 * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage)
 * @conn:	Connection pointer
 *
 * The value is stored with SNDBUF_SET(): WINDOW_DEFAULT if getsockopt() fails,
 * otherwise the reported size, unscaled up to SNDBUF_SMALL, halved from
 * SNDBUF_BIG, and reduced proportionally in between.
 */
static void tcp_get_sndbuf(struct tcp_tap_conn *conn)
{
	uint64_t scaled;
	socklen_t optlen;
	int raw;

	optlen = sizeof(raw);
	if (getsockopt(conn->sock, SOL_SOCKET, SO_SNDBUF, &raw, &optlen)) {
		SNDBUF_SET(conn, WINDOW_DEFAULT);
		return;
	}

	scaled = raw;
	if (scaled >= SNDBUF_BIG) {
		scaled /= 2;
	} else if (scaled > SNDBUF_SMALL) {
		uint64_t excess = scaled - SNDBUF_SMALL;

		/* Linear reduction: none at SNDBUF_SMALL, half at SNDBUF_BIG */
		scaled -= scaled * excess / (SNDBUF_BIG - SNDBUF_SMALL) / 2;
	}

	SNDBUF_SET(conn, MIN(INT_MAX, scaled));
}

/**
 * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values
 * @c:		Execution context, low_rmem and low_wmem disable the
 *		respective setting
 * @s:		Socket, can be -1 to avoid check in the caller
 */
void tcp_sock_set_bufsize(const struct ctx *c, int s)
{
	int size = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */

	if (s == -1)
		return;

	if (!c->low_rmem) {
		if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)))
			trace("TCP: failed to set SO_RCVBUF to %i", size);
	}

	if (!c->low_wmem) {
		if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)))
			trace("TCP: failed to set SO_SNDBUF to %i", size);
	}
}

/**
 * tcp_update_check_ip4() - Complete IPv4 header checksum from stored partial sum
 * @buf:	L2 packet buffer with final IPv4 header
 *
 * The stored sum (@buf->psum) is completed with the total length and the two
 * 16-bit halves of the source address, then folded and complemented.
 */
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
{
	const uint32_t src = buf->iph.saddr;
	uint32_t acc = buf->psum;

	acc += buf->iph.tot_len;
	acc += ((src >> 16) & 0xffff) + (src & 0xffff);

	buf->iph.check = (uint16_t)~csum_fold(acc);
}

/**
 * tcp_update_check_tcp4() - Complete TCP checksum from stored partial sum
 * @buf:	L2 packet buffer with final IPv4 header
 *
 * The stored sum (@buf->tsum) is completed with the source address and the TCP
 * length, then used as initial value for the checksum over header and payload.
 */
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
{
	/* TCP length: total IPv4 length minus 20 bytes of IPv4 header */
	uint16_t tlen = ntohs(buf->iph.tot_len) - 20;
	const uint32_t src = buf->iph.saddr;
	uint32_t acc = buf->tsum;

	acc += ((src >> 16) & 0xffff) + (src & 0xffff);
	acc += htons(tlen);

	/* Checksum field needs to be zero while it's being computed */
	buf->th.check = 0;
	buf->th.check = csum(&buf->th, tlen, acc);
}

/**
 * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6
 * @buf:	L2 packet buffer with final IPv6 header
 *
 * The checksum is computed in place over the IPv6 header and what follows,
 * with some header fields temporarily rewritten. On return, version, next
 * header and hop limit are set to 6, IPPROTO_TCP and 255 respectively.
 */
static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
{
	int span = sizeof(struct ipv6hdr) + ntohs(buf->ip6h.payload_len);

	buf->th.check = 0;

	/* NOTE(review): presumably this makes the header sum up like the IPv6
	 * pseudo-header (protocol number in the last byte), assuming the
	 * remaining bits of the first 32-bit word are zero -- confirm
	 */
	buf->ip6h.version = 0;
	buf->ip6h.nexthdr = 0;
	buf->ip6h.hop_limit = IPPROTO_TCP;

	buf->th.check = csum(&buf->ip6h, span, 0);

	/* Set fixed header values, previous contents are not saved */
	buf->ip6h.version = 6;
	buf->ip6h.nexthdr = IPPROTO_TCP;
	buf->ip6h.hop_limit = 255;
}

/**
 * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
 * @eth_d:	Ethernet destination address, NULL if unchanged
 * @eth_s:	Ethernet source address, NULL if unchanged
 * @ip_da:	Pointer to IPv4 destination address, NULL if unchanged
 *
 * If @ip_da is given, the stored partial checksums (psum, tsum) of all IPv4
 * buffers are refreshed as well: they are computed on the first data buffer
 * and copied to the others.
 */
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
		       const struct in_addr *ip_da)
{
	int n;

	for (n = 0; n < TCP_FRAMES_MEM; n++) {
		struct tcp4_l2_flags_buf_t *flags4 = &tcp4_l2_flags_buf[n];
		struct tcp6_l2_flags_buf_t *flags6 = &tcp6_l2_flags_buf[n];
		struct tcp4_l2_buf_t *data4 = &tcp4_l2_buf[n];
		struct tcp6_l2_buf_t *data6 = &tcp6_l2_buf[n];

		tap_update_mac(&data4->taph, eth_d, eth_s);
		tap_update_mac(&data6->taph, eth_d, eth_s);
		tap_update_mac(&flags4->taph, eth_d, eth_s);
		tap_update_mac(&flags6->taph, eth_d, eth_s);

		if (!ip_da)
			continue;

		data4->iph.daddr = ip_da->s_addr;
		flags4->iph.daddr = data4->iph.daddr;

		if (n) {
			/* Reuse sums computed on the first buffer */
			data4->psum = tcp4_l2_buf[0].psum;
			flags4->psum = data4->psum;

			data4->tsum = tcp4_l2_buf[0].tsum;
			flags4->tsum = data4->tsum;
			continue;
		}

		/* First buffer: zero the fields that are added later, then sum
		 * the 20 bytes of IPv4 header
		 */
		data4->iph.saddr = 0;
		flags4->iph.saddr = data4->iph.saddr;

		data4->iph.tot_len = 0;
		flags4->iph.tot_len = data4->iph.tot_len;

		data4->iph.check = 0;
		flags4->iph.check = data4->iph.check;

		data4->psum = sum_16b(&data4->iph, 20);
		flags4->psum = data4->psum;

		/* Partial sum for TCP checksum: destination address, protocol */
		data4->tsum = ((ip_da->s_addr >> 16) & 0xffff) +
			      (ip_da->s_addr & 0xffff) +
			      htons(IPPROTO_TCP);
		flags4->tsum = data4->tsum;
	}
}

/**
 * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
 * @c:		Execution context
 *
 * Data and flags buffers get their tap and IPv4 header templates, then the
 * base of each iovec entry is set from the matching buffer via tap_iov_base().
 */
static void tcp_sock4_iov_init(const struct ctx *c)
{
	int n;

	/* Data buffers: TCP header template with ACK set, no options */
	for (n = 0; n < ARRAY_SIZE(tcp4_l2_buf); n++) {
		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[n];

		*b = (struct tcp4_l2_buf_t) {
			.taph = TAP_HDR_INIT(ETH_P_IP),
			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP),
			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
		};
	}

	/* Flags buffers: no TCP header template */
	for (n = 0; n < ARRAY_SIZE(tcp4_l2_flags_buf); n++) {
		struct tcp4_l2_flags_buf_t *fb = &tcp4_l2_flags_buf[n];

		*fb = (struct tcp4_l2_flags_buf_t) {
			.taph = TAP_HDR_INIT(ETH_P_IP),
			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
		};
	}

	for (n = 0; n < TCP_FRAMES_MEM; n++) {
		tcp4_l2_iov[n].iov_base =
			tap_iov_base(c, &tcp4_l2_buf[n].taph);
	}

	for (n = 0; n < TCP_FRAMES_MEM; n++) {
		tcp4_l2_flags_iov[n].iov_base =
			tap_iov_base(c, &tcp4_l2_flags_buf[n].taph);
	}
}

/**
 * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
 * @c:		Execution context
 */
static void tcp_sock6_iov_init(const struct ctx *c)
{
	struct iovec *iov;
	int i;

	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
			.taph = TAP_HDR_INIT(ETH_P_IPV6),
			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
			.th = { .doff